Import libgav1_0.19.0.orig.tar.xz
author Boyuan Yang <byang@debian.org>
Tue, 28 Nov 2023 04:17:46 +0000 (23:17 -0500)
committer Boyuan Yang <byang@debian.org>
Tue, 28 Nov 2023 04:17:46 +0000 (23:17 -0500)
[dgit import orig libgav1_0.19.0.orig.tar.xz]

387 files changed:
.cmake-format.py [new file with mode: 0644]
.gitattributes [new file with mode: 0644]
.gitignore [new file with mode: 0644]
AUTHORS [new file with mode: 0644]
CMakeLists.txt [new file with mode: 0644]
CONTRIBUTING.md [new file with mode: 0644]
LICENSE [new file with mode: 0644]
README.md [new file with mode: 0644]
cmake/libgav1-config.cmake.template [new file with mode: 0644]
cmake/libgav1.pc.template [new file with mode: 0644]
cmake/libgav1_build_definitions.cmake [new file with mode: 0644]
cmake/libgav1_cpu_detection.cmake [new file with mode: 0644]
cmake/libgav1_flags.cmake [new file with mode: 0644]
cmake/libgav1_helpers.cmake [new file with mode: 0644]
cmake/libgav1_install.cmake [new file with mode: 0644]
cmake/libgav1_intrinsics.cmake [new file with mode: 0644]
cmake/libgav1_options.cmake [new file with mode: 0644]
cmake/libgav1_sanitizer.cmake [new file with mode: 0644]
cmake/libgav1_targets.cmake [new file with mode: 0644]
cmake/libgav1_variables.cmake [new file with mode: 0644]
cmake/toolchains/aarch64-linux-gnu.cmake [new file with mode: 0644]
cmake/toolchains/android.cmake [new file with mode: 0644]
cmake/toolchains/arm-linux-gnueabihf.cmake [new file with mode: 0644]
codereview.settings [new file with mode: 0644]
examples/file_reader.cc [new file with mode: 0644]
examples/file_reader.h [new file with mode: 0644]
examples/file_reader_constants.cc [new file with mode: 0644]
examples/file_reader_constants.h [new file with mode: 0644]
examples/file_reader_factory.cc [new file with mode: 0644]
examples/file_reader_factory.h [new file with mode: 0644]
examples/file_reader_factory_test.cc [new file with mode: 0644]
examples/file_reader_interface.h [new file with mode: 0644]
examples/file_reader_test.cc [new file with mode: 0644]
examples/file_reader_test_common.cc [new file with mode: 0644]
examples/file_reader_test_common.h [new file with mode: 0644]
examples/file_writer.cc [new file with mode: 0644]
examples/file_writer.h [new file with mode: 0644]
examples/file_writer_test.cc [new file with mode: 0644]
examples/gav1_decode.cc [new file with mode: 0644]
examples/gav1_decode_cv_pixel_buffer_pool.cc [new file with mode: 0644]
examples/gav1_decode_cv_pixel_buffer_pool.h [new file with mode: 0644]
examples/ivf_parser.cc [new file with mode: 0644]
examples/ivf_parser.h [new file with mode: 0644]
examples/libgav1_examples.cmake [new file with mode: 0644]
examples/logging.h [new file with mode: 0644]
src/buffer_pool.cc [new file with mode: 0644]
src/buffer_pool.h [new file with mode: 0644]
src/buffer_pool_test.cc [new file with mode: 0644]
src/c_decoder_test.c [new file with mode: 0644]
src/c_version_test.c [new file with mode: 0644]
src/decoder.cc [new file with mode: 0644]
src/decoder_buffer_test.cc [new file with mode: 0644]
src/decoder_impl.cc [new file with mode: 0644]
src/decoder_impl.h [new file with mode: 0644]
src/decoder_settings.cc [new file with mode: 0644]
src/decoder_state.h [new file with mode: 0644]
src/decoder_test.cc [new file with mode: 0644]
src/decoder_test_data.h [new file with mode: 0644]
src/dsp/arm/average_blend_neon.cc [new file with mode: 0644]
src/dsp/arm/average_blend_neon.h [new file with mode: 0644]
src/dsp/arm/cdef_neon.cc [new file with mode: 0644]
src/dsp/arm/cdef_neon.h [new file with mode: 0644]
src/dsp/arm/common_neon.h [new file with mode: 0644]
src/dsp/arm/common_neon_test.cc [new file with mode: 0644]
src/dsp/arm/convolve_10bit_neon.cc [new file with mode: 0644]
src/dsp/arm/convolve_neon.cc [new file with mode: 0644]
src/dsp/arm/convolve_neon.h [new file with mode: 0644]
src/dsp/arm/distance_weighted_blend_neon.cc [new file with mode: 0644]
src/dsp/arm/distance_weighted_blend_neon.h [new file with mode: 0644]
src/dsp/arm/film_grain_neon.cc [new file with mode: 0644]
src/dsp/arm/film_grain_neon.h [new file with mode: 0644]
src/dsp/arm/intra_edge_neon.cc [new file with mode: 0644]
src/dsp/arm/intra_edge_neon.h [new file with mode: 0644]
src/dsp/arm/intrapred_cfl_neon.cc [new file with mode: 0644]
src/dsp/arm/intrapred_cfl_neon.h [new file with mode: 0644]
src/dsp/arm/intrapred_directional_neon.cc [new file with mode: 0644]
src/dsp/arm/intrapred_directional_neon.h [new file with mode: 0644]
src/dsp/arm/intrapred_filter_neon.cc [new file with mode: 0644]
src/dsp/arm/intrapred_filter_neon.h [new file with mode: 0644]
src/dsp/arm/intrapred_neon.cc [new file with mode: 0644]
src/dsp/arm/intrapred_neon.h [new file with mode: 0644]
src/dsp/arm/intrapred_smooth_neon.cc [new file with mode: 0644]
src/dsp/arm/intrapred_smooth_neon.h [new file with mode: 0644]
src/dsp/arm/inverse_transform_10bit_neon.cc [new file with mode: 0644]
src/dsp/arm/inverse_transform_neon.cc [new file with mode: 0644]
src/dsp/arm/inverse_transform_neon.h [new file with mode: 0644]
src/dsp/arm/loop_filter_10bit_neon.cc [new file with mode: 0644]
src/dsp/arm/loop_filter_neon.cc [new file with mode: 0644]
src/dsp/arm/loop_filter_neon.h [new file with mode: 0644]
src/dsp/arm/loop_restoration_10bit_neon.cc [new file with mode: 0644]
src/dsp/arm/loop_restoration_neon.cc [new file with mode: 0644]
src/dsp/arm/loop_restoration_neon.h [new file with mode: 0644]
src/dsp/arm/mask_blend_neon.cc [new file with mode: 0644]
src/dsp/arm/mask_blend_neon.h [new file with mode: 0644]
src/dsp/arm/motion_field_projection_neon.cc [new file with mode: 0644]
src/dsp/arm/motion_field_projection_neon.h [new file with mode: 0644]
src/dsp/arm/motion_vector_search_neon.cc [new file with mode: 0644]
src/dsp/arm/motion_vector_search_neon.h [new file with mode: 0644]
src/dsp/arm/obmc_neon.cc [new file with mode: 0644]
src/dsp/arm/obmc_neon.h [new file with mode: 0644]
src/dsp/arm/super_res_neon.cc [new file with mode: 0644]
src/dsp/arm/super_res_neon.h [new file with mode: 0644]
src/dsp/arm/warp_neon.cc [new file with mode: 0644]
src/dsp/arm/warp_neon.h [new file with mode: 0644]
src/dsp/arm/weight_mask_neon.cc [new file with mode: 0644]
src/dsp/arm/weight_mask_neon.h [new file with mode: 0644]
src/dsp/average_blend.cc [new file with mode: 0644]
src/dsp/average_blend.h [new file with mode: 0644]
src/dsp/average_blend_test.cc [new file with mode: 0644]
src/dsp/cdef.cc [new file with mode: 0644]
src/dsp/cdef.h [new file with mode: 0644]
src/dsp/cdef.inc [new file with mode: 0644]
src/dsp/cdef_test.cc [new file with mode: 0644]
src/dsp/common.h [new file with mode: 0644]
src/dsp/common_dsp_test.cc [new file with mode: 0644]
src/dsp/constants.cc [new file with mode: 0644]
src/dsp/constants.h [new file with mode: 0644]
src/dsp/convolve.cc [new file with mode: 0644]
src/dsp/convolve.h [new file with mode: 0644]
src/dsp/convolve.inc [new file with mode: 0644]
src/dsp/convolve_test.cc [new file with mode: 0644]
src/dsp/distance_weighted_blend.cc [new file with mode: 0644]
src/dsp/distance_weighted_blend.h [new file with mode: 0644]
src/dsp/distance_weighted_blend_test.cc [new file with mode: 0644]
src/dsp/dsp.cc [new file with mode: 0644]
src/dsp/dsp.h [new file with mode: 0644]
src/dsp/dsp_test.cc [new file with mode: 0644]
src/dsp/film_grain.cc [new file with mode: 0644]
src/dsp/film_grain.h [new file with mode: 0644]
src/dsp/film_grain_common.h [new file with mode: 0644]
src/dsp/intra_edge.cc [new file with mode: 0644]
src/dsp/intra_edge.h [new file with mode: 0644]
src/dsp/intra_edge_test.cc [new file with mode: 0644]
src/dsp/intrapred.cc [new file with mode: 0644]
src/dsp/intrapred.h [new file with mode: 0644]
src/dsp/intrapred_cfl.cc [new file with mode: 0644]
src/dsp/intrapred_cfl.h [new file with mode: 0644]
src/dsp/intrapred_cfl_test.cc [new file with mode: 0644]
src/dsp/intrapred_directional.cc [new file with mode: 0644]
src/dsp/intrapred_directional.h [new file with mode: 0644]
src/dsp/intrapred_directional_test.cc [new file with mode: 0644]
src/dsp/intrapred_filter.cc [new file with mode: 0644]
src/dsp/intrapred_filter.h [new file with mode: 0644]
src/dsp/intrapred_filter_test.cc [new file with mode: 0644]
src/dsp/intrapred_smooth.cc [new file with mode: 0644]
src/dsp/intrapred_smooth.h [new file with mode: 0644]
src/dsp/intrapred_test.cc [new file with mode: 0644]
src/dsp/inverse_transform.cc [new file with mode: 0644]
src/dsp/inverse_transform.h [new file with mode: 0644]
src/dsp/inverse_transform.inc [new file with mode: 0644]
src/dsp/inverse_transform_test.cc [new file with mode: 0644]
src/dsp/libgav1_dsp.cmake [new file with mode: 0644]
src/dsp/loop_filter.cc [new file with mode: 0644]
src/dsp/loop_filter.h [new file with mode: 0644]
src/dsp/loop_filter_test.cc [new file with mode: 0644]
src/dsp/loop_restoration.cc [new file with mode: 0644]
src/dsp/loop_restoration.h [new file with mode: 0644]
src/dsp/loop_restoration_test.cc [new file with mode: 0644]
src/dsp/mask_blend.cc [new file with mode: 0644]
src/dsp/mask_blend.h [new file with mode: 0644]
src/dsp/mask_blend_test.cc [new file with mode: 0644]
src/dsp/motion_field_projection.cc [new file with mode: 0644]
src/dsp/motion_field_projection.h [new file with mode: 0644]
src/dsp/motion_field_projection_test.cc [new file with mode: 0644]
src/dsp/motion_vector_search.cc [new file with mode: 0644]
src/dsp/motion_vector_search.h [new file with mode: 0644]
src/dsp/motion_vector_search_test.cc [new file with mode: 0644]
src/dsp/obmc.cc [new file with mode: 0644]
src/dsp/obmc.h [new file with mode: 0644]
src/dsp/obmc.inc [new file with mode: 0644]
src/dsp/obmc_test.cc [new file with mode: 0644]
src/dsp/smooth_weights.inc [new file with mode: 0644]
src/dsp/super_res.cc [new file with mode: 0644]
src/dsp/super_res.h [new file with mode: 0644]
src/dsp/super_res_test.cc [new file with mode: 0644]
src/dsp/warp.cc [new file with mode: 0644]
src/dsp/warp.h [new file with mode: 0644]
src/dsp/warp_test.cc [new file with mode: 0644]
src/dsp/weight_mask.cc [new file with mode: 0644]
src/dsp/weight_mask.h [new file with mode: 0644]
src/dsp/weight_mask_test.cc [new file with mode: 0644]
src/dsp/x86/average_blend_sse4.cc [new file with mode: 0644]
src/dsp/x86/average_blend_sse4.h [new file with mode: 0644]
src/dsp/x86/cdef_avx2.cc [new file with mode: 0644]
src/dsp/x86/cdef_avx2.h [new file with mode: 0644]
src/dsp/x86/cdef_sse4.cc [new file with mode: 0644]
src/dsp/x86/cdef_sse4.h [new file with mode: 0644]
src/dsp/x86/common_avx2.h [new file with mode: 0644]
src/dsp/x86/common_avx2.inc [new file with mode: 0644]
src/dsp/x86/common_avx2_test.cc [new file with mode: 0644]
src/dsp/x86/common_avx2_test.h [new file with mode: 0644]
src/dsp/x86/common_sse4.h [new file with mode: 0644]
src/dsp/x86/common_sse4.inc [new file with mode: 0644]
src/dsp/x86/common_sse4_test.cc [new file with mode: 0644]
src/dsp/x86/common_sse4_test.h [new file with mode: 0644]
src/dsp/x86/convolve_avx2.cc [new file with mode: 0644]
src/dsp/x86/convolve_avx2.h [new file with mode: 0644]
src/dsp/x86/convolve_sse4.cc [new file with mode: 0644]
src/dsp/x86/convolve_sse4.h [new file with mode: 0644]
src/dsp/x86/convolve_sse4.inc [new file with mode: 0644]
src/dsp/x86/distance_weighted_blend_sse4.cc [new file with mode: 0644]
src/dsp/x86/distance_weighted_blend_sse4.h [new file with mode: 0644]
src/dsp/x86/film_grain_sse4.cc [new file with mode: 0644]
src/dsp/x86/film_grain_sse4.h [new file with mode: 0644]
src/dsp/x86/intra_edge_sse4.cc [new file with mode: 0644]
src/dsp/x86/intra_edge_sse4.h [new file with mode: 0644]
src/dsp/x86/intrapred_cfl_sse4.cc [new file with mode: 0644]
src/dsp/x86/intrapred_cfl_sse4.h [new file with mode: 0644]
src/dsp/x86/intrapred_directional_sse4.cc [new file with mode: 0644]
src/dsp/x86/intrapred_directional_sse4.h [new file with mode: 0644]
src/dsp/x86/intrapred_filter_sse4.cc [new file with mode: 0644]
src/dsp/x86/intrapred_filter_sse4.h [new file with mode: 0644]
src/dsp/x86/intrapred_smooth_sse4.cc [new file with mode: 0644]
src/dsp/x86/intrapred_smooth_sse4.h [new file with mode: 0644]
src/dsp/x86/intrapred_sse4.cc [new file with mode: 0644]
src/dsp/x86/intrapred_sse4.h [new file with mode: 0644]
src/dsp/x86/inverse_transform_sse4.cc [new file with mode: 0644]
src/dsp/x86/inverse_transform_sse4.h [new file with mode: 0644]
src/dsp/x86/loop_filter_sse4.cc [new file with mode: 0644]
src/dsp/x86/loop_filter_sse4.h [new file with mode: 0644]
src/dsp/x86/loop_restoration_10bit_avx2.cc [new file with mode: 0644]
src/dsp/x86/loop_restoration_10bit_sse4.cc [new file with mode: 0644]
src/dsp/x86/loop_restoration_avx2.cc [new file with mode: 0644]
src/dsp/x86/loop_restoration_avx2.h [new file with mode: 0644]
src/dsp/x86/loop_restoration_sse4.cc [new file with mode: 0644]
src/dsp/x86/loop_restoration_sse4.h [new file with mode: 0644]
src/dsp/x86/mask_blend_sse4.cc [new file with mode: 0644]
src/dsp/x86/mask_blend_sse4.h [new file with mode: 0644]
src/dsp/x86/motion_field_projection_sse4.cc [new file with mode: 0644]
src/dsp/x86/motion_field_projection_sse4.h [new file with mode: 0644]
src/dsp/x86/motion_vector_search_sse4.cc [new file with mode: 0644]
src/dsp/x86/motion_vector_search_sse4.h [new file with mode: 0644]
src/dsp/x86/obmc_sse4.cc [new file with mode: 0644]
src/dsp/x86/obmc_sse4.h [new file with mode: 0644]
src/dsp/x86/super_res_sse4.cc [new file with mode: 0644]
src/dsp/x86/super_res_sse4.h [new file with mode: 0644]
src/dsp/x86/transpose_sse4.h [new file with mode: 0644]
src/dsp/x86/warp_sse4.cc [new file with mode: 0644]
src/dsp/x86/warp_sse4.h [new file with mode: 0644]
src/dsp/x86/weight_mask_sse4.cc [new file with mode: 0644]
src/dsp/x86/weight_mask_sse4.h [new file with mode: 0644]
src/film_grain.cc [new file with mode: 0644]
src/film_grain.h [new file with mode: 0644]
src/film_grain_test.cc [new file with mode: 0644]
src/frame_buffer.cc [new file with mode: 0644]
src/frame_buffer_utils.h [new file with mode: 0644]
src/frame_scratch_buffer.h [new file with mode: 0644]
src/gav1/decoder.h [new file with mode: 0644]
src/gav1/decoder_buffer.h [new file with mode: 0644]
src/gav1/decoder_settings.h [new file with mode: 0644]
src/gav1/frame_buffer.h [new file with mode: 0644]
src/gav1/status_code.h [new file with mode: 0644]
src/gav1/symbol_visibility.h [new file with mode: 0644]
src/gav1/version.h [new file with mode: 0644]
src/inter_intra_masks.inc [new file with mode: 0644]
src/internal_frame_buffer_list.cc [new file with mode: 0644]
src/internal_frame_buffer_list.h [new file with mode: 0644]
src/internal_frame_buffer_list_test.cc [new file with mode: 0644]
src/libgav1_decoder.cmake [new file with mode: 0644]
src/loop_restoration_info.cc [new file with mode: 0644]
src/loop_restoration_info.h [new file with mode: 0644]
src/motion_vector.cc [new file with mode: 0644]
src/motion_vector.h [new file with mode: 0644]
src/obu_parser.cc [new file with mode: 0644]
src/obu_parser.h [new file with mode: 0644]
src/obu_parser_test.cc [new file with mode: 0644]
src/post_filter.h [new file with mode: 0644]
src/post_filter/cdef.cc [new file with mode: 0644]
src/post_filter/deblock.cc [new file with mode: 0644]
src/post_filter/deblock_thresholds.inc [new file with mode: 0644]
src/post_filter/loop_restoration.cc [new file with mode: 0644]
src/post_filter/post_filter.cc [new file with mode: 0644]
src/post_filter/super_res.cc [new file with mode: 0644]
src/post_filter_test.cc [new file with mode: 0644]
src/prediction_mask.cc [new file with mode: 0644]
src/prediction_mask.h [new file with mode: 0644]
src/prediction_mask_test.cc [new file with mode: 0644]
src/quantizer.cc [new file with mode: 0644]
src/quantizer.h [new file with mode: 0644]
src/quantizer_tables.inc [new file with mode: 0644]
src/quantizer_test.cc [new file with mode: 0644]
src/reconstruction.cc [new file with mode: 0644]
src/reconstruction.h [new file with mode: 0644]
src/reconstruction_test.cc [new file with mode: 0644]
src/residual_buffer_pool.cc [new file with mode: 0644]
src/residual_buffer_pool.h [new file with mode: 0644]
src/residual_buffer_pool_test.cc [new file with mode: 0644]
src/scan_tables.inc [new file with mode: 0644]
src/scan_test.cc [new file with mode: 0644]
src/status_code.cc [new file with mode: 0644]
src/symbol_decoder_context.cc [new file with mode: 0644]
src/symbol_decoder_context.h [new file with mode: 0644]
src/symbol_decoder_context_cdfs.inc [new file with mode: 0644]
src/symbol_decoder_context_test.cc [new file with mode: 0644]
src/threading_strategy.cc [new file with mode: 0644]
src/threading_strategy.h [new file with mode: 0644]
src/threading_strategy_test.cc [new file with mode: 0644]
src/tile.h [new file with mode: 0644]
src/tile/bitstream/mode_info.cc [new file with mode: 0644]
src/tile/bitstream/palette.cc [new file with mode: 0644]
src/tile/bitstream/partition.cc [new file with mode: 0644]
src/tile/bitstream/transform_size.cc [new file with mode: 0644]
src/tile/prediction.cc [new file with mode: 0644]
src/tile/tile.cc [new file with mode: 0644]
src/tile_scratch_buffer.cc [new file with mode: 0644]
src/tile_scratch_buffer.h [new file with mode: 0644]
src/utils/array_2d.h [new file with mode: 0644]
src/utils/array_2d_test.cc [new file with mode: 0644]
src/utils/bit_mask_set.h [new file with mode: 0644]
src/utils/bit_reader.cc [new file with mode: 0644]
src/utils/bit_reader.h [new file with mode: 0644]
src/utils/block_parameters_holder.cc [new file with mode: 0644]
src/utils/block_parameters_holder.h [new file with mode: 0644]
src/utils/block_parameters_holder_test.cc [new file with mode: 0644]
src/utils/blocking_counter.h [new file with mode: 0644]
src/utils/blocking_counter_test.cc [new file with mode: 0644]
src/utils/common.h [new file with mode: 0644]
src/utils/common_test.cc [new file with mode: 0644]
src/utils/compiler_attributes.h [new file with mode: 0644]
src/utils/constants.cc [new file with mode: 0644]
src/utils/constants.h [new file with mode: 0644]
src/utils/cpu.cc [new file with mode: 0644]
src/utils/cpu.h [new file with mode: 0644]
src/utils/cpu_test.cc [new file with mode: 0644]
src/utils/dynamic_buffer.h [new file with mode: 0644]
src/utils/entropy_decoder.cc [new file with mode: 0644]
src/utils/entropy_decoder.h [new file with mode: 0644]
src/utils/entropy_decoder_test.cc [new file with mode: 0644]
src/utils/entropy_decoder_test_data.inc [new file with mode: 0644]
src/utils/executor.cc [new file with mode: 0644]
src/utils/executor.h [new file with mode: 0644]
src/utils/libgav1_utils.cmake [new file with mode: 0644]
src/utils/logging.cc [new file with mode: 0644]
src/utils/logging.h [new file with mode: 0644]
src/utils/memory.h [new file with mode: 0644]
src/utils/memory_test.cc [new file with mode: 0644]
src/utils/queue.h [new file with mode: 0644]
src/utils/queue_test.cc [new file with mode: 0644]
src/utils/raw_bit_reader.cc [new file with mode: 0644]
src/utils/raw_bit_reader.h [new file with mode: 0644]
src/utils/raw_bit_reader_test.cc [new file with mode: 0644]
src/utils/reference_info.h [new file with mode: 0644]
src/utils/segmentation.cc [new file with mode: 0644]
src/utils/segmentation.h [new file with mode: 0644]
src/utils/segmentation_map.cc [new file with mode: 0644]
src/utils/segmentation_map.h [new file with mode: 0644]
src/utils/segmentation_map_test.cc [new file with mode: 0644]
src/utils/segmentation_test.cc [new file with mode: 0644]
src/utils/stack.h [new file with mode: 0644]
src/utils/stack_test.cc [new file with mode: 0644]
src/utils/threadpool.cc [new file with mode: 0644]
src/utils/threadpool.h [new file with mode: 0644]
src/utils/threadpool_test.cc [new file with mode: 0644]
src/utils/types.h [new file with mode: 0644]
src/utils/unbounded_queue.h [new file with mode: 0644]
src/utils/unbounded_queue_test.cc [new file with mode: 0644]
src/utils/vector.h [new file with mode: 0644]
src/utils/vector_test.cc [new file with mode: 0644]
src/version.cc [new file with mode: 0644]
src/version_test.cc [new file with mode: 0644]
src/warp_prediction.cc [new file with mode: 0644]
src/warp_prediction.h [new file with mode: 0644]
src/warp_prediction_test.cc [new file with mode: 0644]
src/yuv_buffer.cc [new file with mode: 0644]
src/yuv_buffer.h [new file with mode: 0644]
tests/block_utils.cc [new file with mode: 0644]
tests/block_utils.h [new file with mode: 0644]
tests/data/five-frames.ivf [new file with mode: 0644]
tests/data/ivf-header-and-truncated-frame-header [new file with mode: 0644]
tests/data/ivf-header-only [new file with mode: 0644]
tests/data/ivf-signature-only [new file with mode: 0644]
tests/data/one-frame-large-timestamp.ivf [new file with mode: 0644]
tests/data/one-frame-truncated.ivf [new file with mode: 0644]
tests/data/one-frame.ivf [new file with mode: 0644]
tests/fuzzer/decoder_fuzzer.cc [new file with mode: 0644]
tests/fuzzer/decoder_fuzzer_frame_parallel.cc [new file with mode: 0644]
tests/fuzzer/fuzzer_temp_file.h [new file with mode: 0644]
tests/fuzzer/obu_parser_fuzzer.cc [new file with mode: 0644]
tests/libgav1_tests.cmake [new file with mode: 0644]
tests/third_party/libvpx/LICENSE [new file with mode: 0644]
tests/third_party/libvpx/acm_random.h [new file with mode: 0644]
tests/third_party/libvpx/md5_helper.h [new file with mode: 0644]
tests/third_party/libvpx/md5_utils.cc [new file with mode: 0644]
tests/third_party/libvpx/md5_utils.h [new file with mode: 0644]
tests/utils.cc [new file with mode: 0644]
tests/utils.h [new file with mode: 0644]
tests/utils_test.cc [new file with mode: 0644]

diff --git a/.cmake-format.py b/.cmake-format.py
new file mode 100644 (file)
index 0000000..90499e5
--- /dev/null
+++ b/.cmake-format.py
@@ -0,0 +1,126 @@
+# Generated with cmake-format 0.5.4
+# --------------------------
+# General Formatting Options
+# --------------------------
+# How wide to allow formatted cmake files
+line_width = 80
+
+# How many spaces to tab for indent
+tab_size = 2
+
+# If arglists are longer than this, break them always
+max_subargs_per_line = 10
+
+# If true, separate flow control names from their parentheses with a space
+separate_ctrl_name_with_space = False
+
+# If true, separate function names from parentheses with a space
+separate_fn_name_with_space = False
+
+# If a statement is wrapped to more than one line, then dangle the closing
+# parenthesis on its own line
+dangle_parens = False
+
+# If the statement spelling length (including space and parentheses) is larger
+# than the tab width by more than this amount, then force-reject un-nested
+# layouts.
+max_prefix_chars = 2
+
+# If a candidate layout is wrapped horizontally but it exceeds this many lines,
+# then reject the layout.
+max_lines_hwrap = 2
+
+# What style line endings to use in the output.
+line_ending = 'unix'
+
+# Format command names consistently as 'lower' or 'upper' case
+command_case = 'lower'
+
+# Format keywords consistently as 'lower' or 'upper' case
+keyword_case = 'unchanged'
+
+# Specify structure for custom cmake functions
+additional_commands = {
+  "foo": {
+    "flags": [
+      "BAR",
+      "BAZ"
+    ],
+    "kwargs": {
+      "HEADERS": "*",
+      "SOURCES": "*",
+      "DEPENDS": "*"
+    }
+  }
+}
+
+# A list of command names which should always be wrapped
+always_wrap = []
+
+# Specify the order of wrapping algorithms during successive reflow attempts
+algorithm_order = [0, 1, 2, 3, 4]
+
+# If true, the argument lists which are known to be sortable will be sorted
+# lexicographically
+enable_sort = False
+
+# If true, the parsers may infer whether or not an argument list is sortable
+# (without annotation).
+autosort = False
+
+# If a comment line starts with at least this many consecutive hash characters,
+# then don't lstrip() them off. This allows for lazy hash rulers where the first
+# hash char is not separated by space
+hashruler_min_length = 10
+
+# A dictionary containing any per-command configuration overrides. Currently
+# only `command_case` is supported.
+per_command = {}
+
+
+# --------------------------
+# Comment Formatting Options
+# --------------------------
+# What character to use for bulleted lists
+bullet_char = '*'
+
+# What character to use as punctuation after numerals in an enumerated list
+enum_char = '.'
+
+# enable comment markup parsing and reflow
+enable_markup = True
+
+# If comment markup is enabled, don't reflow the first comment block in each
+# listfile. Use this to preserve formatting of your copyright/license
+# statements.
+first_comment_is_literal = True
+
+# If comment markup is enabled, don't reflow any comment block which matches
+# this (regex) pattern. Default is `None` (disabled).
+literal_comment_pattern = None
+
+# Regular expression to match preformat fences in comments
+# default=r'^\s*([`~]{3}[`~]*)(.*)$'
+fence_pattern = '^\\s*([`~]{3}[`~]*)(.*)$'
+
+# Regular expression to match rulers in comments
+# default=r'^\s*[^\w\s]{3}.*[^\w\s]{3}$'
+ruler_pattern = '^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$'
+
+# If true, then insert a space between the first hash char and remaining hash
+# chars in a hash ruler, and normalize its length to fill the column
+canonicalize_hashrulers = True
+
+
+# ---------------------------------
+# Miscellaneous Options
+# ---------------------------------
+# If true, emit the unicode byte-order mark (BOM) at the start of the file
+emit_byteorder_mark = False
+
+# Specify the encoding of the input file. Defaults to utf-8.
+input_encoding = 'utf-8'
+
+# Specify the encoding of the output file. Defaults to utf-8. Note that cmake
+# only claims to support utf-8 so be careful when using anything else
+output_encoding = 'utf-8'
diff --git a/.gitattributes b/.gitattributes
new file mode 100644 (file)
index 0000000..b934084
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+* whitespace=tab-in-indent,space-before-tab,trailing-space
diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..87ccf24
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/build
+/third_party
diff --git a/AUTHORS b/AUTHORS
new file mode 100644 (file)
index 0000000..d92ea0a
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,6 @@
+# This is the list of libgav1 authors for copyright purposes.
+#
+# This does not necessarily list everyone who has contributed code, since in
+# some cases, their employer may be the copyright holder.  To see the full list
+# of contributors, see the revision history in source control.
+Google LLC
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644 (file)
index 0000000..73f27a1
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,170 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# libgav1 requires modern CMake.
+cmake_minimum_required(VERSION 3.7.1 FATAL_ERROR)
+
+# libgav1 requires C++11.
+set(CMAKE_CXX_STANDARD 11)
+set(ABSL_CXX_STANDARD 11)
+# libgav1 requires C99.
+set(CMAKE_C_STANDARD 99)
+
+project(libgav1 CXX C)
+
+set(libgav1_root "${CMAKE_CURRENT_SOURCE_DIR}")
+set(libgav1_build "${CMAKE_BINARY_DIR}")
+
+if("${libgav1_root}" STREQUAL "${libgav1_build}")
+  message(
+    FATAL_ERROR
+      "Building from within the libgav1 source tree is not supported.\n"
+      "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n"
+      "$ mkdir -p ../libgav1_build\n" "$ cd ../libgav1_build\n"
+      "And re-run CMake from the libgav1_build directory.")
+endif()
+
+set(libgav1_examples "${libgav1_root}/examples")
+set(libgav1_source "${libgav1_root}/src")
+
+include("${libgav1_root}/cmake/libgav1_options.cmake")
+
+libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
+               "Enables optimized code." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING "Enables avx2 optimizations."
+               VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
+               VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
+               "Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_EXAMPLES HELPSTRING "Enables examples." VALUE
+               ON)
+libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON)
+libgav1_option(
+  NAME LIBGAV1_VERBOSE HELPSTRING
+  "Enables verbose build system output. Higher numbers are more verbose." VALUE
+  OFF)
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Enable generators like Xcode and Visual Studio to place projects in folders.
+get_property(use_folders_is_set GLOBAL PROPERTY USE_FOLDERS SET)
+if(NOT use_folders_is_set)
+  set_property(GLOBAL PROPERTY USE_FOLDERS TRUE)
+endif()
+
+include(FindThreads)
+
+include("${libgav1_examples}/libgav1_examples.cmake")
+include("${libgav1_root}/cmake/libgav1_build_definitions.cmake")
+include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake")
+include("${libgav1_root}/cmake/libgav1_flags.cmake")
+include("${libgav1_root}/cmake/libgav1_helpers.cmake")
+include("${libgav1_root}/cmake/libgav1_install.cmake")
+include("${libgav1_root}/cmake/libgav1_intrinsics.cmake")
+include("${libgav1_root}/cmake/libgav1_sanitizer.cmake")
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+include("${libgav1_root}/cmake/libgav1_variables.cmake")
+include("${libgav1_root}/tests/libgav1_tests.cmake")
+include("${libgav1_source}/dsp/libgav1_dsp.cmake")
+include("${libgav1_source}/libgav1_decoder.cmake")
+include("${libgav1_source}/utils/libgav1_utils.cmake")
+
+libgav1_optimization_detect()
+libgav1_set_build_definitions()
+libgav1_set_cxx_flags()
+libgav1_configure_sanitizer()
+
+# Supported bit depth.
+libgav1_track_configuration_variable(LIBGAV1_MAX_BITDEPTH)
+
+# C++ and linker flags.
+libgav1_track_configuration_variable(LIBGAV1_CXX_FLAGS)
+libgav1_track_configuration_variable(LIBGAV1_EXE_LINKER_FLAGS)
+
+# Sanitizer integration.
+libgav1_track_configuration_variable(LIBGAV1_SANITIZE)
+
+# Generated source file directory.
+libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+
+# Controls use of std::mutex and absl::Mutex in ThreadPool.
+libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
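+# Use Abseil threading when the variable is explicitly set to 0, or when it is
+# left undefined on platforms other than Android/iOS (which default to
+# std::mutex).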
+if((DEFINED
+    LIBGAV1_THREADPOOL_USE_STD_MUTEX
+    AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+   OR NOT (DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX OR ANDROID OR IOS))
+  set(use_absl_threading TRUE)
+endif()
+
+if(LIBGAV1_VERBOSE)
+  libgav1_dump_cmake_flag_variables()
+  libgav1_dump_tracked_configuration_variables()
+  libgav1_dump_options()
+endif()
+
+set(libgav1_abseil_build "${libgav1_build}/abseil")
+set(libgav1_gtest_build "${libgav1_build}/gtest")
+
+# Compiler/linker flags must be lists, but come in from the environment as
+# strings. Break them up:
+if(NOT "${LIBGAV1_CXX_FLAGS}" STREQUAL "")
+  separate_arguments(LIBGAV1_CXX_FLAGS)
+endif()
+if(NOT "${LIBGAV1_EXE_LINKER_FLAGS}" STREQUAL "")
+  separate_arguments(LIBGAV1_EXE_LINKER_FLAGS)
+endif()
+
+# Set test-only flags based on LIBGAV1_CXX_FLAGS.
+libgav1_set_test_flags()
+
+set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp")
+if(EXISTS "${libgav1_abseil}")
+  set(ABSL_PROPAGATE_CXX_STD ON)
+  add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}"
+                   EXCLUDE_FROM_ALL)
+else()
+  if(use_absl_threading OR LIBGAV1_ENABLE_EXAMPLES OR LIBGAV1_ENABLE_TESTS)
+    message(
+      FATAL_ERROR
+        "Abseil not found. This dependency is required by the"
+        " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is"
+        " not defined. To continue, download the Abseil repository to"
+        " third_party/abseil-cpp:\n  git \\\n    -C ${libgav1_root} \\\n"
+        "    clone -b 20220623.0 --depth 1 \\\n"
+        "    https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp")
+  endif()
+endif()
+
+libgav1_reset_target_lists()
+libgav1_add_dsp_targets()
+libgav1_add_decoder_targets()
+libgav1_add_examples_targets()
+libgav1_add_tests_targets()
+libgav1_add_utils_targets()
+libgav1_setup_install_target()
+
+if(LIBGAV1_ENABLE_TESTS)
+  # include(CTest) and -DBUILD_TESTING=1 are intentionally not used here, to
+  # avoid enabling abseil's tests.
+  enable_testing()
+endif()
+
+if(LIBGAV1_VERBOSE)
+  libgav1_dump_cmake_flag_variables()
+  libgav1_dump_tracked_configuration_variables()
+  libgav1_dump_options()
+endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644 (file)
index 0000000..69140ff
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,27 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use a [Gerrit](https://www.gerritcodereview.com) instance hosted at
+https://chromium-review.googlesource.com for this purpose.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/LICENSE b/LICENSE
new file mode 100644 (file)
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644 (file)
index 0000000..bdf598c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,197 @@
+# libgav1 -- an AV1 decoder
+
+libgav1 is a Main profile (0), High profile (1) & Professional profile (2)
+compliant AV1 decoder. More information on the AV1 video format can be found at
+[aomedia.org](https://aomedia.org).
+
+[TOC]
+
+## Building
+
+### Prerequisites
+
+1.  A C++11 compiler. gcc 6+, clang 7+ or Microsoft Visual Studio 2017+ are
+    recommended.
+
+2.  [CMake >= 3.7.1](https://cmake.org/download/)
+
+3.  [Abseil](https://abseil.io)
+
+    From within the libgav1 directory:
+
+    ```shell
+    $ git clone -b 20220623.0 --depth 1 \
+      https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+    ```
+
+    Note: Abseil is required by the examples and tests. libgav1 will depend on
+    it if `LIBGAV1_THREADPOOL_USE_STD_MUTEX` is set to `0` (see below).
+
+4.  (Optional) [GoogleTest](https://github.com/google/googletest)
+
+    From within the libgav1 directory:
+
+    ```shell
+    $ git clone -b release-1.12.1 --depth 1 \
+      https://github.com/google/googletest.git third_party/googletest
+    ```
+
+### Compile
+
+```shell
+  $ mkdir build && cd build
+  $ cmake -G "Unix Makefiles" ..
+  $ make
+```
+
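+The toolchain files under `cmake/toolchains` can also be used for
+cross-compiling; a minimal sketch, assuming the matching GNU cross toolchain
+is installed (standard `CMAKE_TOOLCHAIN_FILE` usage rather than a documented
+upstream recipe):
+
+```shell
+  $ cmake .. \
+    -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/aarch64-linux-gnu.cmake
+  $ make
+```
+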
+Configuration options:
+
+*   `LIBGAV1_MAX_BITDEPTH`: defines the maximum supported bitdepth (8, 10, 12;
+    default: 12).
+*   `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS`: define to a non-zero value to disable
+    [symbol reduction](#symbol-reduction) in an optimized build to keep all
+    versions of dsp functions available. Automatically defined in
+    `src/dsp/dsp.h` if unset.
+*   `LIBGAV1_ENABLE_AVX2`: define to a non-zero value to enable avx2
+    optimizations. Automatically defined in `src/utils/cpu.h` if unset.
+*   `LIBGAV1_ENABLE_NEON`: define to a non-zero value to enable NEON
+    optimizations. Automatically defined in `src/utils/cpu.h` if unset.
+*   `LIBGAV1_ENABLE_SSE4_1`: define to a non-zero value to enable sse4.1
+    optimizations. Automatically defined in `src/utils/cpu.h` if unset. Note:
+    setting this to 0 also disables AVX2.
+*   `LIBGAV1_ENABLE_LOGGING`: define to 0/1 to control debug logging.
+    Automatically defined in `src/utils/logging.h` if unset.
+*   `LIBGAV1_EXAMPLES_ENABLE_LOGGING`: define to 0/1 to control error logging in
+    the examples. Automatically defined in `examples/logging.h` if unset.
+*   `LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK`: define to 1 to enable transform
+    coefficient range checks.
+*   `LIBGAV1_LOG_LEVEL`: controls the maximum allowed log level, see `enum
+    LogSeverity` in `src/utils/logging.h`. Automatically defined in
+    `src/utils/logging.cc` if unset.
+*   `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and
+    absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
+    dependency from the core library. Automatically defined in
+    `src/utils/threadpool.h` if unset. Defaults to 1 on Android & iOS, 0
+    otherwise.
+*   `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is
+    allowed to create. Must be an integer > 0; otherwise it is ignored. The
+    default value is 128.
+*   `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier
+    used to determine when to use frame parallel decoding: frame parallel
+    decoding is used if |threads| > |tile_count| * this multiplier. Must be an
+    integer > 0. The default value is 4. This is an advanced setting intended
+    for testing purposes.
+
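+As a hedged example, these can be passed as cache variables on the cmake
+command line from the build directory (the values here are illustrative; the
+option names come from the list above):
+
+```shell
+  $ cmake .. -DLIBGAV1_MAX_BITDEPTH=10 -DLIBGAV1_ENABLE_AVX2=0 \
+    -DLIBGAV1_THREADPOOL_USE_STD_MUTEX=1
+```
+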
+For additional options see:
+
+```shell
+  $ cmake .. -LH
+```
+
+## Testing
+
+*   `gav1_decode` can be used to decode IVF files, see `gav1_decode --help` for
+    options. Note: tools like [FFmpeg](https://ffmpeg.org) can be used to
+    convert other container formats to IVF.
+
+*   Unit tests are built when `LIBGAV1_ENABLE_TESTS` is set to `1`. The binaries
+    can be invoked directly or with
+    [`ctest`](https://cmake.org/cmake/help/latest/manual/ctest.1.html).
+
+    *   The test input location can be given by setting the
+        `LIBGAV1_TEST_DATA_PATH` environment variable; it defaults to
+        `<libgav1_src>/tests/data`, where `<libgav1_src>` is `/data/local/tmp`
+        on Android platforms or the source directory configured with cmake
+        otherwise.
+
+    *   Output is written to the value of the `TMPDIR` or `TEMP` environment
+        variable (in that order) if set; otherwise to `/data/local/tmp` on
+        Android platforms; otherwise to the value of `LIBGAV1_FLAGS_TMPDIR` if
+        defined during compilation, or to the current directory if not.
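+
+A hedged end-to-end sketch tying these together (file names, the input
+containing an AV1 stream, and the binary's location in the build directory are
+all assumptions; no `gav1_decode` flags beyond `--help` are assumed, and
+`ctest --output-on-failure` is standard CMake):
+
+```shell
+  # Convert an AV1 stream to IVF with FFmpeg, then decode it.
+  $ ffmpeg -i clip.webm -c:v copy clip.ivf
+  $ ./gav1_decode clip.ivf
+  # Run the unit tests against the bundled test data.
+  $ LIBGAV1_TEST_DATA_PATH=../tests/data ctest --output-on-failure
+```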
+
+## Development
+
+### Contributing
+
+See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to submit patches.
+
+### Style
+
+libgav1 follows the
+[Google C++ style guide](https://google.github.io/styleguide/cppguide.html) with
+formatting enforced by `clang-format`.
+
+### Comments
+
+Comments of the form '`// X.Y(.Z).`', '`Section X.Y(.Z).`' or '`... in the
+spec`' reference the relevant section(s) in the
+[AV1 specification](http://aomediacodec.github.io/av1-spec/av1-spec.pdf).
+
+### DSP structure
+
+*   `src/dsp/dsp.cc` defines the main entry point: `libgav1::dsp::DspInit()`.
+    This handles CPU detection and initializes each logical unit, which
+    populates the `libgav1::dsp::Dsp` function tables.
+*   `src/dsp/dsp.h` contains function and type definitions for all logical units
+    (e.g., intra-predictors)
+*   `src/utils/cpu.h` contains definitions for cpu-detection
+*   base implementations are located in `src/dsp/*.{h,cc}` with platform
+    specific optimizations in sub-folders
+*   unit tests define `DISABLED_Speed` test(s) to allow timing of individual
+    functions
+
+#### Symbol reduction
+
+Based on the build configuration, unneeded lesser optimizations are removed
+using a hierarchical include and define system. Each logical unit in `src/dsp`
+should include all platform specific headers in descending order to allow
+higher level optimizations to disable lower level ones. See
+`src/dsp/loop_filter.h` for an example.
+
+Each function receives a new define which can be checked in platform specific
+headers. The format is: `LIBGAV1_<Dsp-table>_FunctionName` or
+`LIBGAV1_<Dsp-table>_[sub-table-index1][...-indexN]`, e.g.,
+`LIBGAV1_Dsp8bpp_AverageBlend`,
+`LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc`. The Dsp-table name is of
+the form `Dsp<bitdepth>bpp` e.g. `Dsp10bpp` for bitdepth == 10 (bpp stands for
+bits per pixel). The indices correspond to enum values used as lookups with
+leading 'k' removed. Platform specific headers should then first check whether
+the symbol is defined and, if not, set it to the corresponding
+`LIBGAV1_CPU_<arch>` value from `src/utils/cpu.h`.
+
+```
+  #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+  #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+  #endif
+```
+
+Within each module, the code should check whether the symbol is defined as its
+specific architecture or forced via `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS` before
+defining the function. The `DSP_ENABLED_(8|10)BPP_*` macros are available to
+simplify this check for optimized code.
+
+```
+  #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+  ...
+
+  // In unoptimized code use the following structure; there's no equivalent
+  // define for LIBGAV1_CPU_C as it would require duplicating the function
+  // defines used in optimized code for only a small benefit to this
+  // boilerplate.
+  #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  ...
+  #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+  ...
+```
+
+## Bugs
+
+Please report all bugs to the issue tracker:
+https://issuetracker.google.com/issues/new?component=750480&template=1355007
+
+## Discussion
+
+Email: gav1-devel@googlegroups.com
+
+Web: https://groups.google.com/forum/#!forum/gav1-devel
diff --git a/cmake/libgav1-config.cmake.template b/cmake/libgav1-config.cmake.template
new file mode 100644 (file)
index 0000000..dc253d3
--- /dev/null
+++ b/cmake/libgav1-config.cmake.template
@@ -0,0 +1,2 @@
+set(LIBGAV1_INCLUDE_DIRS "@LIBGAV1_INCLUDE_DIRS@")
+set(LIBGAV1_LIBRARIES "gav1")
diff --git a/cmake/libgav1.pc.template b/cmake/libgav1.pc.template
new file mode 100644 (file)
index 0000000..c571a43
--- /dev/null
+++ b/cmake/libgav1.pc.template
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: @PROJECT_NAME@
+Description: AV1 decoder library (@LIBGAV1_MAX_BITDEPTH@-bit).
+Version: @LIBGAV1_VERSION@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lgav1
+Libs.private: @CMAKE_THREAD_LIBS_INIT@
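+
+# Illustrative (not part of the upstream template): once installed, consumers
+# can build against the library in the usual pkg-config way, e.g.
+#   c++ app.cc $(pkg-config --cflags --libs libgav1)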
diff --git a/cmake/libgav1_build_definitions.cmake b/cmake/libgav1_build_definitions.cmake
new file mode 100644 (file)
index 0000000..1465679
--- /dev/null
+++ b/cmake/libgav1_build_definitions.cmake
@@ -0,0 +1,168 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_ 1)
+
+macro(libgav1_set_build_definitions)
+  string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
+
+  libgav1_load_version_info()
+
+  # Library version info. See the libtool docs for updating the values:
+  # https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
+  #
+  # c=<current>, r=<revision>, a=<age>
+  #
+  # libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is
+  # passed to libtool.
+  #
+  # We set LIBGAV1_SOVERSION = [c-a].a.r
+  set(LT_CURRENT 1)
+  set(LT_REVISION 0)
+  set(LT_AGE 0)
+  math(EXPR LIBGAV1_SOVERSION_MAJOR "${LT_CURRENT} - ${LT_AGE}")
+  set(LIBGAV1_SOVERSION "${LIBGAV1_SOVERSION_MAJOR}.${LT_AGE}.${LT_REVISION}")
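+  # e.g. with c=1, a=0, r=0 above: major = 1 - 0 = 1, so LIBGAV1_SOVERSION is
+  # "1.0.0".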
+  unset(LT_CURRENT)
+  unset(LT_REVISION)
+  unset(LT_AGE)
+
+  list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src"
+              "${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp")
+  list(APPEND libgav1_gtest_include_paths
+              "third_party/googletest/googlemock/include"
+              "third_party/googletest/googletest/include"
+              "third_party/googletest/googletest")
+  list(APPEND libgav1_test_include_paths ${libgav1_include_paths}
+              ${libgav1_gtest_include_paths})
+  list(APPEND libgav1_defines "LIBGAV1_CMAKE=1"
+              "LIBGAV1_FLAGS_SRCDIR=\"${libgav1_root}\""
+              "LIBGAV1_FLAGS_TMPDIR=\"/tmp\"")
+
+  if(MSVC OR WIN32)
+    list(APPEND libgav1_defines "_CRT_SECURE_NO_WARNINGS" "NOMINMAX"
+                "_SCL_SECURE_NO_WARNINGS")
+  endif()
+
+  if(ANDROID)
+    if(CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a")
+      set(CMAKE_ANDROID_ARM_MODE ON)
+    endif()
+
+    if(build_type_lowercase MATCHES "rel")
+      list(APPEND libgav1_base_cxx_flags "-fno-stack-protector")
+    endif()
+  endif()
+
+  list(APPEND libgav1_base_cxx_flags "-Wall" "-Wextra" "-Wmissing-declarations"
+              "-Wno-sign-compare" "-fvisibility=hidden"
+              "-fvisibility-inlines-hidden")
+
+  if(BUILD_SHARED_LIBS)
+    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+    set(libgav1_dependency libgav1_shared)
+  else()
+    set(libgav1_dependency libgav1_static)
+  endif()
+
+  list(APPEND libgav1_clang_cxx_flags "-Wextra-semi" "-Wmissing-prototypes"
+              "-Wshorten-64-to-32")
+
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6")
+      # Quiet warnings in copy-list-initialization where {} elision has always
+      # been allowed.
+      list(APPEND libgav1_clang_cxx_flags "-Wno-missing-braces")
+    endif()
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8)
+      list(APPEND libgav1_clang_cxx_flags "-Wextra-semi-stmt")
+    endif()
+  endif()
+
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "7")
+      # Quiet warnings due to potential snprintf() truncation in threadpool.cc.
+      list(APPEND libgav1_base_cxx_flags "-Wno-format-truncation")
+
+      if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7")
+        # Quiet gcc 6 vs 7 abi warnings:
+        # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+        list(APPEND libgav1_base_cxx_flags "-Wno-psabi")
+        list(APPEND ABSL_GCC_FLAGS "-Wno-psabi")
+      endif()
+    endif()
+  endif()
+
+  if(build_type_lowercase MATCHES "rel")
+    list(APPEND libgav1_base_cxx_flags "-Wframe-larger-than=196608")
+  endif()
+
+  list(APPEND libgav1_msvc_cxx_flags
+              # Warning level 3.
+              "/W3"
+              # Disable warning C4018:
+              # '<comparison operator>' signed/unsigned mismatch
+              "/wd4018"
+              # Disable warning C4244:
+              # 'argument': conversion from '<double/int>' to
+              # '<float/smaller int type>', possible loss of data
+              "/wd4244"
+              # Disable warning C4267:
+              # '=': conversion from '<double/int>' to
+              # '<float/smaller int type>', possible loss of data
+              "/wd4267"
+              # Disable warning C4309:
+              # 'argument': truncation of constant value
+              "/wd4309"
+              # Disable warning C4551:
+              # function call missing argument list
+              "/wd4551")
+
+  if(BUILD_SHARED_LIBS)
+    list(APPEND libgav1_msvc_cxx_flags
+                # Disable warning C4251:
+                # 'libgav1::DecoderImpl class member' needs to have
+                # dll-interface to be used by clients of class
+                # 'libgav1::Decoder'.
+                "/wd4251")
+  endif()
+
+  if(NOT LIBGAV1_MAX_BITDEPTH)
+    set(LIBGAV1_MAX_BITDEPTH 12)
+  elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8
+         AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10
+         AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 12)
+    libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12.")
+  endif()
+
+  list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}")
+
+  if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+    if(NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 0
+       AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 1)
+      libgav1_die("LIBGAV1_THREADPOOL_USE_STD_MUTEX must be 0 or 1.")
+    endif()
+
+    list(APPEND libgav1_defines
+         "LIBGAV1_THREADPOOL_USE_STD_MUTEX=${LIBGAV1_THREADPOOL_USE_STD_MUTEX}")
+  endif()
+
+  # Source file names ending in these suffixes will have the appropriate
+  # compiler flags added to their compile commands to enable intrinsics.
+  set(libgav1_avx2_source_file_suffix "avx2(_test)?.cc")
+  set(libgav1_neon_source_file_suffix "neon(_test)?.cc")
+  set(libgav1_sse4_source_file_suffix "sse4(_test)?.cc")
+endmacro()
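+
+# Configure-time usage sketch (the source path is a placeholder; the values
+# shown are among those validated above):
+#   cmake path/to/libgav1 -DLIBGAV1_MAX_BITDEPTH=10 \
+#     -DLIBGAV1_THREADPOOL_USE_STD_MUTEX=1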
diff --git a/cmake/libgav1_cpu_detection.cmake b/cmake/libgav1_cpu_detection.cmake
new file mode 100644 (file)
index 0000000..d79b83a
--- /dev/null
@@ -0,0 +1,52 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_ 1)
+
+# Detect optimizations available for the current target CPU.
+macro(libgav1_optimization_detect)
+  if(LIBGAV1_ENABLE_OPTIMIZATIONS)
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase)
+    if(cpu_lowercase MATCHES "^arm|^aarch64")
+      set(libgav1_have_neon ON)
+    elseif(cpu_lowercase MATCHES "^x86|amd64")
+      set(libgav1_have_avx2 ON)
+      set(libgav1_have_sse4 ON)
+    endif()
+  endif()
+
+  if(libgav1_have_avx2 AND LIBGAV1_ENABLE_AVX2)
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=1")
+  else()
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=0")
+    set(libgav1_have_avx2 OFF)
+  endif()
+
+  if(libgav1_have_neon AND LIBGAV1_ENABLE_NEON)
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=1")
+  else()
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=0")
+    set(libgav1_have_neon OFF)
+  endif()
+
+  if(libgav1_have_sse4 AND LIBGAV1_ENABLE_SSE4_1)
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=1")
+  else()
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=0")
+    set(libgav1_have_sse4 OFF)
+  endif()
+endmacro()
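+
+# Sketch of overriding detection at configure time (the source path is a
+# placeholder; the LIBGAV1_ENABLE_* options are consumed above):
+#   cmake path/to/libgav1 -DLIBGAV1_ENABLE_AVX2=0 -DLIBGAV1_ENABLE_SSE4_1=0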
diff --git a/cmake/libgav1_flags.cmake b/cmake/libgav1_flags.cmake
new file mode 100644 (file)
index 0000000..4f2c4fd
--- /dev/null
@@ -0,0 +1,276 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_ 1)
+
+include(CheckCXXCompilerFlag)
+include(CheckCXXSourceCompiles)
+
+# Adds compiler flags specified by FLAGS to the sources specified by SOURCES:
+#
+# libgav1_set_compiler_flags_for_sources(SOURCES <sources> FLAGS <flags>)
+macro(libgav1_set_compiler_flags_for_sources)
+  unset(compiler_SOURCES)
+  unset(compiler_FLAGS)
+  unset(optional_args)
+  unset(single_value_args)
+  set(multi_value_args SOURCES FLAGS)
+  cmake_parse_arguments(compiler "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT (compiler_SOURCES AND compiler_FLAGS))
+    libgav1_die("libgav1_set_compiler_flags_for_sources: SOURCES and "
+                "FLAGS required.")
+  endif()
+
+  set_source_files_properties(${compiler_SOURCES} PROPERTIES COMPILE_FLAGS
+                              ${compiler_FLAGS})
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    foreach(source ${compiler_SOURCES})
+      foreach(flag ${compiler_FLAGS})
+        message("libgav1_set_compiler_flags_for_sources: source:${source} "
+                "flag:${flag}")
+      endforeach()
+    endforeach()
+  endif()
+endmacro()
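+
+# Example invocation (hypothetical source file; the macro forwards the flags
+# to set_source_files_properties()):
+#   libgav1_set_compiler_flags_for_sources(SOURCES some_dsp_sse4.cc
+#                                          FLAGS -msse4.1)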
+
+# Tests compiler flags stored in list(s) specified by FLAG_LIST_VAR_NAMES, adds
+# flags to $LIBGAV1_CXX_FLAGS when tests pass. Terminates configuration if
+# FLAG_REQUIRED is specified and any flag check fails.
+#
+# ~~~
+# libgav1_test_cxx_flag(<FLAG_LIST_VAR_NAMES <flag list variable(s)>>
+#                       [FLAG_REQUIRED])
+# ~~~
+macro(libgav1_test_cxx_flag)
+  unset(cxx_test_FLAG_LIST_VAR_NAMES)
+  unset(cxx_test_FLAG_REQUIRED)
+  unset(single_value_args)
+  set(optional_args FLAG_REQUIRED)
+  set(multi_value_args FLAG_LIST_VAR_NAMES)
+  cmake_parse_arguments(cxx_test "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT cxx_test_FLAG_LIST_VAR_NAMES)
+    libgav1_die("libgav1_test_cxx_flag: FLAG_LIST_VAR_NAMES required")
+  endif()
+
+  unset(cxx_flags)
+  foreach(list_var ${cxx_test_FLAG_LIST_VAR_NAMES})
+    if(LIBGAV1_VERBOSE)
+      message("libgav1_test_cxx_flag: adding ${list_var} to cxx_flags")
+    endif()
+    list(APPEND cxx_flags ${${list_var}})
+  endforeach()
+
+  if(LIBGAV1_VERBOSE)
+    message("CXX test: all flags: ${cxx_flags}")
+  endif()
+
+  unset(all_cxx_flags)
+  list(APPEND all_cxx_flags ${LIBGAV1_CXX_FLAGS} ${cxx_flags})
+
+  # Turn off output from check_cxx_source_compiles. Print status directly
+  # instead since the logging messages from check_cxx_source_compiles can be
+  # quite confusing.
+  set(CMAKE_REQUIRED_QUIET TRUE)
+
+  # Run the actual compile test.
+  unset(libgav1_all_cxx_flags_pass CACHE)
+  message("--- Running combined CXX flags test, flags: ${all_cxx_flags}")
+  check_cxx_compiler_flag("${all_cxx_flags}" libgav1_all_cxx_flags_pass)
+
+  if(cxx_test_FLAG_REQUIRED AND NOT libgav1_all_cxx_flags_pass)
+    libgav1_die("Flag test failed for required flag(s): "
+                "${all_cxx_flags} and FLAG_REQUIRED specified.")
+  endif()
+
+  if(libgav1_all_cxx_flags_pass)
+    # Test passed: update the global flag list used by the libgav1 target
+    # creation wrappers.
+    set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+    list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+
+    if(LIBGAV1_VERBOSE)
+      message("LIBGAV1_CXX_FLAGS=${LIBGAV1_CXX_FLAGS}")
+    endif()
+
+    message("--- Passed combined CXX flags test")
+  else()
+    message("--- Failed combined CXX flags test, testing flags individually.")
+
+    if(cxx_flags)
+      message("--- Testing flags from $cxx_flags: " "${cxx_flags}")
+      foreach(cxx_flag ${cxx_flags})
+        # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal
+        # variable at parent scope while check_cxx_source_compiles() continues
+        # to set an internal cache variable, so we unset both to avoid the
+        # failure / success state persisting between checks. See
+        # https://gitlab.kitware.com/cmake/cmake/-/issues/21207.
+        unset(cxx_flag_test_passed)
+        unset(cxx_flag_test_passed CACHE)
+        message("--- Testing flag: ${cxx_flag}")
+        check_cxx_compiler_flag("${cxx_flag}" cxx_flag_test_passed)
+
+        if(cxx_flag_test_passed)
+          message("--- Passed test for ${cxx_flag}")
+        else()
+          list(REMOVE_ITEM cxx_flags ${cxx_flag})
+          message("--- Failed test for ${cxx_flag}, flag removed.")
+        endif()
+      endforeach()
+
+      set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+    endif()
+  endif()
+
+  if(LIBGAV1_CXX_FLAGS)
+    list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+  endif()
+endmacro()
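+
+# Example (hypothetical flag list): without FLAG_REQUIRED, flags that fail the
+# compile check are dropped rather than aborting configuration.
+#   set(my_cxx_flags "-Wall" "-Wbogus-flag")
+#   libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES my_cxx_flags)
+#   # $LIBGAV1_CXX_FLAGS now contains only the flags that passed.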
+
+# Tests executable linker flags stored in list specified by FLAG_LIST_VAR_NAME,
+# adds flags to $LIBGAV1_EXE_LINKER_FLAGS when test passes. Terminates
+# configuration when flag check fails. libgav1_set_cxx_flags() must be called
+# before calling this macro because it assumes $LIBGAV1_CXX_FLAGS contains only
+# valid CXX flags.
+#
+# libgav1_test_exe_linker_flag(<FLAG_LIST_VAR_NAME <flag list variable>>)
+macro(libgav1_test_exe_linker_flag)
+  unset(link_FLAG_LIST_VAR_NAME)
+  unset(optional_args)
+  unset(multi_value_args)
+  set(single_value_args FLAG_LIST_VAR_NAME)
+  cmake_parse_arguments(link "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT link_FLAG_LIST_VAR_NAME)
+    libgav1_die("libgav1_test_link_flag: FLAG_LIST_VAR_NAME required")
+  endif()
+
+  libgav1_set_and_stringify(DEST linker_flags SOURCE_VARS
+                            ${link_FLAG_LIST_VAR_NAME})
+
+  if(LIBGAV1_VERBOSE)
+    message("EXE LINKER test: all flags: ${linker_flags}")
+  endif()
+
+  # Tests of $LIBGAV1_CXX_FLAGS have already passed. Include them with the
+  # linker test.
+  libgav1_set_and_stringify(DEST CMAKE_REQUIRED_FLAGS SOURCE_VARS
+                            LIBGAV1_CXX_FLAGS)
+
+  # Cache the global exe linker flags.
+  if(CMAKE_EXE_LINKER_FLAGS)
+    set(cached_CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS})
+    libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE
+                              ${linker_flags})
+  endif()
+
+  libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE ${linker_flags}
+                            ${CMAKE_EXE_LINKER_FLAGS})
+
+  # Turn off output from check_cxx_source_compiles. Print status directly
+  # instead since the logging messages from check_cxx_source_compiles can be
+  # quite confusing.
+  set(CMAKE_REQUIRED_QUIET TRUE)
+
+  message("--- Running EXE LINKER test for flags: ${linker_flags}")
+
+  unset(linker_flag_test_passed CACHE)
+  set(libgav1_cxx_main "\nint main() { return 0; }")
+  check_cxx_source_compiles("${libgav1_cxx_main}" linker_flag_test_passed)
+
+  if(NOT linker_flag_test_passed)
+    libgav1_die("EXE LINKER test failed.")
+  endif()
+
+  message("--- Passed EXE LINKER flag test.")
+
+  # Restore cached global exe linker flags.
+  if(cached_CMAKE_EXE_LINKER_FLAGS)
+    set(CMAKE_EXE_LINKER_FLAGS ${cached_CMAKE_EXE_LINKER_FLAGS})
+  else()
+    unset(CMAKE_EXE_LINKER_FLAGS)
+  endif()
+endmacro()
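+
+# Example (hypothetical flag list; configuration terminates if the link check
+# fails):
+#   set(my_linker_flags "-Wl,--as-needed")
+#   libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME my_linker_flags)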
+
+# Runs the libgav1 compiler tests. This macro builds up the list of list var(s)
+# that is passed to libgav1_test_cxx_flag().
+#
+# Note: libgav1_set_build_definitions() must be called before this macro.
+macro(libgav1_set_cxx_flags)
+  unset(cxx_flag_lists)
+
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+    list(APPEND cxx_flag_lists libgav1_base_cxx_flags)
+  endif()
+
+  # Append clang flags after the base set to allow -Wno* overrides to take
+  # effect. Some of the base flags may enable a large set of warnings, e.g.,
+  # -Wall.
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    list(APPEND cxx_flag_lists libgav1_clang_cxx_flags)
+  endif()
+
+  if(MSVC)
+    list(APPEND cxx_flag_lists libgav1_msvc_cxx_flags)
+  endif()
+
+  if(LIBGAV1_VERBOSE)
+    if(cxx_flag_lists)
+      libgav1_set_and_stringify(DEST cxx_flags SOURCE_VARS ${cxx_flag_lists})
+      message("libgav1_set_cxx_flags: internal CXX flags: ${cxx_flags}")
+    endif()
+  endif()
+
+  if(LIBGAV1_CXX_FLAGS)
+    list(APPEND cxx_flag_lists LIBGAV1_CXX_FLAGS)
+    if(LIBGAV1_VERBOSE)
+      message("libgav1_set_cxx_flags: user CXX flags: ${LIBGAV1_CXX_FLAGS}")
+    endif()
+  endif()
+
+  libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists})
+endmacro()
+
+# Sets LIBGAV1_TEST_C_FLAGS and LIBGAV1_TEST_CXX_FLAGS.
+#
+# Note: libgav1_set_cxx_flags() must be called before this macro. Furthermore,
+# the call to this macro should be made after all additions to LIBGAV1_CXX_FLAGS
+# are complete.
+macro(libgav1_set_test_flags)
+  if(LIBGAV1_ENABLE_TESTS)
+    set(LIBGAV1_TEST_CXX_FLAGS ${LIBGAV1_CXX_FLAGS})
+    list(FILTER LIBGAV1_TEST_CXX_FLAGS EXCLUDE REGEX "-Wframe-larger-than")
+
+    if(NOT CMAKE_CXX_COMPILER_ID STREQUAL CMAKE_C_COMPILER_ID)
+      message(
+        FATAL_ERROR
+          "C/CXX compiler mismatch (${CMAKE_C_COMPILER_ID} vs"
+          " ${CMAKE_CXX_COMPILER_ID})! Compiler flags are only tested using"
+          " CMAKE_CXX_COMPILER, rerun cmake with CMAKE_C_COMPILER set to the"
+          " C compiler from the same package as CMAKE_CXX_COMPILER to ensure"
+          " the build completes successfully.")
+    endif()
+    set(LIBGAV1_TEST_C_FLAGS ${LIBGAV1_TEST_CXX_FLAGS})
+    list(FILTER LIBGAV1_TEST_C_FLAGS EXCLUDE REGEX
+         "-fvisibility-inlines-hidden")
+  endif()
+endmacro()
diff --git a/cmake/libgav1_helpers.cmake b/cmake/libgav1_helpers.cmake
new file mode 100644 (file)
index 0000000..ac16257
--- /dev/null
@@ -0,0 +1,140 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ 1)
+
+# Halts build generation using message(FATAL_ERROR), printing all arguments
+# passed via $ARGN to the console.
+macro(libgav1_die)
+  # macro parameters are not variables so a temporary is needed to work with
+  # list().
+  set(msg ${ARGN})
+  # message(${ARGN}) will merge all list elements with no separator while
+  # "${ARGN}" will output the list as a ';' delimited string.
+  list(JOIN msg " " msg)
+  message(FATAL_ERROR "${msg}")
+endmacro()
+
+# Converts semi-colon delimited list variable(s) to string. Output is written to
+# variable supplied via the DEST parameter. Input is from an expanded variable
+# referenced by SOURCE and/or variable(s) referenced by SOURCE_VARS.
+macro(libgav1_set_and_stringify)
+  set(optional_args)
+  set(single_value_args DEST SOURCE_VAR)
+  set(multi_value_args SOURCE SOURCE_VARS)
+  cmake_parse_arguments(sas "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT sas_DEST OR NOT (sas_SOURCE OR sas_SOURCE_VARS))
+    libgav1_die("libgav1_set_and_stringify: DEST and at least one of SOURCE "
+                "SOURCE_VARS required.")
+  endif()
+
+  unset(${sas_DEST})
+
+  if(sas_SOURCE)
+    # $sas_SOURCE is one or more expanded variables, just copy the values to
+    # $sas_DEST.
+    set(${sas_DEST} "${sas_SOURCE}")
+  endif()
+
+  if(sas_SOURCE_VARS)
+    # $sas_SOURCE_VARS is one or more variable names. Each iteration expands a
+    # variable and appends it to $sas_DEST.
+    foreach(source_var ${sas_SOURCE_VARS})
+      set(${sas_DEST} "${${sas_DEST}} ${${source_var}}")
+    endforeach()
+
+    # Because $sas_DEST can be empty when entering this scope, leading
+    # whitespace can be introduced to $sas_DEST on the first iteration of the
+    # above loop. Remove it:
+    string(STRIP "${${sas_DEST}}" ${sas_DEST})
+  endif()
+
+  # Lists in CMake are simply semicolon delimited strings, so stringification is
+  # just a find and replace of the semicolon.
+  string(REPLACE ";" " " ${sas_DEST} "${${sas_DEST}}")
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("libgav1_set_and_stringify: ${sas_DEST}=${${sas_DEST}}")
+  endif()
+endmacro()
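+
+# Example (hypothetical variables) showing both input forms:
+#   set(a "x;y")
+#   set(b "z")
+#   libgav1_set_and_stringify(DEST out SOURCE_VARS a b)  # out == "x y z"
+#   libgav1_set_and_stringify(DEST out SOURCE "x;y;z")   # out == "x y z"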
+
+# Creates a dummy source file in $LIBGAV1_GENERATED_SOURCES_DIRECTORY and adds
+# it to the specified target. Optionally adds its path to a list variable.
+#
+# libgav1_create_dummy_source_file(<TARGET <target> BASENAME <basename of file>>
+# [LISTVAR <list variable>])
+macro(libgav1_create_dummy_source_file)
+  set(optional_args)
+  set(single_value_args TARGET BASENAME LISTVAR)
+  set(multi_value_args)
+  cmake_parse_arguments(cdsf "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT cdsf_TARGET OR NOT cdsf_BASENAME)
+    libgav1_die(
+      "libgav1_create_dummy_source_file: TARGET and BASENAME required.")
+  endif()
+
+  if(NOT LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+    set(LIBGAV1_GENERATED_SOURCES_DIRECTORY "${libgav1_build}/gen_src")
+  endif()
+
+  set(dummy_source_dir "${LIBGAV1_GENERATED_SOURCES_DIRECTORY}")
+  set(dummy_source_file
+      "${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc")
+  set(dummy_source_code
+      "// Generated file. DO NOT EDIT!\n"
+      "// C++ source file created for target ${cdsf_TARGET}.\n"
+      "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void)\;\n"
+      "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n")
+  file(WRITE "${dummy_source_file}" ${dummy_source_code})
+
+  target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file})
+
+  if(cdsf_LISTVAR)
+    list(APPEND ${cdsf_LISTVAR} "${dummy_source_file}")
+  endif()
+endmacro()
+
+# Loads the version components from $libgav1_source/gav1/version.h and sets the
+# corresponding CMake variables:
+# - LIBGAV1_MAJOR_VERSION
+# - LIBGAV1_MINOR_VERSION
+# - LIBGAV1_PATCH_VERSION
+# - LIBGAV1_VERSION, which is:
+#   - $LIBGAV1_MAJOR_VERSION.$LIBGAV1_MINOR_VERSION.$LIBGAV1_PATCH_VERSION
+macro(libgav1_load_version_info)
+  file(STRINGS "${libgav1_source}/gav1/version.h" version_file_strings)
+  foreach(str ${version_file_strings})
+    if(str MATCHES "#define LIBGAV1_")
+      if(str MATCHES "#define LIBGAV1_MAJOR_VERSION ")
+        string(REPLACE "#define LIBGAV1_MAJOR_VERSION " "" LIBGAV1_MAJOR_VERSION
+                       "${str}")
+      elseif(str MATCHES "#define LIBGAV1_MINOR_VERSION ")
+        string(REPLACE "#define LIBGAV1_MINOR_VERSION " "" LIBGAV1_MINOR_VERSION
+                       "${str}")
+      elseif(str MATCHES "#define LIBGAV1_PATCH_VERSION ")
+        string(REPLACE "#define LIBGAV1_PATCH_VERSION " "" LIBGAV1_PATCH_VERSION
+                       "${str}")
+      endif()
+    endif()
+  endforeach()
+  set(LIBGAV1_VERSION "${LIBGAV1_MAJOR_VERSION}.${LIBGAV1_MINOR_VERSION}")
+  set(LIBGAV1_VERSION "${LIBGAV1_VERSION}.${LIBGAV1_PATCH_VERSION}")
+endmacro()
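+
+# For example (hypothetical version.h contents), the lines
+#   #define LIBGAV1_MAJOR_VERSION 1
+#   #define LIBGAV1_MINOR_VERSION 2
+#   #define LIBGAV1_PATCH_VERSION 3
+# produce LIBGAV1_VERSION "1.2.3".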
diff --git a/cmake/libgav1_install.cmake b/cmake/libgav1_install.cmake
new file mode 100644 (file)
index 0000000..e2c79b9
--- /dev/null
@@ -0,0 +1,62 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_ 1)
+
+# Sets up the libgav1 install targets. Must be called after the static library
+# target is created.
+macro(libgav1_setup_install_target)
+  if(NOT (MSVC OR XCODE))
+    include(GNUInstallDirs)
+
+    # pkg-config: libgav1.pc
+    set(prefix "${CMAKE_INSTALL_PREFIX}")
+    set(exec_prefix "\${prefix}")
+    set(libdir "\${prefix}/${CMAKE_INSTALL_LIBDIR}")
+    set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+    set(libgav1_lib_name "libgav1")
+
+    configure_file("${libgav1_root}/cmake/libgav1.pc.template"
+                   "${libgav1_build}/libgav1.pc" @ONLY NEWLINE_STYLE UNIX)
+    install(FILES "${libgav1_build}/libgav1.pc"
+            DESTINATION "${prefix}/${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+    # CMake config: libgav1-config.cmake
+    set(LIBGAV1_INCLUDE_DIRS "${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+    configure_file("${libgav1_root}/cmake/libgav1-config.cmake.template"
+                   "${libgav1_build}/libgav1-config.cmake" @ONLY
+                   NEWLINE_STYLE UNIX)
+    install(
+      FILES "${libgav1_build}/libgav1-config.cmake"
+      DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_DATAROOTDIR}/cmake")
+
+    install(
+      FILES ${libgav1_api_includes}
+      DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1")
+
+    if(LIBGAV1_ENABLE_EXAMPLES)
+      install(TARGETS gav1_decode DESTINATION
+                      "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
+    endif()
+    install(TARGETS libgav1_static DESTINATION
+                    "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+    if(BUILD_SHARED_LIBS)
+      install(TARGETS libgav1_shared DESTINATION
+                      "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+    endif()
+  endif()
+endmacro()
diff --git a/cmake/libgav1_intrinsics.cmake b/cmake/libgav1_intrinsics.cmake
new file mode 100644 (file)
index 0000000..a2e9ddb
--- /dev/null
@@ -0,0 +1,135 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_ 1)
+
+# Returns the compiler flag for the SIMD intrinsics suffix specified by the
+# SUFFIX argument via the variable specified by the VARIABLE argument:
+# libgav1_get_intrinsics_flag_for_suffix(SUFFIX <suffix> VARIABLE <var name>)
+macro(libgav1_get_intrinsics_flag_for_suffix)
+  unset(intrinsics_SUFFIX)
+  unset(intrinsics_VARIABLE)
+  unset(optional_args)
+  unset(multi_value_args)
+  set(single_value_args SUFFIX VARIABLE)
+  cmake_parse_arguments(intrinsics "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT (intrinsics_SUFFIX AND intrinsics_VARIABLE))
+    message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: SUFFIX and "
+                        "VARIABLE required.")
+  endif()
+
+  if(intrinsics_SUFFIX MATCHES "neon")
+    if(NOT MSVC)
+      set(${intrinsics_VARIABLE} "${LIBGAV1_NEON_INTRINSICS_FLAG}")
+    endif()
+  elseif(intrinsics_SUFFIX MATCHES "avx2")
+    if(MSVC)
+      set(${intrinsics_VARIABLE} "/arch:AVX2")
+    else()
+      set(${intrinsics_VARIABLE} "-mavx2")
+    endif()
+  elseif(intrinsics_SUFFIX MATCHES "sse4")
+    if(NOT MSVC)
+      set(${intrinsics_VARIABLE} "-msse4.1")
+    endif()
+  else()
+    message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: Unknown "
+                        "instrinics suffix: ${intrinsics_SUFFIX}")
+  endif()
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("libgav1_get_intrinsics_flag_for_suffix: "
+            "suffix:${intrinsics_SUFFIX} flag:${${intrinsics_VARIABLE}}")
+  endif()
+endmacro()
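+
+# Example (the suffix values are defined in libgav1_build_definitions.cmake):
+#   libgav1_get_intrinsics_flag_for_suffix(SUFFIX "sse4(_test)?.cc"
+#                                          VARIABLE sse4_flags)
+#   # sse4_flags is "-msse4.1" for non-MSVC compilers (unset for MSVC).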
+
+# Processes source files specified by SOURCES and adds intrinsics flags as
+# necessary: libgav1_process_intrinsics_sources(SOURCES <sources>)
+#
+# Detects the requirement for intrinsics flags using the source file name
+# suffix. Currently supports AVX2, SSE4.1 and NEON.
+macro(libgav1_process_intrinsics_sources)
+  unset(arg_TARGET)
+  unset(arg_SOURCES)
+  unset(optional_args)
+  set(single_value_args TARGET)
+  set(multi_value_args SOURCES)
+  cmake_parse_arguments(arg "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+  if(NOT (arg_TARGET AND arg_SOURCES))
+    message(FATAL_ERROR "libgav1_process_intrinsics_sources: TARGET and "
+                        "SOURCES required.")
+  endif()
+
+  if(LIBGAV1_ENABLE_AVX2 AND libgav1_have_avx2)
+    unset(avx2_sources)
+    list(APPEND avx2_sources ${arg_SOURCES})
+
+    list(FILTER avx2_sources INCLUDE REGEX
+         "${libgav1_avx2_source_file_suffix}$")
+
+    if(avx2_sources)
+      unset(avx2_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_avx2_source_file_suffix}
+                                             VARIABLE avx2_flags)
+      if(avx2_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${avx2_sources} FLAGS
+                                               ${avx2_flags})
+      endif()
+    endif()
+  endif()
+
+  if(LIBGAV1_ENABLE_SSE4_1 AND libgav1_have_sse4)
+    unset(sse4_sources)
+    list(APPEND sse4_sources ${arg_SOURCES})
+
+    list(FILTER sse4_sources INCLUDE REGEX
+         "${libgav1_sse4_source_file_suffix}$")
+
+    if(sse4_sources)
+      unset(sse4_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_sse4_source_file_suffix}
+                                             VARIABLE sse4_flags)
+      if(sse4_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${sse4_sources} FLAGS
+                                               ${sse4_flags})
+      endif()
+    endif()
+  endif()
+
+  if(LIBGAV1_ENABLE_NEON AND libgav1_have_neon)
+    unset(neon_sources)
+    list(APPEND neon_sources ${arg_SOURCES})
+    list(FILTER neon_sources INCLUDE REGEX
+         "${libgav1_neon_source_file_suffix}$")
+
+    if(neon_sources AND LIBGAV1_NEON_INTRINSICS_FLAG)
+      unset(neon_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_neon_source_file_suffix}
+                                             VARIABLE neon_flags)
+      if(neon_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${neon_sources} FLAGS
+                                               ${neon_flags})
+      endif()
+    endif()
+  endif()
+endmacro()
diff --git a/cmake/libgav1_options.cmake b/cmake/libgav1_options.cmake
new file mode 100644 (file)
index 0000000..6327bee
--- /dev/null
@@ -0,0 +1,55 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_ 1)
+
+# Simple wrapper for CMake's builtin option command that tracks libgav1's build
+# options in the list variable $libgav1_options.
+macro(libgav1_option)
+  unset(option_NAME)
+  unset(option_HELPSTRING)
+  unset(option_VALUE)
+  unset(optional_args)
+  unset(multi_value_args)
+  set(single_value_args NAME HELPSTRING VALUE)
+  cmake_parse_arguments(option "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT (option_NAME AND option_HELPSTRING AND DEFINED option_VALUE))
+    message(FATAL_ERROR "libgav1_option: NAME HELPSTRING and VALUE required.")
+  endif()
+
+  option(${option_NAME} ${option_HELPSTRING} ${option_VALUE})
+
+  if(LIBGAV1_VERBOSE GREATER 2)
+    message("--------- libgav1_option ---------\n"
+            "option_NAME=${option_NAME}\n"
+            "option_HELPSTRING=${option_HELPSTRING}\n"
+            "option_VALUE=${option_VALUE}\n"
+            "------------------------------------------\n")
+  endif()
+
+  list(APPEND libgav1_options ${option_NAME})
+  list(REMOVE_DUPLICATES libgav1_options)
+endmacro()
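+
+# Example invocation (a hypothetical option):
+#   libgav1_option(NAME LIBGAV1_ENABLE_FOO HELPSTRING "Enables foo." VALUE ON)
+# This is equivalent to option(LIBGAV1_ENABLE_FOO "Enables foo." ON), plus
+# tracking of the name in $libgav1_options for libgav1_dump_options().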
+
+# Dumps the $libgav1_options list via CMake's message command.
+macro(libgav1_dump_options)
+  foreach(option_name ${libgav1_options})
+    message("${option_name}: ${${option_name}}")
+  endforeach()
+endmacro()
diff --git a/cmake/libgav1_sanitizer.cmake b/cmake/libgav1_sanitizer.cmake
new file mode 100644 (file)
index 0000000..2f9ee07
--- /dev/null
@@ -0,0 +1,47 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_ 1)
+
+macro(libgav1_configure_sanitizer)
+  if(LIBGAV1_SANITIZE AND NOT MSVC)
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      if(LIBGAV1_SANITIZE MATCHES "cfi")
+        list(APPEND LIBGAV1_CXX_FLAGS "-flto" "-fno-sanitize-trap=cfi")
+        list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-flto" "-fno-sanitize-trap=cfi"
+                    "-fuse-ld=gold")
+      endif()
+
+      if(${CMAKE_SIZEOF_VOID_P} EQUAL 4
+         AND LIBGAV1_SANITIZE MATCHES "integer|undefined")
+        list(APPEND LIBGAV1_EXE_LINKER_FLAGS "--rtlib=compiler-rt" "-lgcc_s")
+      endif()
+    endif()
+
+    list(APPEND LIBGAV1_CXX_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+    list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+
+    # Make sanitizer callstacks accurate.
+    list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer"
+                "-fno-optimize-sibling-calls")
+
+    # Check the linker flags first as they may be required in the compile check
+    # to avoid undefined symbols related to the sanitizer.
+    libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS)
+    libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
+  endif()
+endmacro()
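+
+# Configure-time usage sketch (the source path is a placeholder; the value is
+# passed through to -fsanitize above):
+#   cmake path/to/libgav1 -DLIBGAV1_SANITIZE=address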
diff --git a/cmake/libgav1_targets.cmake b/cmake/libgav1_targets.cmake
new file mode 100644 (file)
index 0000000..f8326a9
--- /dev/null
@@ -0,0 +1,397 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_
+set(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_ 1)
+
+if(LIBGAV1_IDE_FOLDER)
+  set(LIBGAV1_EXAMPLES_IDE_FOLDER "${LIBGAV1_IDE_FOLDER}/examples")
+  set(LIBGAV1_TESTS_IDE_FOLDER "${LIBGAV1_IDE_FOLDER}/tests")
+else()
+  set(LIBGAV1_EXAMPLES_IDE_FOLDER "libgav1_examples")
+  set(LIBGAV1_TESTS_IDE_FOLDER "libgav1_tests")
+endif()
+
+# Resets list variables used to track libgav1 targets.
+macro(libgav1_reset_target_lists)
+  unset(libgav1_targets)
+  unset(libgav1_exe_targets)
+  unset(libgav1_lib_targets)
+  unset(libgav1_objlib_targets)
+  unset(libgav1_sources)
+  unset(libgav1_test_targets)
+endmacro()
+
+# Creates an executable target. The target name is passed as a parameter to the
+# NAME argument, and the sources passed as a parameter to the SOURCES argument:
+# libgav1_add_executable(NAME <name> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+#   - OUTPUT_NAME: Override output file basename. Target basename defaults to
+#     NAME.
+#   - TEST: Flag. Presence means treat executable as a test.
+#   - DEFINES: List of preprocessor macro definitions.
+#   - INCLUDES: list of include directories for the target.
+#   - COMPILE_FLAGS: list of compiler flags for the target.
+#   - LINK_FLAGS: List of linker flags for the target.
+#   - OBJLIB_DEPS: List of CMake object library target dependencies.
+#   - LIB_DEPS: List of CMake library dependencies.
+# cmake-format: on
+#
+# Sources passed to this macro are added to $libgav1_test_sources when TEST is
+# specified. Otherwise sources are added to $libgav1_sources.
+#
+# Targets passed to this macro are always added to $libgav1_targets. When TEST
+# is specified, targets are also added to the list $libgav1_test_targets.
+# Otherwise targets are added to $libgav1_exe_targets.
+macro(libgav1_add_executable)
+  unset(exe_TEST)
+  unset(exe_TEST_DEFINES_MAIN)
+  unset(exe_NAME)
+  unset(exe_OUTPUT_NAME)
+  unset(exe_SOURCES)
+  unset(exe_DEFINES)
+  unset(exe_INCLUDES)
+  unset(exe_COMPILE_FLAGS)
+  unset(exe_LINK_FLAGS)
+  unset(exe_OBJLIB_DEPS)
+  unset(exe_LIB_DEPS)
+  set(optional_args TEST)
+  set(single_value_args NAME OUTPUT_NAME)
+  set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+                       OBJLIB_DEPS LIB_DEPS)
+
+  cmake_parse_arguments(exe "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("--------- libgav1_add_executable ---------\n"
+            "exe_TEST=${exe_TEST}\n"
+            "exe_TEST_DEFINES_MAIN=${exe_TEST_DEFINES_MAIN}\n"
+            "exe_NAME=${exe_NAME}\n"
+            "exe_OUTPUT_NAME=${exe_OUTPUT_NAME}\n"
+            "exe_SOURCES=${exe_SOURCES}\n"
+            "exe_DEFINES=${exe_DEFINES}\n"
+            "exe_INCLUDES=${exe_INCLUDES}\n"
+            "exe_COMPILE_FLAGS=${exe_COMPILE_FLAGS}\n"
+            "exe_LINK_FLAGS=${exe_LINK_FLAGS}\n"
+            "exe_OBJLIB_DEPS=${exe_OBJLIB_DEPS}\n"
+            "exe_LIB_DEPS=${exe_LIB_DEPS}\n"
+            "------------------------------------------\n")
+  endif()
+
+  if(NOT (exe_NAME AND exe_SOURCES))
+    message(FATAL_ERROR "libgav1_add_executable: NAME and SOURCES required.")
+  endif()
+
+  list(APPEND libgav1_targets ${exe_NAME})
+  if(exe_TEST)
+    list(APPEND libgav1_test_targets ${exe_NAME})
+    list(APPEND libgav1_test_sources ${exe_SOURCES})
+  else()
+    list(APPEND libgav1_exe_targets ${exe_NAME})
+    list(APPEND libgav1_sources ${exe_SOURCES})
+  endif()
+
+  add_executable(${exe_NAME} ${exe_SOURCES})
+  if(exe_TEST)
+    add_test(NAME ${exe_NAME} COMMAND ${exe_NAME})
+    set_property(TARGET ${exe_NAME} PROPERTY FOLDER ${LIBGAV1_TESTS_IDE_FOLDER})
+  else()
+    set_property(TARGET ${exe_NAME}
+                 PROPERTY FOLDER ${LIBGAV1_EXAMPLES_IDE_FOLDER})
+  endif()
+
+  if(exe_OUTPUT_NAME)
+    set_target_properties(${exe_NAME} PROPERTIES OUTPUT_NAME ${exe_OUTPUT_NAME})
+  endif()
+
+  libgav1_process_intrinsics_sources(TARGET ${exe_NAME} SOURCES ${exe_SOURCES})
+
+  if(exe_DEFINES)
+    target_compile_definitions(${exe_NAME} PRIVATE ${exe_DEFINES})
+  endif()
+
+  if(exe_INCLUDES)
+    target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES})
+  endif()
+
+  unset(exe_LIBGAV1_COMPILE_FLAGS)
+  if(exe_TEST)
+    list(FILTER exe_SOURCES INCLUDE REGEX "\\.c$")
+    list(LENGTH exe_SOURCES exe_SOURCES_length)
+    if(exe_SOURCES_length EQUAL 0)
+      set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_CXX_FLAGS})
+    else()
+      set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_C_FLAGS})
+    endif()
+  else()
+    set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_CXX_FLAGS})
+  endif()
+
+  if(exe_COMPILE_FLAGS OR exe_LIBGAV1_COMPILE_FLAGS)
+    target_compile_options(${exe_NAME}
+                           PRIVATE ${exe_COMPILE_FLAGS}
+                                   ${exe_LIBGAV1_COMPILE_FLAGS})
+  endif()
+
+  if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS)
+    list(APPEND exe_LINK_FLAGS "${LIBGAV1_EXE_LINKER_FLAGS}")
+    if(${CMAKE_VERSION} VERSION_LESS "3.13")
+      # LINK_FLAGS is managed as a string.
+      libgav1_set_and_stringify(SOURCE "${exe_LINK_FLAGS}" DEST exe_LINK_FLAGS)
+      set_target_properties(${exe_NAME}
+                            PROPERTIES LINK_FLAGS "${exe_LINK_FLAGS}")
+    else()
+      target_link_options(${exe_NAME} PRIVATE ${exe_LINK_FLAGS})
+    endif()
+  endif()
+
+  if(exe_OBJLIB_DEPS)
+    foreach(objlib_dep ${exe_OBJLIB_DEPS})
+      target_sources(${exe_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+    endforeach()
+  endif()
+
+  if(CMAKE_THREAD_LIBS_INIT)
+    list(APPEND exe_LIB_DEPS ${CMAKE_THREAD_LIBS_INIT})
+  endif()
+
+  if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+    target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+  endif()
+
+  if(exe_LIB_DEPS)
+    unset(exe_static)
+    if("${CMAKE_EXE_LINKER_FLAGS} ${LIBGAV1_EXE_LINKER_FLAGS}" MATCHES "static")
+      set(exe_static ON)
+    endif()
+
+    if(exe_static AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+      # Third party dependencies can introduce dependencies on system and test
+      # libraries. Since the target created here is an executable, and CMake
+      # does not provide a method of controlling order of link dependencies,
+      # wrap all of the dependencies of this target in start/end group flags to
+      # ensure that dependencies of third party targets can be resolved when
+      # those dependencies happen to be resolved by dependencies of the current
+      # target.
+      list(INSERT exe_LIB_DEPS 0 -Wl,--start-group)
+      list(APPEND exe_LIB_DEPS -Wl,--end-group)
+    endif()
+    target_link_libraries(${exe_NAME} PRIVATE ${exe_LIB_DEPS})
+  endif()
+endmacro()
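+
+# Example invocation (a sketch; gav1_decode and $libgav1_dependency are
+# defined elsewhere in this tree):
+#   libgav1_add_executable(NAME gav1_decode SOURCES gav1_decode.cc
+#                          LIB_DEPS ${libgav1_dependency})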
+
+# Creates a library target of the specified type. The target name is passed as a
+# parameter to the NAME argument, the type as a parameter to the TYPE argument,
+# and the sources passed as a parameter to the SOURCES argument:
+# libgav1_add_library(NAME <name> TYPE <type> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+#   - OUTPUT_NAME: Override output file basename. Target basename defaults to
+#     NAME. OUTPUT_NAME is ignored when BUILD_SHARED_LIBS is enabled and CMake
+#     is generating a build for which MSVC or WIN32 are true. This is to avoid
+#     output basename collisions with DLL import libraries.
+#   - TEST: Flag. Presence means treat library as a test.
+#   - DEFINES: List of preprocessor macro definitions.
+#   - INCLUDES: list of include directories for the target.
+#   - COMPILE_FLAGS: list of compiler flags for the target.
+#   - LINK_FLAGS: List of linker flags for the target.
+#   - OBJLIB_DEPS: List of CMake object library target dependencies.
+#   - LIB_DEPS: List of CMake library dependencies.
+#   - PUBLIC_INCLUDES: List of include paths to export to dependents.
+# cmake-format: on
+#
+# Sources passed to the macro are added to the lists tracking libgav1 sources:
+# cmake-format: off
+#   - When TEST is specified sources are added to $libgav1_test_sources.
+#   - Otherwise sources are added to $libgav1_sources.
+# cmake-format: on
+#
+# Targets passed to this macro are added to the lists tracking libgav1 targets:
+# cmake-format: off
+#   - Targets are always added to $libgav1_targets.
+#   - When the TEST flag is specified, targets are added to
+#     $libgav1_test_targets.
+#   - When TEST is not specified:
+#     - Libraries of type SHARED are added to $libgav1_dylib_targets.
+#     - Libraries of type OBJECT are added to $libgav1_objlib_targets.
+#     - Libraries of type STATIC are added to $libgav1_lib_targets.
+# cmake-format: on
+macro(libgav1_add_library)
+  unset(lib_TEST)
+  unset(lib_NAME)
+  unset(lib_OUTPUT_NAME)
+  unset(lib_TYPE)
+  unset(lib_SOURCES)
+  unset(lib_DEFINES)
+  unset(lib_INCLUDES)
+  unset(lib_COMPILE_FLAGS)
+  unset(lib_LINK_FLAGS)
+  unset(lib_OBJLIB_DEPS)
+  unset(lib_LIB_DEPS)
+  unset(lib_PUBLIC_INCLUDES)
+  set(optional_args TEST)
+  set(single_value_args NAME OUTPUT_NAME TYPE)
+  set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+                       OBJLIB_DEPS LIB_DEPS PUBLIC_INCLUDES)
+
+  cmake_parse_arguments(lib "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("--------- libgav1_add_library ---------\n"
+            "lib_TEST=${lib_TEST}\n"
+            "lib_NAME=${lib_NAME}\n"
+            "lib_OUTPUT_NAME=${lib_OUTPUT_NAME}\n"
+            "lib_TYPE=${lib_TYPE}\n"
+            "lib_SOURCES=${lib_SOURCES}\n"
+            "lib_DEFINES=${lib_DEFINES}\n"
+            "lib_INCLUDES=${lib_INCLUDES}\n"
+            "lib_COMPILE_FLAGS=${lib_COMPILE_FLAGS}\n"
+            "lib_LINK_FLAGS=${lib_LINK_FLAGS}\n"
+            "lib_OBJLIB_DEPS=${lib_OBJLIB_DEPS}\n"
+            "lib_LIB_DEPS=${lib_LIB_DEPS}\n"
+            "lib_PUBLIC_INCLUDES=${lib_PUBLIC_INCLUDES}\n"
+            "---------------------------------------\n")
+  endif()
+
+  if(NOT (lib_NAME AND lib_TYPE AND lib_SOURCES))
+    message(FATAL_ERROR "libgav1_add_library: NAME, TYPE and SOURCES required.")
+  endif()
+
+  list(APPEND libgav1_targets ${lib_NAME})
+  if(lib_TEST)
+    list(APPEND libgav1_test_targets ${lib_NAME})
+    list(APPEND libgav1_test_sources ${lib_SOURCES})
+  else()
+    list(APPEND libgav1_sources ${lib_SOURCES})
+    if(lib_TYPE STREQUAL OBJECT)
+      list(APPEND libgav1_objlib_targets ${lib_NAME})
+    elseif(lib_TYPE STREQUAL SHARED)
+      list(APPEND libgav1_dylib_targets ${lib_NAME})
+    elseif(lib_TYPE STREQUAL STATIC)
+      list(APPEND libgav1_lib_targets ${lib_NAME})
+    else()
+      message(WARNING "libgav1_add_library: Unhandled type: ${lib_TYPE}")
+    endif()
+  endif()
+
+  add_library(${lib_NAME} ${lib_TYPE} ${lib_SOURCES})
+  libgav1_process_intrinsics_sources(TARGET ${lib_NAME} SOURCES ${lib_SOURCES})
+
+  if(lib_OUTPUT_NAME)
+    if(NOT (BUILD_SHARED_LIBS AND (MSVC OR WIN32)))
+      set_target_properties(${lib_NAME}
+                            PROPERTIES OUTPUT_NAME ${lib_OUTPUT_NAME})
+    endif()
+  endif()
+
+  if(lib_DEFINES)
+    target_compile_definitions(${lib_NAME} PRIVATE ${lib_DEFINES})
+  endif()
+
+  if(lib_INCLUDES)
+    target_include_directories(${lib_NAME} PRIVATE ${lib_INCLUDES})
+  endif()
+
+  if(lib_PUBLIC_INCLUDES)
+    target_include_directories(${lib_NAME} PUBLIC ${lib_PUBLIC_INCLUDES})
+  endif()
+
+  if(lib_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+    target_compile_options(${lib_NAME}
+                           PRIVATE ${lib_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+  endif()
+
+  if(lib_LINK_FLAGS)
+    set_target_properties(${lib_NAME} PROPERTIES LINK_FLAGS ${lib_LINK_FLAGS})
+  endif()
+
+  if(lib_OBJLIB_DEPS)
+    foreach(objlib_dep ${lib_OBJLIB_DEPS})
+      target_sources(${lib_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+    endforeach()
+  endif()
+
+  if(lib_LIB_DEPS)
+    if(lib_TYPE STREQUAL STATIC)
+      set(link_type PUBLIC)
+    else()
+      set(link_type PRIVATE)
+      if(lib_TYPE STREQUAL SHARED AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+        # The libgav1 shared object uses the static libgav1 as input to turn it
+        # into a shared object. Include everything from the static library in
+        # the shared object.
+        if(APPLE)
+          list(INSERT lib_LIB_DEPS 0 -Wl,-force_load)
+        else()
+          list(INSERT lib_LIB_DEPS 0 -Wl,--whole-archive)
+          list(APPEND lib_LIB_DEPS -Wl,--no-whole-archive)
+        endif()
+      endif()
+    endif()
+    target_link_libraries(${lib_NAME} ${link_type} ${lib_LIB_DEPS})
+  endif()
+
+  if(NOT MSVC AND lib_NAME MATCHES "^lib")
+    # Non-MSVC generators prepend lib to static lib target file names. Libgav1
+    # already includes lib in its name. Avoid naming output files liblib*.
+    set_target_properties(${lib_NAME} PROPERTIES PREFIX "")
+  endif()
+
+  if(lib_TYPE STREQUAL SHARED AND NOT MSVC)
+    set_target_properties(${lib_NAME}
+                          PROPERTIES VERSION ${LIBGAV1_SOVERSION} SOVERSION
+                                     ${LIBGAV1_SOVERSION_MAJOR})
+  endif()
+
+  if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+    if(lib_TYPE STREQUAL SHARED)
+      target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=1")
+    else()
+      target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+    endif()
+  endif()
+
+  # Determine if $lib_NAME is a header only target.
+  set(sources_list ${lib_SOURCES})
+  list(FILTER sources_list INCLUDE REGEX cc$)
+  if(NOT sources_list)
+    if(NOT XCODE)
+      # This is a header only target. Tell CMake the link language.
+      set_target_properties(${lib_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+    else()
+      # The Xcode generator ignores LINKER_LANGUAGE. Add a dummy cc file.
+      libgav1_create_dummy_source_file(TARGET ${lib_NAME} BASENAME ${lib_NAME})
+    endif()
+  endif()
+
+  if(lib_TEST)
+    set_property(TARGET ${lib_NAME} PROPERTY FOLDER ${LIBGAV1_TESTS_IDE_FOLDER})
+  else()
+    set(sources_list ${lib_SOURCES})
+    list(FILTER sources_list INCLUDE REGEX examples)
+    if(sources_list)
+      set_property(TARGET ${lib_NAME}
+                   PROPERTY FOLDER ${LIBGAV1_EXAMPLES_IDE_FOLDER})
+    else()
+      set_property(TARGET ${lib_NAME} PROPERTY FOLDER ${LIBGAV1_IDE_FOLDER})
+    endif()
+  endif()
+endmacro()
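+
+# Example invocation (a sketch with hypothetical target and sources):
+#   libgav1_add_library(NAME libgav1_utils TYPE OBJECT
+#                       SOURCES utils.cc utils.h
+#                       DEFINES ${libgav1_defines}
+#                       INCLUDES ${libgav1_include_paths})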
diff --git a/cmake/libgav1_variables.cmake b/cmake/libgav1_variables.cmake
new file mode 100644 (file)
index 0000000..0dd0f37
--- /dev/null
@@ -0,0 +1,78 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_ 1)
+
+# Halts generation when $variable_name does not refer to a directory that
+# exists.
+macro(libgav1_variable_must_be_directory variable_name)
+  if("${variable_name}" STREQUAL "")
+    message(
+      FATAL_ERROR
+        "Empty variable_name passed to libgav1_variable_must_be_directory.")
+  endif()
+
+  if("${${variable_name}}" STREQUAL "")
+    message(
+      FATAL_ERROR
+        "Empty variable ${variable_name} is required to build libgav1.")
+  endif()
+
+  if(NOT IS_DIRECTORY "${${variable_name}}")
+    message(
+      FATAL_ERROR
+        "${variable_name}, which is ${${variable_name}}, does not refer to a\n"
+        "directory.")
+  endif()
+endmacro()
+
+# Adds $var_name to the tracked variables list.
+macro(libgav1_track_configuration_variable var_name)
+  if(LIBGAV1_VERBOSE GREATER 2)
+    message("---- libgav1_track_configuration_variable ----\n"
+            "var_name=${var_name}\n"
+            "----------------------------------------------\n")
+  endif()
+
+  list(APPEND libgav1_configuration_variables ${var_name})
+  list(REMOVE_DUPLICATES libgav1_configuration_variables)
+endmacro()
+
+# Logs current C++ and executable linker flags via CMake's message command.
+macro(libgav1_dump_cmake_flag_variables)
+  unset(flag_variables)
+  list(APPEND flag_variables "CMAKE_CXX_FLAGS_INIT" "CMAKE_CXX_FLAGS"
+              "CMAKE_EXE_LINKER_FLAGS_INIT" "CMAKE_EXE_LINKER_FLAGS")
+  if(CMAKE_BUILD_TYPE)
+    list(APPEND flag_variables "CMAKE_BUILD_TYPE"
+                "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+                "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}"
+                "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+                "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}")
+  endif()
+  foreach(flag_variable ${flag_variables})
+    message("${flag_variable}:${${flag_variable}}")
+  endforeach()
+endmacro()
+
+# Dumps the variables tracked in $libgav1_configuration_variables via CMake's
+# message command.
+macro(libgav1_dump_tracked_configuration_variables)
+  foreach(config_variable ${libgav1_configuration_variables})
+    message("${config_variable}:${${config_variable}}")
+  endforeach()
+endmacro()
diff --git a/cmake/toolchains/aarch64-linux-gnu.cmake b/cmake/toolchains/aarch64-linux-gnu.cmake
new file mode 100644 (file)
index 0000000..fdcb012
--- /dev/null
@@ -0,0 +1,35 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+  set(CROSS aarch64-linux-gnu-)
+endif()
+
+# For c_decoder_test.c and c_version_test.c.
+if(NOT CMAKE_C_COMPILER)
+  set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+set(CMAKE_C_FLAGS_INIT "-march=armv8-a")
+if(NOT CMAKE_CXX_COMPILER)
+  set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a")
+set(CMAKE_SYSTEM_PROCESSOR "aarch64")
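+
+# Example configure step using this file (paths are placeholders):
+#   cmake path/to/libgav1 \
+#     -DCMAKE_TOOLCHAIN_FILE=path/to/libgav1/cmake/toolchains/aarch64-linux-gnu.cmake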
diff --git a/cmake/toolchains/android.cmake b/cmake/toolchains/android.cmake
new file mode 100644 (file)
index 0000000..b550397
--- /dev/null
@@ -0,0 +1,53 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_
+
+# Additional ANDROID_* settings are available, see:
+# https://developer.android.com/ndk/guides/cmake#variables
+
+if(NOT ANDROID_PLATFORM)
+  set(ANDROID_PLATFORM android-21)
+endif()
+
+# Choose target architecture with:
+#
+# -DANDROID_ABI={armeabi-v7a,armeabi-v7a with NEON,arm64-v8a,x86,x86_64}
+if(NOT ANDROID_ABI)
+  set(ANDROID_ABI arm64-v8a)
+endif()
+
+# Force arm mode for 32-bit arm targets (instead of the default thumb) to
+# improve performance.
+if(ANDROID_ABI MATCHES "^armeabi" AND NOT ANDROID_ARM_MODE)
+  set(ANDROID_ARM_MODE arm)
+endif()
+
+# Toolchain files don't have access to cached variables:
+# https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate
+# environment variable when loaded the first time.
+if(LIBGAV1_ANDROID_NDK_PATH)
+  set(ENV{LIBGAV1_ANDROID_NDK_PATH} "${LIBGAV1_ANDROID_NDK_PATH}")
+else()
+  set(LIBGAV1_ANDROID_NDK_PATH "$ENV{LIBGAV1_ANDROID_NDK_PATH}")
+endif()
+
+if(NOT LIBGAV1_ANDROID_NDK_PATH)
+  message(FATAL_ERROR "LIBGAV1_ANDROID_NDK_PATH not set.")
+  return()
+endif()
+
+include("${LIBGAV1_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake")
diff --git a/cmake/toolchains/arm-linux-gnueabihf.cmake b/cmake/toolchains/arm-linux-gnueabihf.cmake
new file mode 100644 (file)
index 0000000..7d58ce1
--- /dev/null
@@ -0,0 +1,39 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+  set(CROSS arm-linux-gnueabihf-)
+endif()
+
+# For c_decoder_test.c and c_version_test.c.
+if(NOT CMAKE_C_COMPILER)
+  set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+# Note: -march=armv7-a+fp is an alternative to -mfpu with newer versions of
+# gcc:
+# https://gcc.gnu.org/git/?p=gcc.git&a=commit;h=dff2abcbee65dbb4b7ca3ade0f7622ffdc0af391
+set(CMAKE_C_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3")
+if(NOT CMAKE_CXX_COMPILER)
+  set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3")
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon")
diff --git a/codereview.settings b/codereview.settings
new file mode 100644
index 0000000..ccba2ee
--- /dev/null
+++ b/codereview.settings
@@ -0,0 +1,4 @@
+# This file is used by git cl to get repository specific information.
+GERRIT_HOST: True
+CODE_REVIEW_SERVER: chromium-review.googlesource.com
+GERRIT_SQUASH_UPLOADS: False
diff --git a/examples/file_reader.cc b/examples/file_reader.cc
new file mode 100644
index 0000000..a01b7ab
--- /dev/null
+++ b/examples/file_reader.cc
@@ -0,0 +1,193 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <new>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/ivf_parser.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+  _setmode(_fileno(stream), _O_BINARY);
+#endif
+  return stream;
+}
+
+}  // namespace
+
+bool FileReader::registered_in_factory_ =
+    FileReaderFactory::RegisterReader(FileReader::Open);
+
+FileReader::~FileReader() {
+  if (owns_file_) fclose(file_);
+}
+
+std::unique_ptr<FileReaderInterface> FileReader::Open(
+    const std::string& file_name, const bool error_tolerant) {
+  if (file_name.empty()) return nullptr;
+
+  FILE* raw_file_ptr;
+
+  bool owns_file = true;
+  if (file_name == "-") {
+    raw_file_ptr = SetBinaryMode(stdin);
+    owns_file = false;  // stdin is owned by the Standard C Library.
+  } else {
+    raw_file_ptr = fopen(file_name.c_str(), "rb");
+  }
+
+  if (raw_file_ptr == nullptr) {
+    return nullptr;
+  }
+
+  std::unique_ptr<FileReader> file(
+      new (std::nothrow) FileReader(raw_file_ptr, owns_file, error_tolerant));
+  if (file == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+    if (owns_file) fclose(raw_file_ptr);
+    return nullptr;
+  }
+
+  if (!file->ReadIvfFileHeader()) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported file type");
+    return nullptr;
+  }
+
+  // With C++11, to return |file|, an explicit move is required as the return
+  // type differs from the local variable. Overload resolution isn't guaranteed
+  // in this case, though some compilers may adopt the C++14 behavior (C++
+  // Standard Core Language Issue #1579, Return by converting move
+  // constructor):
+  // https://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1579
+  // To keep things simple we opt for the following compatible form.
+  return std::unique_ptr<FileReaderInterface>(file.release());
+}
+
+// IVF Frame Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3    size of frame in bytes (not including the 12-byte header)
+// bytes 4-11   64-bit presentation timestamp
+// bytes 12..   frame data
+bool FileReader::ReadTemporalUnit(std::vector<uint8_t>* const tu_data,
+                                  int64_t* const timestamp) {
+  if (tu_data == nullptr) return false;
+  tu_data->clear();
+
+  uint8_t header_buffer[kIvfFrameHeaderSize];
+  const size_t num_read = fread(header_buffer, 1, kIvfFrameHeaderSize, file_);
+
+  if (IsEndOfFile()) {
+    if (num_read != 0) {
+      LIBGAV1_EXAMPLES_LOG_ERROR(
+          "Cannot read IVF frame header: Not enough data available");
+      return false;
+    }
+
+    return true;
+  }
+
+  IvfFrameHeader ivf_frame_header;
+  if (!ParseIvfFrameHeader(header_buffer, &ivf_frame_header)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF frame header");
+    if (error_tolerant_) {
+      ivf_frame_header.frame_size =
+          std::min(ivf_frame_header.frame_size, size_t{kMaxTemporalUnitSize});
+    } else {
+      return false;
+    }
+  }
+
+  if (timestamp != nullptr) *timestamp = ivf_frame_header.timestamp;
+
+  tu_data->resize(ivf_frame_header.frame_size);
+  const size_t size_read =
+      fread(tu_data->data(), 1, ivf_frame_header.frame_size, file_);
+  if (size_read != ivf_frame_header.frame_size) {
+    LIBGAV1_EXAMPLES_LOG_ERROR(
+        "Unexpected EOF or I/O error reading frame data");
+    if (error_tolerant_) {
+      tu_data->resize(size_read);
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Attempt to read an IVF file header. Returns true for success, and false for
+// failure.
+//
+// IVF File Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3    signature: 'DKIF'
+// bytes 4-5    version (should be 0)
+// bytes 6-7    length of header in bytes
+// bytes 8-11   codec FourCC (e.g., 'VP80')
+// bytes 12-13  width in pixels
+// bytes 14-15  height in pixels
+// bytes 16-19  frame rate
+// bytes 20-23  time scale
+// bytes 24-27  number of frames in file
+// bytes 28-31  unused
+//
+// Note: The rate and scale fields correspond to the numerator and denominator
+// of frame rate (fps) or time base (the reciprocal of frame rate) as follows:
+//
+// bytes 16-19  frame rate  timebase.den  framerate.numerator
+// bytes 20-23  time scale  timebase.num  framerate.denominator
+bool FileReader::ReadIvfFileHeader() {
+  uint8_t header_buffer[kIvfFileHeaderSize];
+  const size_t num_read = fread(header_buffer, 1, kIvfFileHeaderSize, file_);
+  if (num_read != kIvfFileHeaderSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR(
+        "Cannot read IVF header: Not enough data available");
+    return false;
+  }
+
+  IvfFileHeader ivf_file_header;
+  if (!ParseIvfFileHeader(header_buffer, &ivf_file_header)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF file header");
+    if (error_tolerant_) {
+      ivf_file_header = {};
+    } else {
+      return false;
+    }
+  }
+
+  width_ = ivf_file_header.width;
+  height_ = ivf_file_header.height;
+  frame_rate_ = ivf_file_header.frame_rate_numerator;
+  time_scale_ = ivf_file_header.frame_rate_denominator;
+  type_ = kFileTypeIvf;
+
+  return true;
+}
+
+}  // namespace libgav1
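
The IVF field layouts documented above are plain little-endian values. As a minimal illustration of the 12-byte frame header decode (the struct and function names here are hypothetical, not part of the library):

```cpp
#include <cstdint>

// Hypothetical decode of the 12-byte IVF frame header described in
// file_reader.cc; IVF stores all fields little-endian.
struct IvfFrameFields {
  uint32_t frame_size;  // bytes 0-3, excludes the 12-byte header itself
  int64_t timestamp;    // bytes 4-11, presentation timestamp
};

inline IvfFrameFields DecodeIvfFrameFields(const uint8_t data[12]) {
  IvfFrameFields fields;
  fields.frame_size = static_cast<uint32_t>(data[0]) |
                      static_cast<uint32_t>(data[1]) << 8 |
                      static_cast<uint32_t>(data[2]) << 16 |
                      static_cast<uint32_t>(data[3]) << 24;
  uint64_t timestamp = 0;
  for (int i = 0; i < 8; ++i) {
    timestamp |= static_cast<uint64_t>(data[4 + i]) << (8 * i);
  }
  fields.timestamp = static_cast<int64_t>(timestamp);
  return fields;
}
```
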
diff --git a/examples/file_reader.h b/examples/file_reader.h
new file mode 100644
index 0000000..c342a20
--- /dev/null
+++ b/examples/file_reader.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+// Temporal Unit based file reader class. Currently supports only IVF files.
+class FileReader : public FileReaderInterface {
+ public:
+  enum FileType {
+    kFileTypeUnknown,
+    kFileTypeIvf,
+  };
+
+  // Creates and returns a FileReader that reads from |file_name|.
+  // If |error_tolerant| is true, format and read errors are ignored and
+  // ReadTemporalUnit() may return truncated data.
+  // Returns nullptr when the file does not exist, cannot be read, or is not an
+  // IVF file.
+  static std::unique_ptr<FileReaderInterface> Open(const std::string& file_name,
+                                                   bool error_tolerant = false);
+
+  FileReader() = delete;
+  FileReader(const FileReader&) = delete;
+  FileReader& operator=(const FileReader&) = delete;
+
+  // Closes |file_|.
+  ~FileReader() override;
+
+  // Reads a temporal unit from |file_| and writes the data to |tu_data|.
+  // Returns true when:
+  // - A temporal unit is read successfully, or
+  // - At end of file.
+  // When ReadTemporalUnit() is called at the end of the file, it will return
+  // true without writing any data to |tu_data|.
+  //
+  // The |timestamp| pointer is optional: callers not interested in timestamps
+  // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+  // the presentation timestamp from the IVF frame header.
+  /*LIBGAV1_MUST_USE_RESULT*/ bool ReadTemporalUnit(
+      std::vector<uint8_t>* tu_data, int64_t* timestamp) override;
+
+  /*LIBGAV1_MUST_USE_RESULT*/ bool IsEndOfFile() const override {
+    return feof(file_) != 0;
+  }
+
+  // The values returned by these accessors are strictly informative. No
+  // validation is performed when they are read from the IVF file header.
+  size_t width() const override { return width_; }
+  size_t height() const override { return height_; }
+  size_t frame_rate() const override { return frame_rate_; }
+  size_t time_scale() const override { return time_scale_; }
+
+ private:
+  FileReader(FILE* file, bool owns_file, bool error_tolerant)
+      : file_(file), owns_file_(owns_file), error_tolerant_(error_tolerant) {}
+
+  bool ReadIvfFileHeader();
+
+  FILE* file_ = nullptr;
+  size_t width_ = 0;
+  size_t height_ = 0;
+  size_t frame_rate_ = 0;
+  size_t time_scale_ = 0;
+  FileType type_ = kFileTypeUnknown;
+  // True if this object owns file_ and is responsible for closing it when
+  // done.
+  const bool owns_file_;
+  const bool error_tolerant_;
+
+  static bool registered_in_factory_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_H_
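
A usage sketch for the class above (the input path is a placeholder; a real caller would hand each temporal unit to a decoder):

```cpp
#include <cstdint>
#include <vector>

#include "examples/file_reader.h"

// Hypothetical read loop; "input.ivf" is a placeholder path.
bool ReadAllTemporalUnits() {
  auto reader = libgav1::FileReader::Open("input.ivf");
  if (reader == nullptr) return false;  // Missing, unreadable, or not IVF.
  std::vector<uint8_t> tu_data;
  int64_t timestamp = 0;
  while (!reader->IsEndOfFile()) {
    if (!reader->ReadTemporalUnit(&tu_data, &timestamp)) return false;
    if (tu_data.empty()) break;  // A true return with no data means EOF.
    // |tu_data| now holds one temporal unit stamped with |timestamp|.
  }
  return true;
}
```
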
diff --git a/examples/file_reader_constants.cc b/examples/file_reader_constants.cc
new file mode 100644
index 0000000..8439071
--- /dev/null
+++ b/examples/file_reader_constants.cc
@@ -0,0 +1,23 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_constants.h"
+
+namespace libgav1 {
+
+const char kIvfSignature[4] = {'D', 'K', 'I', 'F'};
+const char kAv1FourCcUpper[4] = {'A', 'V', '0', '1'};
+const char kAv1FourCcLower[4] = {'a', 'v', '0', '1'};
+
+}  // namespace libgav1
diff --git a/examples/file_reader_constants.h b/examples/file_reader_constants.h
new file mode 100644
index 0000000..00922b4
--- /dev/null
+++ b/examples/file_reader_constants.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+
+namespace libgav1 {
+
+enum {
+  kIvfHeaderVersion = 0,
+  kIvfFrameHeaderSize = 12,
+  kIvfFileHeaderSize = 32,
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  kMaxTemporalUnitSize = 512 * 1024,
+#else
+  kMaxTemporalUnitSize = 256 * 1024 * 1024,
+#endif
+};
+
+extern const char kIvfSignature[4];
+extern const char kAv1FourCcUpper[4];
+extern const char kAv1FourCcLower[4];
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
diff --git a/examples/file_reader_factory.cc b/examples/file_reader_factory.cc
new file mode 100644
index 0000000..d5260eb
--- /dev/null
+++ b/examples/file_reader_factory.cc
@@ -0,0 +1,51 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_factory.h"
+
+#include <new>
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+std::vector<FileReaderFactory::OpenFunction>* GetFileReaderOpenFunctions() {
+  static auto* open_functions =
+      new (std::nothrow) std::vector<FileReaderFactory::OpenFunction>();
+  return open_functions;
+}
+
+}  // namespace
+
+bool FileReaderFactory::RegisterReader(OpenFunction open_function) {
+  if (open_function == nullptr) return false;
+  auto* open_functions = GetFileReaderOpenFunctions();
+  const size_t num_readers = open_functions->size();
+  open_functions->push_back(open_function);
+  return open_functions->size() == num_readers + 1;
+}
+
+std::unique_ptr<FileReaderInterface> FileReaderFactory::OpenReader(
+    const std::string& file_name, const bool error_tolerant /*= false*/) {
+  for (auto* open_function : *GetFileReaderOpenFunctions()) {
+    auto reader = open_function(file_name, error_tolerant);
+    if (reader == nullptr) continue;
+    return reader;
+  }
+  LIBGAV1_EXAMPLES_LOG_ERROR("No file reader able to open input");
+  return nullptr;
+}
+
+}  // namespace libgav1
diff --git a/examples/file_reader_factory.h b/examples/file_reader_factory.h
new file mode 100644
index 0000000..0f53484
--- /dev/null
+++ b/examples/file_reader_factory.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+
+#include <memory>
+#include <string>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+class FileReaderFactory {
+ public:
+  using OpenFunction = std::unique_ptr<FileReaderInterface> (*)(
+      const std::string& file_name, bool error_tolerant);
+
+  FileReaderFactory() = delete;
+  FileReaderFactory(const FileReaderFactory&) = delete;
+  FileReaderFactory& operator=(const FileReaderFactory&) = delete;
+  ~FileReaderFactory() = default;
+
+  // Registers the OpenFunction for a FileReaderInterface and returns true when
+  // registration succeeds.
+  static bool RegisterReader(OpenFunction open_function);
+
+  // Passes |file_name| to each OpenFunction until one succeeds. Returns nullptr
+  // when no reader is found for |file_name|. Otherwise a FileReaderInterface is
+  // returned. If |error_tolerant| is true and the reader supports it, some
+  // format and read errors may be ignored and partial data returned.
+  static std::unique_ptr<FileReaderInterface> OpenReader(
+      const std::string& file_name, bool error_tolerant = false);
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
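
The OpenFunction contract the factory expects can be illustrated as follows; MyFormatOpen is hypothetical, and a real implementation would probe |file_name|, returning nullptr only for inputs it does not recognize so the remaining registered readers get a chance:

```cpp
#include <memory>
#include <string>

#include "examples/file_reader_factory.h"
#include "examples/file_reader_interface.h"

// Hypothetical OpenFunction for a custom reader; a real one would probe
// the file and return a FileReaderInterface instance on a match.
std::unique_ptr<libgav1::FileReaderInterface> MyFormatOpen(
    const std::string& /*file_name*/, bool /*error_tolerant*/) {
  return nullptr;  // Unrecognized input; the factory tries the next reader.
}

// Registration at static initialization time, mirroring how FileReader
// registers itself via registered_in_factory_.
static const bool my_format_registered =
    libgav1::FileReaderFactory::RegisterReader(MyFormatOpen);

std::unique_ptr<libgav1::FileReaderInterface> OpenInput(
    const std::string& file_name) {
  // Tries each registered OpenFunction in turn; nullptr if none match.
  return libgav1::FileReaderFactory::OpenReader(file_name);
}
```
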
diff --git a/examples/file_reader_factory_test.cc b/examples/file_reader_factory_test.cc
new file mode 100644
index 0000000..346f9f8
--- /dev/null
+++ b/examples/file_reader_factory_test.cc
@@ -0,0 +1,114 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_factory.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+#include <string>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "examples/file_reader_interface.h"
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+class AlwaysFailFileReader : public FileReaderInterface {
+ public:
+  static std::unique_ptr<FileReaderInterface> Open(
+      const std::string& /*file_name*/, bool /*error_tolerant*/) {
+    return nullptr;
+  }
+
+  AlwaysFailFileReader() = delete;
+  AlwaysFailFileReader(const AlwaysFailFileReader&) = delete;
+  AlwaysFailFileReader& operator=(const AlwaysFailFileReader&) = delete;
+  // Note this isn't overridden as the class can never be instantiated. This
+  // avoids an unused function warning.
+  // ~AlwaysFailFileReader() override = default;
+
+  bool ReadTemporalUnit(std::vector<uint8_t>* /*data*/,
+                        int64_t* /*pts*/) override {
+    return false;
+  }
+  bool IsEndOfFile() const override { return false; }
+
+  size_t width() const override { return 0; }
+  size_t height() const override { return 0; }
+  size_t frame_rate() const override { return 0; }
+  size_t time_scale() const override { return 0; }
+
+  static bool is_registered_;
+};
+
+class AlwaysOkFileReader : public FileReaderInterface {
+ public:
+  static std::unique_ptr<FileReaderInterface> Open(
+      const std::string& /*file_name*/, bool /*error_tolerant*/) {
+    auto reader = absl::WrapUnique(new (std::nothrow) AlwaysOkFileReader());
+
+    return reader;
+  }
+
+  AlwaysOkFileReader(const AlwaysOkFileReader&) = delete;
+  AlwaysOkFileReader& operator=(const AlwaysOkFileReader&) = delete;
+  ~AlwaysOkFileReader() override = default;
+
+  bool ReadTemporalUnit(std::vector<uint8_t>* /*data*/,
+                        int64_t* /*pts*/) override {
+    return true;
+  }
+  bool IsEndOfFile() const override { return true; }
+
+  size_t width() const override { return 1; }
+  size_t height() const override { return 1; }
+  size_t frame_rate() const override { return 1; }
+  size_t time_scale() const override { return 1; }
+
+  static bool is_registered_;
+
+ private:
+  AlwaysOkFileReader() = default;
+};
+
+bool AlwaysFailFileReader::is_registered_ =
+    FileReaderFactory::RegisterReader(AlwaysFailFileReader::Open);
+
+bool AlwaysOkFileReader::is_registered_ =
+    FileReaderFactory::RegisterReader(AlwaysOkFileReader::Open);
+
+TEST(FileReaderFactoryTest, RegistrationFail) {
+  EXPECT_FALSE(FileReaderFactory::RegisterReader(nullptr));
+}
+
+TEST(FileReaderFactoryTest, OpenReader) {
+  ASSERT_TRUE(AlwaysOkFileReader::is_registered_);
+  ASSERT_TRUE(AlwaysFailFileReader::is_registered_);
+
+  auto reader = FileReaderFactory::OpenReader("fake file");
+  EXPECT_NE(reader, nullptr);
+  EXPECT_TRUE(reader->IsEndOfFile());
+  EXPECT_TRUE(reader->ReadTemporalUnit(nullptr, nullptr));
+  EXPECT_EQ(reader->width(), 1);
+  EXPECT_EQ(reader->height(), 1);
+  EXPECT_EQ(reader->frame_rate(), 1);
+  EXPECT_EQ(reader->time_scale(), 1);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/examples/file_reader_interface.h b/examples/file_reader_interface.h
new file mode 100644
index 0000000..d8f7030
--- /dev/null
+++ b/examples/file_reader_interface.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace libgav1 {
+
+class FileReaderInterface {
+ public:
+  FileReaderInterface() = default;
+  FileReaderInterface(const FileReaderInterface&) = delete;
+  FileReaderInterface& operator=(const FileReaderInterface&) = delete;
+
+  FileReaderInterface(FileReaderInterface&&) = default;
+  FileReaderInterface& operator=(FileReaderInterface&&) = default;
+
+  // Closes the file.
+  virtual ~FileReaderInterface() = default;
+
+  // Reads a temporal unit from the file and writes the data to |tu_data|.
+  // Returns true when:
+  // - A temporal unit is read successfully, or
+  // - At end of file.
+  // When ReadTemporalUnit() is called at the end of the file, it will return
+  // true without writing any data to |tu_data|.
+  //
+  // The |timestamp| pointer is optional: callers not interested in timestamps
+  // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+  // the presentation timestamp of the temporal unit.
+  /*LIBGAV1_MUST_USE_RESULT*/ virtual bool ReadTemporalUnit(
+      std::vector<uint8_t>* tu_data, int64_t* timestamp) = 0;
+
+  /*LIBGAV1_MUST_USE_RESULT*/ virtual bool IsEndOfFile() const = 0;
+
+  // The values returned by these accessors are strictly informative. No
+  // validation is performed when they are read from file.
+  virtual size_t width() const = 0;
+  virtual size_t height() const = 0;
+  virtual size_t frame_rate() const = 0;
+  virtual size_t time_scale() const = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
diff --git a/examples/file_reader_test.cc b/examples/file_reader_test.cc
new file mode 100644
index 0000000..53e27f7
--- /dev/null
+++ b/examples/file_reader_test.cc
@@ -0,0 +1,126 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader_interface.h"
+#include "examples/file_reader_test_common.h"
+#include "gtest/gtest.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+// For use with tests that expect Open() to fail: distinguishes failure due to
+// the file contents from failure due to a missing file.
+bool FileCanBeRead(const std::string& filename) {
+  FILE* const file = fopen(filename.c_str(), "r");
+  if (file != nullptr) {
+    fclose(file);
+    return true;
+  }
+  return false;
+}
+
+TEST(FileReaderTest, FailOpen) {
+  EXPECT_EQ(FileReader::Open(""), nullptr);
+  const std::string filename =
+      test_utils::GetTestInputFilePath("ivf-signature-only");
+  SCOPED_TRACE("Filename: " + filename);
+  EXPECT_TRUE(FileCanBeRead(filename));
+  EXPECT_EQ(FileReader::Open(filename), nullptr);
+}
+
+TEST(FileReaderTest, Open) {
+  const std::string filenames[] = {
+      test_utils::GetTestInputFilePath("five-frames.ivf"),
+      test_utils::GetTestInputFilePath("ivf-header-and-truncated-frame-header"),
+      test_utils::GetTestInputFilePath("ivf-header-only"),
+      test_utils::GetTestInputFilePath("one-frame-truncated.ivf"),
+      test_utils::GetTestInputFilePath("one-frame.ivf"),
+  };
+  for (const auto& filename : filenames) {
+    EXPECT_NE(FileReader::Open(filename), nullptr) << "Filename: " << filename;
+  }
+}
+
+TEST_P(FileReaderFailTest, FailRead) {
+  ASSERT_FALSE(reader_->ReadTemporalUnit(&tu_data_, nullptr));
+}
+
+TEST_P(FileReaderErrorTolerant, ReadThroughEndOfFile) {
+  while (!reader_->IsEndOfFile()) {
+    tu_data_.clear();
+    ASSERT_TRUE(reader_->ReadTemporalUnit(&tu_data_, nullptr));
+    ASSERT_GT(tu_data_.size(), 0);
+  }
+}
+
+TEST_P(FileReaderTestNoTimeStamps, ReadThroughEndOfFile) {
+  while (!reader_->IsEndOfFile()) {
+    tu_data_.clear();
+    ASSERT_TRUE(reader_->ReadTemporalUnit(&tu_data_, nullptr));
+  }
+}
+
+TEST_P(FileReaderTestWithTimeStamps, ReadThroughEndOfFile) {
+  int64_t timestamp = 0;
+  while (!reader_->IsEndOfFile()) {
+    tu_data_.clear();
+    ASSERT_TRUE(reader_->ReadTemporalUnit(&tu_data_, &timestamp));
+    if (!tu_data_.empty()) {
+      last_timestamp_ = timestamp;
+    }
+  }
+  ASSERT_TRUE(tu_data_.empty());
+  ASSERT_EQ(last_timestamp_, expected_last_timestamp_);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    FailRead, FileReaderFailTest,
+    testing::Values(
+        FileReaderTestParameters(FileReader::Open,
+                                 "ivf-header-and-truncated-frame-header"),
+        FileReaderTestParameters(FileReader::Open, "one-frame-truncated.ivf")));
+
+INSTANTIATE_TEST_SUITE_P(ReadThroughEndOfFile, FileReaderErrorTolerant,
+                         testing::Values(FileReaderTestParameters(
+                             FileReader::Open, "one-frame-truncated.ivf")));
+
+INSTANTIATE_TEST_SUITE_P(
+    ReadThroughEndOfFile, FileReaderTestNoTimeStamps,
+    testing::Values(FileReaderTestParameters(FileReader::Open, "one-frame.ivf"),
+                    FileReaderTestParameters(FileReader::Open,
+                                             "one-frame-large-timestamp.ivf"),
+                    FileReaderTestParameters(FileReader::Open,
+                                             "five-frames.ivf")));
+
+INSTANTIATE_TEST_SUITE_P(
+    ReadThroughEndOfFile, FileReaderTestWithTimeStamps,
+    testing::Values(
+        FileReaderTestWithTimeStampsParameters(FileReader::Open,
+                                               "one-frame.ivf", 0),
+        FileReaderTestWithTimeStampsParameters(FileReader::Open,
+                                               "one-frame-large-timestamp.ivf",
+                                               4294967296),
+        FileReaderTestWithTimeStampsParameters(FileReader::Open,
+                                               "five-frames.ivf", 4)));
+
+}  // namespace
+}  // namespace libgav1
diff --git a/examples/file_reader_test_common.cc b/examples/file_reader_test_common.cc
new file mode 100644
index 0000000..735dd9e
--- /dev/null
+++ b/examples/file_reader_test_common.cc
@@ -0,0 +1,43 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_test_common.h"
+
+#include <ostream>
+
+#include "examples/file_reader.h"
+
+namespace libgav1 {
+
+std::ostream& operator<<(std::ostream& stream,
+                         const FileReaderTestParameters& parameters) {
+  stream << "open_function="
+         << ((parameters.open_function == FileReader::Open) ? "FileReader"
+                                                            : "Unknown")
+         << ", file_name=" << parameters.file_name;
+  return stream;
+}
+
+std::ostream& operator<<(
+    std::ostream& stream,
+    const FileReaderTestWithTimeStampsParameters& parameters) {
+  stream << "open_function="
+         << ((parameters.open_function == FileReader::Open) ? "FileReader"
+                                                            : "Unknown")
+         << ", file_name=" << parameters.file_name
+         << ", expected_last_timestamp=" << parameters.expected_last_timestamp;
+  return stream;
+}
+
+}  // namespace libgav1
diff --git a/examples/file_reader_test_common.h b/examples/file_reader_test_common.h
new file mode 100644
index 0000000..187a6ac
--- /dev/null
+++ b/examples/file_reader_test_common.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_TEST_COMMON_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_TEST_COMMON_H_
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "gtest/gtest.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+
+struct FileReaderTestParameters {
+  FileReaderTestParameters() = default;
+  FileReaderTestParameters(FileReaderFactory::OpenFunction open_function,
+                           const char* file_name)
+      : open_function(open_function), file_name(file_name) {}
+  FileReaderTestParameters(const FileReaderTestParameters&) = default;
+  FileReaderTestParameters& operator=(const FileReaderTestParameters&) = delete;
+  FileReaderTestParameters(FileReaderTestParameters&&) = default;
+  FileReaderTestParameters& operator=(FileReaderTestParameters&&) = default;
+  ~FileReaderTestParameters() = default;
+
+  FileReaderFactory::OpenFunction open_function = nullptr;
+  const char* file_name = nullptr;
+};
+
+class FileReaderTestBase {
+ public:
+  FileReaderTestBase() = default;
+  FileReaderTestBase(const FileReaderTestBase&) = delete;
+  FileReaderTestBase& operator=(const FileReaderTestBase&) = delete;
+  FileReaderTestBase(FileReaderTestBase&&) = default;
+  FileReaderTestBase& operator=(FileReaderTestBase&&) = default;
+  ~FileReaderTestBase() = default;
+
+ protected:
+  void OpenReader(const char* file_name,
+                  FileReaderFactory::OpenFunction open_function) {
+    file_name_ = test_utils::GetTestInputFilePath(file_name);
+    reader_ = open_function(file_name_, /*error_tolerant=*/false);
+    ASSERT_NE(reader_, nullptr);
+  }
+
+  std::string file_name_;
+  std::unique_ptr<FileReaderInterface> reader_;
+  std::vector<uint8_t> tu_data_;
+};
+
+class FileReaderFailTest
+    : public FileReaderTestBase,
+      public testing::TestWithParam<FileReaderTestParameters> {
+ public:
+  FileReaderFailTest() = default;
+  FileReaderFailTest(const FileReaderFailTest&) = delete;
+  FileReaderFailTest& operator=(const FileReaderFailTest&) = delete;
+  ~FileReaderFailTest() override = default;
+
+ protected:
+  void SetUp() override {
+    OpenReader(GetParam().file_name, GetParam().open_function);
+  }
+};
+
+class FileReaderTestNoTimeStamps
+    : public FileReaderTestBase,
+      public testing::TestWithParam<FileReaderTestParameters> {
+ public:
+  FileReaderTestNoTimeStamps() = default;
+  FileReaderTestNoTimeStamps(const FileReaderTestNoTimeStamps&) = delete;
+  FileReaderTestNoTimeStamps& operator=(const FileReaderTestNoTimeStamps&) =
+      delete;
+  ~FileReaderTestNoTimeStamps() override = default;
+
+ protected:
+  void SetUp() override {
+    OpenReader(GetParam().file_name, GetParam().open_function);
+  }
+};
+
+class FileReaderErrorTolerant
+    : public FileReaderTestBase,
+      public testing::TestWithParam<FileReaderTestParameters> {
+ public:
+  FileReaderErrorTolerant() = default;
+  FileReaderErrorTolerant(const FileReaderErrorTolerant&) = delete;
+  FileReaderErrorTolerant& operator=(const FileReaderErrorTolerant&) = delete;
+  ~FileReaderErrorTolerant() override = default;
+
+ protected:
+  void SetUp() override {
+    file_name_ = test_utils::GetTestInputFilePath(GetParam().file_name);
+    reader_ = GetParam().open_function(file_name_, /*error_tolerant=*/true);
+    ASSERT_NE(reader_, nullptr);
+  }
+};
+
+struct FileReaderTestWithTimeStampsParameters {
+  FileReaderTestWithTimeStampsParameters() = default;
+  FileReaderTestWithTimeStampsParameters(
+      FileReaderFactory::OpenFunction open_function, const char* file_name,
+      int64_t expected_last_timestamp)
+      : open_function(open_function),
+        file_name(file_name),
+        expected_last_timestamp(expected_last_timestamp) {}
+  FileReaderTestWithTimeStampsParameters(
+      const FileReaderTestWithTimeStampsParameters&) = default;
+  FileReaderTestWithTimeStampsParameters& operator=(
+      const FileReaderTestWithTimeStampsParameters&) = delete;
+  FileReaderTestWithTimeStampsParameters(
+      FileReaderTestWithTimeStampsParameters&&) = default;
+  FileReaderTestWithTimeStampsParameters& operator=(
+      FileReaderTestWithTimeStampsParameters&&) = default;
+  ~FileReaderTestWithTimeStampsParameters() = default;
+
+  FileReaderFactory::OpenFunction open_function = nullptr;
+  const char* file_name = nullptr;
+  int64_t expected_last_timestamp = 0;
+};
+
+std::ostream& operator<<(std::ostream& stream,
+                         const FileReaderTestParameters& parameters);
+
+std::ostream& operator<<(
+    std::ostream& stream,
+    const FileReaderTestWithTimeStampsParameters& parameters);
+
+class FileReaderTestWithTimeStamps
+    : public FileReaderTestBase,
+      public testing::TestWithParam<FileReaderTestWithTimeStampsParameters> {
+ public:
+  FileReaderTestWithTimeStamps() = default;
+  FileReaderTestWithTimeStamps(const FileReaderTestWithTimeStamps&) = delete;
+  FileReaderTestWithTimeStamps& operator=(const FileReaderTestWithTimeStamps&) =
+      delete;
+  ~FileReaderTestWithTimeStamps() override = default;
+
+ protected:
+  void SetUp() override {
+    FileReaderTestWithTimeStampsParameters parameters = GetParam();
+    OpenReader(parameters.file_name, parameters.open_function);
+    expected_last_timestamp_ = parameters.expected_last_timestamp;
+  }
+
+  int64_t last_timestamp_ = 0;
+  int64_t expected_last_timestamp_ = 0;
+};
+
+}  // namespace libgav1
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_TEST_COMMON_H_
diff --git a/examples/file_writer.cc b/examples/file_writer.cc
new file mode 100644
index 0000000..54afe14
--- /dev/null
+++ b/examples/file_writer.cc
@@ -0,0 +1,183 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_writer.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <new>
+#include <string>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+  _setmode(_fileno(stream), _O_BINARY);
+#endif
+  return stream;
+}
+
+std::string GetY4mColorSpaceString(
+    const FileWriter::Y4mParameters& y4m_parameters) {
+  std::string color_space_string;
+  switch (y4m_parameters.image_format) {
+    case kImageFormatMonochrome400:
+      color_space_string = "mono";
+      break;
+    case kImageFormatYuv420:
+      if (y4m_parameters.bitdepth == 8) {
+        if (y4m_parameters.chroma_sample_position ==
+            kChromaSamplePositionVertical) {
+          color_space_string = "420mpeg2";
+        } else if (y4m_parameters.chroma_sample_position ==
+                   kChromaSamplePositionColocated) {
+          color_space_string = "420";
+        } else {
+          color_space_string = "420jpeg";
+        }
+      } else {
+        color_space_string = "420";
+      }
+      break;
+    case kImageFormatYuv422:
+      color_space_string = "422";
+      break;
+    case kImageFormatYuv444:
+      color_space_string = "444";
+      break;
+  }
+
+  if (y4m_parameters.bitdepth > 8) {
+    const bool monochrome =
+        y4m_parameters.image_format == kImageFormatMonochrome400;
+    if (!monochrome) color_space_string += "p";
+    color_space_string += std::to_string(y4m_parameters.bitdepth);
+  }
+
+  return color_space_string;
+}
+
+}  // namespace
+
+FileWriter::~FileWriter() { fclose(file_); }
+
+std::unique_ptr<FileWriter> FileWriter::Open(
+    const std::string& file_name, FileType file_type,
+    const Y4mParameters* const y4m_parameters) {
+  if (file_name.empty() ||
+      (file_type == kFileTypeY4m && y4m_parameters == nullptr) ||
+      (file_type != kFileTypeRaw && file_type != kFileTypeY4m)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Invalid parameters");
+    return nullptr;
+  }
+
+  FILE* raw_file_ptr;
+
+  if (file_name == "-") {
+    raw_file_ptr = SetBinaryMode(stdout);
+  } else {
+    raw_file_ptr = fopen(file_name.c_str(), "wb");
+  }
+
+  if (raw_file_ptr == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unable to open output file");
+    return nullptr;
+  }
+
+  std::unique_ptr<FileWriter> file(new (std::nothrow) FileWriter(raw_file_ptr));
+  if (file == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+    fclose(raw_file_ptr);
+    return nullptr;
+  }
+
+  if (file_type == kFileTypeY4m && !file->WriteY4mFileHeader(*y4m_parameters)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M file header");
+    return nullptr;
+  }
+
+  file->file_type_ = file_type;
+  return file;
+}
+
+bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) {
+  if (file_type_ == kFileTypeY4m) {
+    const char kY4mFrameHeader[] = "FRAME\n";
+    if (fwrite(kY4mFrameHeader, 1, strlen(kY4mFrameHeader), file_) !=
+        strlen(kY4mFrameHeader)) {
+      LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M frame header");
+      return false;
+    }
+  }
+
+  const size_t pixel_size =
+      (frame_buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+  for (int plane_index = 0; plane_index < frame_buffer.NumPlanes();
+       ++plane_index) {
+    const int height = frame_buffer.displayed_height[plane_index];
+    const int width = frame_buffer.displayed_width[plane_index];
+    const int stride = frame_buffer.stride[plane_index];
+    const uint8_t* const plane_pointer = frame_buffer.plane[plane_index];
+    for (int row = 0; row < height; ++row) {
+      const uint8_t* const row_pointer = &plane_pointer[row * stride];
+      if (fwrite(row_pointer, pixel_size, width, file_) !=
+          static_cast<size_t>(width)) {
+        char error_string[256];
+        snprintf(error_string, sizeof(error_string),
+                 "File write failed: %s (errno=%d)", strerror(errno), errno);
+        LIBGAV1_EXAMPLES_LOG_ERROR(error_string);
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Writes Y4M file header to |file_| and returns true when successful.
+//
+// A Y4M file begins with a plaintext file signature of 'YUV4MPEG2 '.
+//
+// Following the signature is any number of optional parameters preceded by a
+// space. We always write:
+//
+// Width: 'W' followed by image width in pixels.
+// Height: 'H' followed by image height in pixels.
+// Frame Rate: 'F' followed by frames/second in the form numerator:denominator.
+// Interlacing: 'I' followed by 'p' for progressive.
+// Color space: 'C' followed by a string representation of the color space.
+//
+// More info here: https://wiki.multimedia.cx/index.php/YUV4MPEG2
+bool FileWriter::WriteY4mFileHeader(const Y4mParameters& y4m_parameters) {
+  std::string y4m_header = "YUV4MPEG2";
+  y4m_header += " W" + std::to_string(y4m_parameters.width);
+  y4m_header += " H" + std::to_string(y4m_parameters.height);
+  y4m_header += " F" + std::to_string(y4m_parameters.frame_rate_numerator) +
+                ":" + std::to_string(y4m_parameters.frame_rate_denominator);
+  y4m_header += " Ip C" + GetY4mColorSpaceString(y4m_parameters);
+  y4m_header += "\n";
+  return fwrite(y4m_header.c_str(), 1, y4m_header.length(), file_) ==
+         y4m_header.length();
+}
+
+}  // namespace libgav1
diff --git a/examples/file_writer.h b/examples/file_writer.h
new file mode 100644
index 0000000..00f6cc3
--- /dev/null
+++ b/examples/file_writer.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_WRITER_H_
+#define LIBGAV1_EXAMPLES_FILE_WRITER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// Frame based file writer class. Supports only Y4M (YUV4MPEG2) and RAW output.
+class FileWriter {
+ public:
+  enum FileType : uint8_t {
+    kFileTypeRaw,
+    kFileTypeY4m,
+  };
+
+  struct Y4mParameters {
+    Y4mParameters() = default;
+    Y4mParameters(size_t width, size_t height, size_t frame_rate_numerator,
+                  size_t frame_rate_denominator,
+                  ChromaSamplePosition chroma_sample_position,
+                  ImageFormat image_format, size_t bitdepth)
+        : width(width),
+          height(height),
+          frame_rate_numerator(frame_rate_numerator),
+          frame_rate_denominator(frame_rate_denominator),
+          chroma_sample_position(chroma_sample_position),
+          image_format(image_format),
+          bitdepth(bitdepth) {}
+
+    Y4mParameters(const Y4mParameters& rhs) = default;
+    Y4mParameters& operator=(const Y4mParameters& rhs) = default;
+    Y4mParameters(Y4mParameters&& rhs) = default;
+    Y4mParameters& operator=(Y4mParameters&& rhs) = default;
+
+    size_t width = 0;
+    size_t height = 0;
+    size_t frame_rate_numerator = 30;
+    size_t frame_rate_denominator = 1;
+    ChromaSamplePosition chroma_sample_position = kChromaSamplePositionUnknown;
+    ImageFormat image_format = kImageFormatYuv420;
+    size_t bitdepth = 8;
+  };
+
+  // Opens |file_name|. When |file_type| is kFileTypeY4m the Y4M file header is
+  // written out to |file_| before this method returns.
+  //
+  // Returns a FileWriter instance after the file is opened successfully for
+  // kFileTypeRaw files, and after the Y4M file header bytes are written for
+  // kFileTypeY4m files. Returns nullptr upon failure.
+  static std::unique_ptr<FileWriter> Open(const std::string& file_name,
+                                          FileType file_type,
+                                          const Y4mParameters* y4m_parameters);
+
+  FileWriter() = delete;
+  FileWriter(const FileWriter&) = delete;
+  FileWriter& operator=(const FileWriter&) = delete;
+
+  FileWriter(FileWriter&&) = default;
+  FileWriter& operator=(FileWriter&&) = default;
+
+  // Closes |file_|.
+  ~FileWriter();
+
+  // Writes the frame data in |frame_buffer| to |file_|. Returns true after
+  // successful write of |frame_buffer| data.
+  /*LIBGAV1_MUST_USE_RESULT*/ bool WriteFrame(
+      const DecoderBuffer& frame_buffer);
+
+ private:
+  explicit FileWriter(FILE* file) : file_(file) {}
+
+  bool WriteY4mFileHeader(const Y4mParameters& y4m_parameters);
+
+  FILE* file_ = nullptr;
+  FileType file_type_ = kFileTypeRaw;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_WRITER_H_
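
A write-side sketch under assumed parameters (the output path and the 352x288, 8-bit 4:2:0 settings are placeholders; per the header-writing rules above, these values would produce the header "YUV4MPEG2 W352 H288 F30:1 Ip C420jpeg"):

```cpp
#include <memory>

#include "examples/file_writer.h"
#include "gav1/decoder_buffer.h"

// Hypothetical write path; "out.y4m" and the Y4M parameters are placeholders.
bool WriteDecodedFrame(const libgav1::DecoderBuffer& frame) {
  const libgav1::FileWriter::Y4mParameters y4m_parameters(
      /*width=*/352, /*height=*/288, /*frame_rate_numerator=*/30,
      /*frame_rate_denominator=*/1, libgav1::kChromaSamplePositionUnknown,
      libgav1::kImageFormatYuv420, /*bitdepth=*/8);
  // For kFileTypeY4m, Open() writes the Y4M file header before returning.
  auto writer = libgav1::FileWriter::Open(
      "out.y4m", libgav1::FileWriter::kFileTypeY4m, &y4m_parameters);
  if (writer == nullptr) return false;
  return writer->WriteFrame(frame);  // Prepends "FRAME\n" in Y4M mode.
}
```
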
diff --git a/examples/file_writer_test.cc b/examples/file_writer_test.cc
new file mode 100644
index 0000000..df5be17
--- /dev/null
+++ b/examples/file_writer_test.cc
@@ -0,0 +1,496 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_writer.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <ostream>
+#include <string>
+#include <utility>
+
+#include "absl/memory/memory.h"
+#include "gav1/decoder_buffer.h"
+#include "gtest/gtest.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+const char kExpectedY4mHeader8bit[] = "YUV4MPEG2 W352 H288 F30:1 Ip C420jpeg\n";
+const char kExpectedY4mHeader10bit[] = "YUV4MPEG2 W352 H288 F30:1 Ip C420p10\n";
+const char kExpectedY4mHeader8bitMonochrome[] =
+    "YUV4MPEG2 W352 H288 F30:1 Ip Cmono\n";
+const char kExpectedY4mHeader10bitMonochrome[] =
+    "YUV4MPEG2 W352 H288 F30:1 Ip Cmono10\n";
+
+// Note: These are non-const because DecoderBuffer.plane is non-const.
+char fake_plane0[] = "PLANE0\n";
+char fake_plane1[] = "PLANE1\n";
+char fake_plane2[] = "PLANE2\n";
+
+constexpr size_t kExpectedRawDataBufferCount = 3;
+const char* kExpectedRawData[kExpectedRawDataBufferCount] = {
+    fake_plane0, fake_plane1, fake_plane2};
+
+const char* const kExpectedRawDataMonochrome = fake_plane0;
+
+constexpr size_t kExpectedY4mDataBufferCount = 5;
+const char* const kExpectedY4mFileData8bit[kExpectedY4mDataBufferCount] = {
+    kExpectedY4mHeader8bit, "FRAME\n", fake_plane0, fake_plane1, fake_plane2};
+const char* const kExpectedY4mFileData10bit[kExpectedY4mDataBufferCount] = {
+    kExpectedY4mHeader10bit, "FRAME\n", fake_plane0, fake_plane1, fake_plane2};
+
+constexpr size_t kExpectedY4mDataBufferCountMonochrome = 3;
+const char* const
+    kExpectedY4mFileData8bitMonochrome[kExpectedY4mDataBufferCountMonochrome] =
+        {kExpectedY4mHeader8bitMonochrome, "FRAME\n", fake_plane0};
+const char* const
+    kExpectedY4mFileData10bitMonochrome[kExpectedY4mDataBufferCountMonochrome] =
+        {kExpectedY4mHeader10bitMonochrome, "FRAME\n", fake_plane0};
+
+// TODO(tomfinegan): Add a bitdepth arg, and test writing 10 bit frame buffers.
+std::unique_ptr<DecoderBuffer> GetFakeDecoderBuffer(ImageFormat image_format) {
+  auto buffer = absl::WrapUnique(new (std::nothrow) DecoderBuffer);
+  if (buffer == nullptr) return nullptr;
+  buffer->chroma_sample_position = kChromaSamplePositionUnknown;
+  buffer->image_format = image_format;
+  buffer->bitdepth = 8;
+  buffer->displayed_width[0] = static_cast<int>(strlen(fake_plane0));
+  buffer->displayed_width[1] = static_cast<int>(strlen(fake_plane1));
+  buffer->displayed_width[2] = static_cast<int>(strlen(fake_plane2));
+  buffer->displayed_height[0] = 1;
+  buffer->displayed_height[1] = 1;
+  buffer->displayed_height[2] = 1;
+  buffer->stride[0] = static_cast<int>(strlen(fake_plane0));
+  buffer->stride[1] = static_cast<int>(strlen(fake_plane1));
+  buffer->stride[2] = static_cast<int>(strlen(fake_plane2));
+  buffer->plane[0] = reinterpret_cast<uint8_t*>(fake_plane0);
+  buffer->plane[1] = reinterpret_cast<uint8_t*>(fake_plane1);
+  buffer->plane[2] = reinterpret_cast<uint8_t*>(fake_plane2);
+  buffer->user_private_data = 0;
+  buffer->buffer_private_data = nullptr;
+  return buffer;
+}
+
+TEST(FileWriterTest, FailOpen) {
+  EXPECT_EQ(FileWriter::Open(test_utils::GetTestOutputFilePath("fail_open"),
+                             static_cast<FileWriter::FileType>(3), nullptr),
+            nullptr);
+  EXPECT_EQ(FileWriter::Open(test_utils::GetTestOutputFilePath("fail_open"),
+                             FileWriter::kFileTypeY4m, nullptr),
+            nullptr);
+}
+
+struct FileWriterY4mHeaderTestParameters {
+  FileWriterY4mHeaderTestParameters() = default;
+  FileWriterY4mHeaderTestParameters(const FileWriterY4mHeaderTestParameters&) =
+      default;
+  FileWriterY4mHeaderTestParameters& operator=(
+      const FileWriterY4mHeaderTestParameters&) = default;
+  FileWriterY4mHeaderTestParameters(FileWriterY4mHeaderTestParameters&&) =
+      default;
+  FileWriterY4mHeaderTestParameters& operator=(
+      FileWriterY4mHeaderTestParameters&&) = default;
+  ~FileWriterY4mHeaderTestParameters() = default;
+
+  FileWriterY4mHeaderTestParameters(std::string file_name,
+                                    ChromaSamplePosition chroma_sample_position,
+                                    ImageFormat image_format, int bitdepth,
+                                    const char* expected_header_string)
+      : file_name(std::move(file_name)),
+        chroma_sample_position(chroma_sample_position),
+        image_format(image_format),
+        bitdepth(bitdepth),
+        expected_header_string(expected_header_string) {}
+  std::string file_name;
+  ChromaSamplePosition chroma_sample_position = kChromaSamplePositionUnknown;
+  ImageFormat image_format = kImageFormatMonochrome400;
+  int bitdepth = 8;
+  const char* expected_header_string = nullptr;
+};
+
+std::ostream& operator<<(std::ostream& stream,
+                         const FileWriterY4mHeaderTestParameters& parameters) {
+  stream << "file_name=" << parameters.file_name << "\n"
+         << "chroma_sample_position=" << parameters.chroma_sample_position
+         << "\n"
+         << "image_format=" << parameters.image_format << "\n"
+         << "bitdepth=" << parameters.bitdepth << "\n"
+         << "expected_header_string=" << parameters.expected_header_string
+         << "\n";
+  return stream;
+}
+
+class FileWriterY4mHeaderTest
+    : public testing::TestWithParam<FileWriterY4mHeaderTestParameters> {
+ public:
+  FileWriterY4mHeaderTest() {
+    test_parameters_ = GetParam();
+    y4m_parameters_.width = 352;
+    y4m_parameters_.height = 288;
+    y4m_parameters_.frame_rate_numerator = 30;
+    y4m_parameters_.frame_rate_denominator = 1;
+    y4m_parameters_.chroma_sample_position =
+        test_parameters_.chroma_sample_position;
+    y4m_parameters_.image_format = test_parameters_.image_format;
+    y4m_parameters_.bitdepth = test_parameters_.bitdepth;
+  }
+  FileWriterY4mHeaderTest(const FileWriterY4mHeaderTest&) = delete;
+  FileWriterY4mHeaderTest& operator=(const FileWriterY4mHeaderTest&) = delete;
+  ~FileWriterY4mHeaderTest() override = default;
+
+ protected:
+  FileWriterY4mHeaderTestParameters test_parameters_;
+  FileWriter::Y4mParameters y4m_parameters_;
+};
+
+TEST_P(FileWriterY4mHeaderTest, WriteY4mHeader) {
+  const std::string file_name =
+      test_utils::GetTestOutputFilePath(test_parameters_.file_name);
+  EXPECT_NE(
+      FileWriter::Open(file_name, FileWriter::kFileTypeY4m, &y4m_parameters_),
+      nullptr);
+  std::string y4m_header_string;
+  test_utils::GetTestData(test_parameters_.file_name, true, &y4m_header_string);
+  EXPECT_STREQ(y4m_header_string.c_str(),
+               test_parameters_.expected_header_string);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    WriteY4mHeader, FileWriterY4mHeaderTest,
+    testing::Values(
+        FileWriterY4mHeaderTestParameters(
+            "y4m_header_8bit", kChromaSamplePositionUnknown, kImageFormatYuv420,
+            /*bitdepth=*/8, kExpectedY4mHeader8bit),
+        FileWriterY4mHeaderTestParameters("y4m_header_10bit",
+                                          kChromaSamplePositionUnknown,
+                                          kImageFormatYuv420, /*bitdepth=*/10,
+                                          kExpectedY4mHeader10bit),
+        FileWriterY4mHeaderTestParameters("y4m_header_8bit_monochrome",
+                                          kChromaSamplePositionUnknown,
+                                          kImageFormatMonochrome400,
+                                          /*bitdepth=*/8,
+                                          kExpectedY4mHeader8bitMonochrome),
+        FileWriterY4mHeaderTestParameters("y4m_header_10bit_monochrome",
+                                          kChromaSamplePositionUnknown,
+                                          kImageFormatMonochrome400,
+                                          /*bitdepth=*/10,
+                                          kExpectedY4mHeader10bitMonochrome)));
+
+struct FileWriterTestParameters {
+  FileWriterTestParameters() = default;
+  FileWriterTestParameters(const FileWriterTestParameters&) = default;
+  FileWriterTestParameters& operator=(const FileWriterTestParameters&) =
+      default;
+  FileWriterTestParameters(FileWriterTestParameters&&) = default;
+  FileWriterTestParameters& operator=(FileWriterTestParameters&&) = default;
+  ~FileWriterTestParameters() = default;
+
+  FileWriterTestParameters(std::string file_name,
+                           FileWriter::FileType file_type,
+                           const FileWriter::Y4mParameters* y4m_parameters,
+                           size_t num_frames)
+      : file_name(std::move(file_name)),
+        file_type(file_type),
+        y4m_parameters(y4m_parameters),
+        num_frames(num_frames) {}
+  std::string file_name;
+  FileWriter::FileType file_type = FileWriter::kFileTypeRaw;
+  const FileWriter::Y4mParameters* y4m_parameters = nullptr;
+  size_t num_frames = 1;
+};
+
+std::ostream& operator<<(std::ostream& stream,
+                         const ChromaSamplePosition& position) {
+  switch (position) {
+    case kChromaSamplePositionUnknown:
+      stream << "kCromaSamplePositionUnknown";
+      break;
+    case kChromaSamplePositionVertical:
+      stream << "kChromaSamplePositionVertical";
+      break;
+    case kChromaSamplePositionColocated:
+      stream << "kChromaSamplePositionColocated";
+      break;
+    case kChromaSamplePositionReserved:
+      stream << "kChromaSamplePositionReserved";
+      break;
+  }
+  return stream;
+}
+
+std::ostream& operator<<(std::ostream& stream,
+                         const ImageFormat& image_format) {
+  switch (image_format) {
+    case kImageFormatMonochrome400:
+      stream << "kImageFormatMonochrome400";
+      break;
+    case kImageFormatYuv420:
+      stream << "kImageFormatYuv420";
+      break;
+    case kImageFormatYuv422:
+      stream << "kImageFormatYuv422";
+      break;
+    case kImageFormatYuv444:
+      stream << "kImageFormatYuv444";
+      break;
+  }
+  return stream;
+}
+
+std::ostream& operator<<(std::ostream& stream,
+                         const FileWriter::Y4mParameters& parameters) {
+  stream << "y4m_parameters:\n"
+         << "  width=" << parameters.width << "\n"
+         << "  height=" << parameters.height << "\n"
+         << "  frame_rate_numerator=" << parameters.frame_rate_numerator << "\n"
+         << "  frame_rate_denominator=" << parameters.frame_rate_denominator
+         << "\n"
+         << "  chroma_sample_position=" << parameters.chroma_sample_position
+         << "\n"
+         << "  image_format=" << parameters.image_format << "\n"
+         << "  bitdepth=" << parameters.bitdepth << "\n";
+
+  return stream;
+}
+
+std::ostream& operator<<(std::ostream& stream,
+                         const FileWriterTestParameters& parameters) {
+  stream << "file_name=" << parameters.file_name << "\n"
+         << "file_type="
+         << (parameters.file_type == FileWriter::kFileTypeRaw ? "kFileTypeRaw"
+                                                              : "kFileTypeY4m")
+         << "\n";
+  if (parameters.y4m_parameters != nullptr) {
+    stream << *parameters.y4m_parameters;
+  } else {
+    stream << "y4m_parameters: <nullptr>\n";
+  }
+  stream << "num_frames=" << parameters.num_frames << "\n";
+  return stream;
+}
+
+class FileWriterTestBase
+    : public testing::TestWithParam<FileWriterTestParameters> {
+ public:
+  FileWriterTestBase() = default;
+  FileWriterTestBase(const FileWriterTestBase&) = delete;
+  FileWriterTestBase& operator=(const FileWriterTestBase&) = delete;
+  ~FileWriterTestBase() override = default;
+
+ protected:
+  void SetUp() override { OpenWriter(GetParam()); }
+
+  void OpenWriter(const FileWriterTestParameters& parameters) {
+    parameters_ = parameters;
+    file_writer_ = FileWriter::Open(
+        test_utils::GetTestOutputFilePath(parameters.file_name),
+        parameters_.file_type, parameters_.y4m_parameters);
+    ASSERT_NE(file_writer_, nullptr);
+  }
+
+  void WriteFramesAndCloseFile() {
+    if (parameters_.y4m_parameters != nullptr) {
+      image_format_ = parameters_.y4m_parameters->image_format;
+    }
+    decoder_buffer_ = GetFakeDecoderBuffer(image_format_);
+    for (size_t frame_num = 0; frame_num < parameters_.num_frames;
+         ++frame_num) {
+      ASSERT_TRUE(file_writer_->WriteFrame(*decoder_buffer_));
+    }
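+    // Resetting the pointer destroys the writer, which closes the file.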
+    file_writer_ = nullptr;
+  }
+
+  ImageFormat image_format_ = kImageFormatYuv420;
+  FileWriterTestParameters parameters_;
+  std::unique_ptr<FileWriter> file_writer_;
+  std::unique_ptr<DecoderBuffer> decoder_buffer_;
+};
+
+class FileWriterTestRaw : public FileWriterTestBase {
+ public:
+  FileWriterTestRaw() = default;
+  FileWriterTestRaw(const FileWriterTestRaw&) = delete;
+  FileWriterTestRaw& operator=(const FileWriterTestRaw&) = delete;
+  ~FileWriterTestRaw() override = default;
+
+ protected:
+  void SetUp() override { FileWriterTestBase::SetUp(); }
+};
+
+class FileWriterTestY4m : public FileWriterTestBase {
+ public:
+  FileWriterTestY4m() = default;
+  FileWriterTestY4m(const FileWriterTestY4m&) = delete;
+  FileWriterTestY4m& operator=(const FileWriterTestY4m&) = delete;
+  ~FileWriterTestY4m() override = default;
+
+ protected:
+  void SetUp() override { FileWriterTestBase::SetUp(); }
+};
+
+TEST_P(FileWriterTestRaw, WriteRawFrames) {
+  WriteFramesAndCloseFile();
+
+  std::string actual_file_data;
+  test_utils::GetTestData(parameters_.file_name, true, &actual_file_data);
+
+  std::string expected_file_data;
+  for (size_t frame_num = 0; frame_num < parameters_.num_frames; ++frame_num) {
+    if (image_format_ == kImageFormatMonochrome400) {
+      expected_file_data += kExpectedRawDataMonochrome;
+    } else {
+      for (const auto& buffer : kExpectedRawData) {
+        expected_file_data += buffer;
+      }
+    }
+  }
+
+  ASSERT_EQ(actual_file_data, expected_file_data);
+}
+
+TEST_P(FileWriterTestY4m, WriteY4mFrames) {
+  WriteFramesAndCloseFile();
+
+  std::string actual_file_data;
+  test_utils::GetTestData(parameters_.file_name, true, &actual_file_data);
+
+  std::string expected_file_data;
+  for (size_t frame_num = 0; frame_num < parameters_.num_frames; ++frame_num) {
+    if (image_format_ == kImageFormatMonochrome400) {
+      const char* const* expected_data_planes =
+          (parameters_.y4m_parameters->bitdepth == 8)
+              ? kExpectedY4mFileData8bitMonochrome
+              : kExpectedY4mFileData10bitMonochrome;
+      // Skip the Y4M file header "plane" after frame 0.
+      for (size_t buffer_num = (frame_num == 0) ? 0 : 1;
+           buffer_num < kExpectedY4mDataBufferCountMonochrome; ++buffer_num) {
+        expected_file_data += expected_data_planes[buffer_num];
+      }
+    } else {
+      const char* const* expected_data_planes =
+          (parameters_.y4m_parameters->bitdepth == 8)
+              ? kExpectedY4mFileData8bit
+              : kExpectedY4mFileData10bit;
+
+      // Skip the Y4M file header "plane" after frame 0.
+      for (size_t buffer_num = (frame_num == 0) ? 0 : 1;
+           buffer_num < kExpectedY4mDataBufferCount; ++buffer_num) {
+        expected_file_data += expected_data_planes[buffer_num];
+      }
+    }
+  }
+
+  ASSERT_EQ(actual_file_data, expected_file_data);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    WriteRawFrames, FileWriterTestRaw,
+    testing::Values(
+        FileWriterTestParameters("raw_frames_test_1frame",
+                                 FileWriter::kFileTypeRaw,
+                                 /*y4m_parameters=*/nullptr,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("raw_frames_test_5frames",
+                                 FileWriter::kFileTypeRaw,
+                                 /*y4m_parameters=*/nullptr,
+                                 /*num_frames=*/5),
+        FileWriterTestParameters("raw_frames_test_1frame_monochrome",
+                                 FileWriter::kFileTypeRaw,
+                                 /*y4m_parameters=*/nullptr,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("raw_frames_test_5frames_monochrome",
+                                 FileWriter::kFileTypeRaw,
+                                 /*y4m_parameters=*/nullptr,
+                                 /*num_frames=*/5)));
+
+const FileWriter::Y4mParameters kY4mParameters8Bit = {
+    352,  // width
+    288,  // height
+    30,   // frame_rate_numerator
+    1,    // frame_rate_denominator
+    kChromaSamplePositionUnknown,
+    kImageFormatYuv420,
+    8  // bitdepth
+};
+
+const FileWriter::Y4mParameters kY4mParameters10Bit = {
+    352,  // width
+    288,  // height
+    30,   // frame_rate_numerator
+    1,    // frame_rate_denominator
+    kChromaSamplePositionUnknown,
+    kImageFormatYuv420,
+    10  // bitdepth
+};
+
+const FileWriter::Y4mParameters kY4mParameters8BitMonochrome = {
+    352,  // width
+    288,  // height
+    30,   // frame_rate_numerator
+    1,    // frame_rate_denominator
+    kChromaSamplePositionUnknown,
+    kImageFormatMonochrome400,
+    8  // bitdepth
+};
+
+const FileWriter::Y4mParameters kY4mParameters10BitMonochrome = {
+    352,  // width
+    288,  // height
+    30,   // frame_rate_numerator
+    1,    // frame_rate_denominator
+    kChromaSamplePositionUnknown,
+    kImageFormatMonochrome400,
+    10  // bitdepth
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    WriteY4mFrames, FileWriterTestY4m,
+    testing::Values(
+        FileWriterTestParameters("y4m_frames_test_8bit_1frame",
+                                 FileWriter::kFileTypeY4m, &kY4mParameters8Bit,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("y4m_frames_test_8bit_5frames",
+                                 FileWriter::kFileTypeY4m, &kY4mParameters8Bit,
+                                 /*num_frames=*/5),
+        FileWriterTestParameters("y4m_frames_test_10bit_1frame",
+                                 FileWriter::kFileTypeY4m, &kY4mParameters10Bit,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("y4m_frames_test_10bit_5frames",
+                                 FileWriter::kFileTypeY4m, &kY4mParameters10Bit,
+                                 /*num_frames=*/5),
+        FileWriterTestParameters("y4m_frames_test_8bit_1frame_monochrome",
+                                 FileWriter::kFileTypeY4m,
+                                 &kY4mParameters8BitMonochrome,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("y4m_frames_test_8bit_5frames_monochrome",
+                                 FileWriter::kFileTypeY4m,
+                                 &kY4mParameters8BitMonochrome,
+                                 /*num_frames=*/5),
+        FileWriterTestParameters("y4m_frames_test_10bit_1frame_monochrome",
+                                 FileWriter::kFileTypeY4m,
+                                 &kY4mParameters10BitMonochrome,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("y4m_frames_test_10bit_5frames_monochrome",
+                                 FileWriter::kFileTypeY4m,
+                                 &kY4mParameters10BitMonochrome,
+                                 /*num_frames=*/5)));
+
+}  // namespace
+}  // namespace libgav1
diff --git a/examples/gav1_decode.cc b/examples/gav1_decode.cc
new file mode 100644 (file)
index 0000000..1408e8c
--- /dev/null
@@ -0,0 +1,455 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <memory>
+#include <new>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/file_writer.h"
+#include "gav1/decoder.h"
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+#endif
+
+namespace {
+
+struct Options {
+  const char* input_file_name = nullptr;
+  const char* output_file_name = nullptr;
+  const char* frame_timing_file_name = nullptr;
+  libgav1::FileWriter::FileType output_file_type =
+      libgav1::FileWriter::kFileTypeRaw;
+  uint8_t post_filter_mask = 0x1f;
+  int threads = 1;
+  bool frame_parallel = false;
+  bool output_all_layers = false;
+  int operating_point = 0;
+  int limit = 0;
+  int skip = 0;
+  int verbose = 0;
+};
+
+struct Timing {
+  absl::Duration input;
+  absl::Duration dequeue;
+};
+
+struct FrameTiming {
+  absl::Time enqueue;
+  absl::Time dequeue;
+};
+
+void PrintHelp(FILE* const fout) {
+  fprintf(fout,
+          "Usage: gav1_decode [options] <input file>"
+          " [-o <output file>]\n");
+  fprintf(fout, "\n");
+  fprintf(fout, "Options:\n");
+  fprintf(fout, "  -h, --help This help message.\n");
+  fprintf(fout, "  --threads <positive integer> (Default 1).\n");
+  fprintf(fout, "  --frame_parallel.\n");
+  fprintf(fout,
+          "  --limit <integer> Stop decoding after N frames (0 = all).\n");
+  fprintf(fout, "  --skip <integer> Skip initial N frames (Default 0).\n");
+  fprintf(fout, "  --version.\n");
+  fprintf(fout, "  --y4m (Default false).\n");
+  fprintf(fout, "  --raw (Default true).\n");
+  fprintf(fout, "  -v logging verbosity, can be used multiple times.\n");
+  fprintf(fout, "  --all_layers.\n");
+  fprintf(fout,
+          "  --operating_point <integer between 0 and 31> (Default 0).\n");
+  fprintf(fout,
+          "  --frame_timing <file> Output per-frame timing to <file> in tsv"
+          " format.\n   Yields meaningful results only when frame parallel is"
+          " off.\n");
+  fprintf(fout, "\nAdvanced settings:\n");
+  fprintf(fout, "  --post_filter_mask <integer> (Default 0x1f).\n");
+  fprintf(fout,
+          "   Mask indicating which post filters should be applied to the"
+          " reconstructed\n   frame. This may be given as octal, decimal or"
+          " hexadecimal. From LSB:\n");
+  fprintf(fout, "     Bit 0: Loop filter (deblocking filter)\n");
+  fprintf(fout, "     Bit 1: Cdef\n");
+  fprintf(fout, "     Bit 2: SuperRes\n");
+  fprintf(fout, "     Bit 3: Loop Restoration\n");
+  fprintf(fout, "     Bit 4: Film Grain Synthesis\n");
+}
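+
+// Illustrative example (not output by the tool): --post_filter_mask 0x11
+// (binary 10001) applies only the deblocking loop filter (bit 0) and film
+// grain synthesis (bit 4).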
+
+void ParseOptions(int argc, char* argv[], Options* const options) {
+  for (int i = 1; i < argc; ++i) {
+    int32_t value;
+    if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+      PrintHelp(stdout);
+      exit(EXIT_SUCCESS);
+    } else if (strcmp(argv[i], "-o") == 0) {
+      if (++i >= argc) {
+        fprintf(stderr, "Missing argument for '-o'\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->output_file_name = argv[i];
+    } else if (strcmp(argv[i], "--frame_timing") == 0) {
+      if (++i >= argc) {
+        fprintf(stderr, "Missing argument for '--frame_timing'\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->frame_timing_file_name = argv[i];
+    } else if (strcmp(argv[i], "--version") == 0) {
+      printf("gav1_decode, a libgav1 based AV1 decoder\n");
+      printf("libgav1 %s\n", libgav1::GetVersionString());
+      printf("max bitdepth: %d\n", libgav1::Decoder::GetMaxBitdepth());
+      printf("build configuration: %s\n", libgav1::GetBuildConfiguration());
+      exit(EXIT_SUCCESS);
+    } else if (strcmp(argv[i], "-v") == 0) {
+      ++options->verbose;
+    } else if (strcmp(argv[i], "--raw") == 0) {
+      options->output_file_type = libgav1::FileWriter::kFileTypeRaw;
+    } else if (strcmp(argv[i], "--y4m") == 0) {
+      options->output_file_type = libgav1::FileWriter::kFileTypeY4m;
+    } else if (strcmp(argv[i], "--threads") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value)) {
+        fprintf(stderr, "Missing/Invalid value for --threads.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->threads = value;
+    } else if (strcmp(argv[i], "--frame_parallel") == 0) {
+      options->frame_parallel = true;
+    } else if (strcmp(argv[i], "--all_layers") == 0) {
+      options->output_all_layers = true;
+    } else if (strcmp(argv[i], "--operating_point") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0 ||
+          value >= 32) {
+        fprintf(stderr, "Missing/Invalid value for --operating_point.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->operating_point = value;
+    } else if (strcmp(argv[i], "--limit") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+        fprintf(stderr, "Missing/Invalid value for --limit.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->limit = value;
+    } else if (strcmp(argv[i], "--skip") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+        fprintf(stderr, "Missing/Invalid value for --skip.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->skip = value;
+    } else if (strcmp(argv[i], "--post_filter_mask") == 0) {
+      errno = 0;
+      char* endptr = nullptr;
+      value = (++i >= argc) ? -1
+                            // NOLINTNEXTLINE(runtime/deprecated_fn)
+                            : static_cast<int32_t>(strtol(argv[i], &endptr, 0));
+      // Only the last 5 bits of the mask can be set.
+      if ((value & ~31) != 0 || errno != 0 || endptr == argv[i]) {
+        fprintf(stderr, "Invalid value for --post_filter_mask.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->post_filter_mask = value;
+    } else if (strlen(argv[i]) > 1 && argv[i][0] == '-') {
+      fprintf(stderr, "Unknown option '%s'!\n", argv[i]);
+      exit(EXIT_FAILURE);
+    } else {
+      if (options->input_file_name == nullptr) {
+        options->input_file_name = argv[i];
+      } else {
+        fprintf(stderr, "Found invalid parameter: \"%s\".\n", argv[i]);
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+    }
+  }
+
+  if (argc < 2 || options->input_file_name == nullptr) {
+    fprintf(stderr, "Input file is required!\n");
+    PrintHelp(stderr);
+    exit(EXIT_FAILURE);
+  }
+}
+
+using InputBuffer = std::vector<uint8_t>;
+
+class InputBuffers {
+ public:
+  ~InputBuffers() {
+    for (auto buffer : free_buffers_) {
+      delete buffer;
+    }
+  }
+  InputBuffer* GetFreeBuffer() {
+    if (free_buffers_.empty()) {
+      auto* const buffer = new (std::nothrow) InputBuffer();
+      if (buffer == nullptr) {
+        fprintf(stderr, "Failed to create input buffer.\n");
+        return nullptr;
+      }
+      free_buffers_.push_back(buffer);
+    }
+    InputBuffer* const buffer = free_buffers_.front();
+    free_buffers_.pop_front();
+    return buffer;
+  }
+
+  void ReleaseInputBuffer(InputBuffer* buffer) {
+    free_buffers_.push_back(buffer);
+  }
+
+ private:
+  std::deque<InputBuffer*> free_buffers_;
+};
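+
+// Typical flow: main() borrows a buffer with GetFreeBuffer(), hands it to
+// Decoder::EnqueueFrame() as buffer_private_data, and the ReleaseInputBuffer()
+// callback below returns it to the free list once the decoder is done with it.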
+
+void ReleaseInputBuffer(void* callback_private_data,
+                        void* buffer_private_data) {
+  auto* const input_buffers = static_cast<InputBuffers*>(callback_private_data);
+  input_buffers->ReleaseInputBuffer(
+      static_cast<InputBuffer*>(buffer_private_data));
+}
+
+int CloseFile(FILE* stream) { return (stream == nullptr) ? 0 : fclose(stream); }
+
+}  // namespace
+
+int main(int argc, char* argv[]) {
+  Options options;
+  ParseOptions(argc, argv, &options);
+
+  auto file_reader =
+      libgav1::FileReaderFactory::OpenReader(options.input_file_name);
+  if (file_reader == nullptr) {
+    fprintf(stderr, "Cannot open input file!\n");
+    return EXIT_FAILURE;
+  }
+
+  std::unique_ptr<FILE, decltype(&CloseFile)> frame_timing_file(nullptr,
+                                                                &CloseFile);
+  if (options.frame_timing_file_name != nullptr) {
+    frame_timing_file.reset(fopen(options.frame_timing_file_name, "wb"));
+    if (frame_timing_file == nullptr) {
+      fprintf(stderr, "Cannot open frame timing file '%s'!\n",
+              options.frame_timing_file_name);
+      return EXIT_FAILURE;
+    }
+  }
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+  // Reference frames + 1 scratch frame (for either the current frame or the
+  // film grain frame).
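+  // (AV1 maintains up to 8 reference frame buffers, hence the 8 below.)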
+  constexpr int kNumBuffers = 8 + 1;
+  std::unique_ptr<Gav1DecodeCVPixelBufferPool> cv_pixel_buffers =
+      Gav1DecodeCVPixelBufferPool::Create(kNumBuffers);
+  if (cv_pixel_buffers == nullptr) {
+    fprintf(stderr, "Cannot create Gav1DecodeCVPixelBufferPool!\n");
+    return EXIT_FAILURE;
+  }
+#endif
+
+  InputBuffers input_buffers;
+  libgav1::Decoder decoder;
+  libgav1::DecoderSettings settings;
+  settings.post_filter_mask = options.post_filter_mask;
+  settings.threads = options.threads;
+  settings.frame_parallel = options.frame_parallel;
+  settings.output_all_layers = options.output_all_layers;
+  settings.operating_point = options.operating_point;
+  settings.blocking_dequeue = true;
+  settings.callback_private_data = &input_buffers;
+  settings.release_input_buffer = ReleaseInputBuffer;
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+  settings.on_frame_buffer_size_changed = Gav1DecodeOnCVPixelBufferSizeChanged;
+  settings.get_frame_buffer = Gav1DecodeGetCVPixelBuffer;
+  settings.release_frame_buffer = Gav1DecodeReleaseCVPixelBuffer;
+  settings.callback_private_data = cv_pixel_buffers.get();
+  settings.release_input_buffer = nullptr;
+  // TODO(vigneshv): Support frame parallel mode to be used with
+  // CVPixelBufferPool.
+  settings.frame_parallel = false;
+#endif
+  libgav1::StatusCode status = decoder.Init(&settings);
+  if (status != libgav1::kStatusOk) {
+    fprintf(stderr, "Error initializing decoder: %s\n",
+            libgav1::GetErrorString(status));
+    return EXIT_FAILURE;
+  }
+
+  fprintf(stderr, "decoding '%s'\n", options.input_file_name);
+  if (options.verbose > 0 && options.skip > 0) {
+    fprintf(stderr, "skipping %d frame(s).\n", options.skip);
+  }
+
+  int input_frames = 0;
+  int decoded_frames = 0;
+  Timing timing = {};
+  std::vector<FrameTiming> frame_timing;
+  const bool record_frame_timing = frame_timing_file != nullptr;
+  std::unique_ptr<libgav1::FileWriter> file_writer;
+  InputBuffer* input_buffer = nullptr;
+  bool limit_reached = false;
+  bool dequeue_finished = false;
+  const absl::Time decode_loop_start = absl::Now();
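+  // Main decode loop: alternately enqueue temporal units and dequeue decoded
+  // frames until the input is exhausted (or the frame limit is reached) and
+  // the decoder has nothing left to output.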
+  do {
+    if (input_buffer == nullptr && !file_reader->IsEndOfFile() &&
+        !limit_reached) {
+      input_buffer = input_buffers.GetFreeBuffer();
+      if (input_buffer == nullptr) return EXIT_FAILURE;
+      const absl::Time read_start = absl::Now();
+      if (!file_reader->ReadTemporalUnit(input_buffer,
+                                         /*timestamp=*/nullptr)) {
+        fprintf(stderr, "Error reading input file.\n");
+        return EXIT_FAILURE;
+      }
+      timing.input += absl::Now() - read_start;
+    }
+
+    if (++input_frames <= options.skip) {
+      input_buffers.ReleaseInputBuffer(input_buffer);
+      input_buffer = nullptr;
+      continue;
+    }
+
+    if (input_buffer != nullptr) {
+      if (input_buffer->empty()) {
+        input_buffers.ReleaseInputBuffer(input_buffer);
+        input_buffer = nullptr;
+        continue;
+      }
+
+      const absl::Time enqueue_start = absl::Now();
+      status = decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+                                    static_cast<int64_t>(frame_timing.size()),
+                                    /*buffer_private_data=*/input_buffer);
+      if (status == libgav1::kStatusOk) {
+        if (options.verbose > 1) {
+          fprintf(stderr, "enqueue frame (length %zu)\n", input_buffer->size());
+        }
+        if (record_frame_timing) {
+          FrameTiming enqueue_time = {enqueue_start, absl::UnixEpoch()};
+          frame_timing.emplace_back(enqueue_time);
+        }
+
+        input_buffer = nullptr;
+        // Continue to enqueue frames until we get a kStatusTryAgain status.
+        continue;
+      }
+      if (status != libgav1::kStatusTryAgain) {
+        fprintf(stderr, "Unable to enqueue frame: %s\n",
+                libgav1::GetErrorString(status));
+        return EXIT_FAILURE;
+      }
+    }
+
+    const libgav1::DecoderBuffer* buffer;
+    status = decoder.DequeueFrame(&buffer);
+    if (status == libgav1::kStatusNothingToDequeue) {
+      dequeue_finished = true;
+      continue;
+    }
+    if (status != libgav1::kStatusOk) {
+      fprintf(stderr, "Unable to dequeue frame: %s\n",
+              libgav1::GetErrorString(status));
+      return EXIT_FAILURE;
+    }
+    dequeue_finished = false;
+    if (buffer == nullptr) continue;
+    ++decoded_frames;
+    if (options.verbose > 1) {
+      fprintf(stderr, "buffer dequeued\n");
+    }
+
+    if (record_frame_timing) {
+      frame_timing[static_cast<int>(buffer->user_private_data)].dequeue =
+          absl::Now();
+    }
+
+    if (options.output_file_name != nullptr && file_writer == nullptr) {
+      libgav1::FileWriter::Y4mParameters y4m_parameters;
+      y4m_parameters.width = buffer->displayed_width[0];
+      y4m_parameters.height = buffer->displayed_height[0];
+      y4m_parameters.frame_rate_numerator = file_reader->frame_rate();
+      y4m_parameters.frame_rate_denominator = file_reader->time_scale();
+      y4m_parameters.chroma_sample_position = buffer->chroma_sample_position;
+      y4m_parameters.image_format = buffer->image_format;
+      y4m_parameters.bitdepth = static_cast<size_t>(buffer->bitdepth);
+      file_writer = libgav1::FileWriter::Open(
+          options.output_file_name, options.output_file_type, &y4m_parameters);
+      if (file_writer == nullptr) {
+        fprintf(stderr, "Cannot open output file!\n");
+        return EXIT_FAILURE;
+      }
+    }
+
+    if (!limit_reached && file_writer != nullptr &&
+        !file_writer->WriteFrame(*buffer)) {
+      fprintf(stderr, "Error writing output file.\n");
+      return EXIT_FAILURE;
+    }
+    if (options.limit > 0 && options.limit == decoded_frames) {
+      limit_reached = true;
+      if (input_buffer != nullptr) {
+        input_buffers.ReleaseInputBuffer(input_buffer);
+      }
+      input_buffer = nullptr;
+      // Clear any in progress frames to ensure the output frame limit is
+      // respected.
+      decoder.SignalEOS();
+    }
+  } while (input_buffer != nullptr ||
+           (!file_reader->IsEndOfFile() && !limit_reached) ||
+           !dequeue_finished);
+  timing.dequeue = absl::Now() - decode_loop_start - timing.input;
+
+  if (record_frame_timing) {
+    // Note: in frame parallel mode these timings are skewed by the time spent
+    // queueing additional frames and waiting in the output queue behind
+    // previous frames, so the values reported won't be that meaningful.
+    fprintf(frame_timing_file.get(), "frame number\tdecode time us\n");
+    for (size_t i = 0; i < frame_timing.size(); ++i) {
+      const int decode_time_us = static_cast<int>(absl::ToInt64Microseconds(
+          frame_timing[i].dequeue - frame_timing[i].enqueue));
+      fprintf(frame_timing_file.get(), "%zu\t%d\n", i, decode_time_us);
+    }
+  }
+
+  if (options.verbose > 0) {
+    fprintf(stderr, "time to read input: %d us\n",
+            static_cast<int>(absl::ToInt64Microseconds(timing.input)));
+    const int decode_time_us =
+        static_cast<int>(absl::ToInt64Microseconds(timing.dequeue));
+    const double decode_fps =
+        (decode_time_us == 0) ? 0.0 : 1.0e6 * decoded_frames / decode_time_us;
+    fprintf(stderr, "time to decode input: %d us (%d frames, %.2f fps)\n",
+            decode_time_us, decoded_frames, decode_fps);
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/examples/gav1_decode_cv_pixel_buffer_pool.cc b/examples/gav1_decode_cv_pixel_buffer_pool.cc
new file mode 100644 (file)
index 0000000..6aa4e61
--- /dev/null
@@ -0,0 +1,278 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+namespace {
+
+struct CFTypeDeleter {
+  void operator()(CFTypeRef cf) const { CFRelease(cf); }
+};
+
+using UniqueCFNumberRef =
+    std::unique_ptr<std::remove_pointer<CFNumberRef>::type, CFTypeDeleter>;
+
+using UniqueCFDictionaryRef =
+    std::unique_ptr<std::remove_pointer<CFDictionaryRef>::type, CFTypeDeleter>;
+
+}  // namespace
+
+extern "C" {
+
+libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  return buffer_pool->OnCVPixelBufferSizeChanged(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment);
+}
+
+libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment,
+    libgav1::FrameBuffer* frame_buffer) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  return buffer_pool->GetCVPixelBuffer(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+                                    void* buffer_private_data) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  buffer_pool->ReleaseCVPixelBuffer(buffer_private_data);
+}
+
+}  // extern "C"
+
+// static
+std::unique_ptr<Gav1DecodeCVPixelBufferPool>
+Gav1DecodeCVPixelBufferPool::Create(size_t num_buffers) {
+  std::unique_ptr<Gav1DecodeCVPixelBufferPool> buffer_pool(
+      new (std::nothrow) Gav1DecodeCVPixelBufferPool(num_buffers));
+  return buffer_pool;
+}
+
+Gav1DecodeCVPixelBufferPool::Gav1DecodeCVPixelBufferPool(size_t num_buffers)
+    : num_buffers_(static_cast<int>(num_buffers)) {}
+
+Gav1DecodeCVPixelBufferPool::~Gav1DecodeCVPixelBufferPool() {
+  CVPixelBufferPoolRelease(pool_);
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::OnCVPixelBufferSizeChanged(
+    int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+    int left_border, int right_border, int top_border, int bottom_border,
+    int stride_alignment) {
+  if (bitdepth != 8 || (image_format != libgav1::kImageFormatYuv420 &&
+                        image_format != libgav1::kImageFormatMonochrome400)) {
+    fprintf(stderr,
+            "Only bitdepth 8, 4:2:0 videos are supported: bitdepth %d, "
+            "image_format: %d.\n",
+            bitdepth, image_format);
+    return libgav1::kStatusUnimplemented;
+  }
+
+  // stride_alignment must be a power of 2.
+  assert((stride_alignment & (stride_alignment - 1)) == 0);
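+  // For nonzero x, (x & (x - 1)) == 0 exactly when x is a power of two,
+  // e.g. 16 & 15 == 0 but 24 & 23 != 0.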
+
+  // The possible keys for CVPixelBufferPool are:
+  //   kCVPixelBufferPoolMinimumBufferCountKey
+  //   kCVPixelBufferPoolMaximumBufferAgeKey
+  //   kCVPixelBufferPoolAllocationThresholdKey
+  const void* pool_keys[] = {kCVPixelBufferPoolMinimumBufferCountKey};
+  const int min_buffer_count = 10;
+  UniqueCFNumberRef cf_min_buffer_count(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &min_buffer_count));
+  if (cf_min_buffer_count == nullptr) {
+    fprintf(stderr, "CFNumberCreate failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+  const void* pool_values[] = {cf_min_buffer_count.get()};
+  UniqueCFDictionaryRef pool_attributes(CFDictionaryCreate(
+      nullptr, pool_keys, pool_values, 1, &kCFTypeDictionaryKeyCallBacks,
+      &kCFTypeDictionaryValueCallBacks));
+  if (pool_attributes == nullptr) {
+    fprintf(stderr, "CFDictionaryCreate failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+
+  // The pixelBufferAttributes argument to CVPixelBufferPoolCreate() cannot be
+  // null and must contain the pixel format, width, and height, otherwise
+  // CVPixelBufferPoolCreate() fails with kCVReturnInvalidPixelBufferAttributes
+  // (-6682).
+
+  // I420: kCVPixelFormatType_420YpCbCr8PlanarFullRange (full range);
+  // monochrome: kCVPixelFormatType_OneComponent8.
+  const int pixel_format = (image_format == libgav1::kImageFormatYuv420)
+                               ? kCVPixelFormatType_420YpCbCr8PlanarFullRange
+                               : kCVPixelFormatType_OneComponent8;
+  UniqueCFNumberRef cf_pixel_format(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &pixel_format));
+  UniqueCFNumberRef cf_width(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &width));
+  UniqueCFNumberRef cf_height(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &height));
+  UniqueCFNumberRef cf_left_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &left_border));
+  UniqueCFNumberRef cf_right_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &right_border));
+  UniqueCFNumberRef cf_top_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &top_border));
+  UniqueCFNumberRef cf_bottom_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &bottom_border));
+  UniqueCFNumberRef cf_stride_alignment(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &stride_alignment));
+
+  const void* buffer_keys[] = {
+      kCVPixelBufferPixelFormatTypeKey,
+      kCVPixelBufferWidthKey,
+      kCVPixelBufferHeightKey,
+      kCVPixelBufferExtendedPixelsLeftKey,
+      kCVPixelBufferExtendedPixelsRightKey,
+      kCVPixelBufferExtendedPixelsTopKey,
+      kCVPixelBufferExtendedPixelsBottomKey,
+      kCVPixelBufferBytesPerRowAlignmentKey,
+  };
+  const void* buffer_values[] = {
+      cf_pixel_format.get(),  cf_width.get(),
+      cf_height.get(),        cf_left_border.get(),
+      cf_right_border.get(),  cf_top_border.get(),
+      cf_bottom_border.get(), cf_stride_alignment.get(),
+  };
+  UniqueCFDictionaryRef buffer_attributes(CFDictionaryCreate(
+      kCFAllocatorDefault, buffer_keys, buffer_values, 8,
+      &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+  if (buffer_attributes == nullptr) {
+    fprintf(stderr, "CFDictionaryCreate of buffer_attributes failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+  CVPixelBufferPoolRef cv_pool;
+  CVReturn ret = CVPixelBufferPoolCreate(
+      /*allocator=*/nullptr, pool_attributes.get(), buffer_attributes.get(),
+      &cv_pool);
+  if (ret != kCVReturnSuccess) {
+    fprintf(stderr, "CVPixelBufferPoolCreate failed: %d.\n",
+            static_cast<int>(ret));
+    return libgav1::kStatusOutOfMemory;
+  }
+  CVPixelBufferPoolRelease(pool_);
+  pool_ = cv_pool;
+  return libgav1::kStatusOk;
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::GetCVPixelBuffer(
+    int bitdepth, libgav1::ImageFormat image_format, int /*width*/,
+    int /*height*/, int /*left_border*/, int /*right_border*/,
+    int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/,
+    libgav1::FrameBuffer* frame_buffer) {
+  static_cast<void>(bitdepth);
+  assert(bitdepth == 8 && (image_format == libgav1::kImageFormatYuv420 ||
+                           image_format == libgav1::kImageFormatMonochrome400));
+  const bool is_monochrome =
+      (image_format == libgav1::kImageFormatMonochrome400);
+
+  // The dictionary must have kCVPixelBufferPoolAllocationThresholdKey,
+  // otherwise CVPixelBufferPoolCreatePixelBufferWithAuxAttributes() fails with
+  // kCVReturnWouldExceedAllocationThreshold (-6689).
+  UniqueCFNumberRef cf_num_buffers(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &num_buffers_));
+
+  const void* buffer_keys[] = {
+      kCVPixelBufferPoolAllocationThresholdKey,
+  };
+  const void* buffer_values[] = {
+      cf_num_buffers.get(),
+  };
+  UniqueCFDictionaryRef aux_attributes(CFDictionaryCreate(
+      kCFAllocatorDefault, buffer_keys, buffer_values, 1,
+      &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+  if (aux_attributes == nullptr) {
+    fprintf(stderr, "CFDictionaryCreate of aux_attributes failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+
+  CVPixelBufferRef pixel_buffer;
+  CVReturn ret = CVPixelBufferPoolCreatePixelBufferWithAuxAttributes(
+      /*allocator=*/nullptr, pool_, aux_attributes.get(), &pixel_buffer);
+  if (ret != kCVReturnSuccess) {
+    fprintf(stderr,
+            "CVPixelBufferPoolCreatePixelBufferWithAuxAttributes failed: %d.\n",
+            static_cast<int>(ret));
+    return libgav1::kStatusOutOfMemory;
+  }
+
+  ret = CVPixelBufferLockBaseAddress(pixel_buffer, /*lockFlags=*/0);
+  if (ret != kCVReturnSuccess) {
+    fprintf(stderr, "CVPixelBufferLockBaseAddress failed: %d.\n",
+            static_cast<int>(ret));
+    CFRelease(pixel_buffer);
+    return libgav1::kStatusUnknownError;
+  }
+
+  // If the pixel format type is kCVPixelFormatType_OneComponent8, the pixel
+  // buffer is nonplanar (CVPixelBufferIsPlanar returns false and
+  // CVPixelBufferGetPlaneCount returns 0), but
+  // CVPixelBufferGetBytesPerRowOfPlane and CVPixelBufferGetBaseAddressOfPlane
+  // still work for plane index 0, even though the documentation says they
+  // return NULL for nonplanar pixel buffers.
+  frame_buffer->stride[0] =
+      static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 0));
+  frame_buffer->plane[0] = static_cast<uint8_t*>(
+      CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 0));
+  if (is_monochrome) {
+    frame_buffer->stride[1] = 0;
+    frame_buffer->stride[2] = 0;
+    frame_buffer->plane[1] = nullptr;
+    frame_buffer->plane[2] = nullptr;
+  } else {
+    frame_buffer->stride[1] =
+        static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 1));
+    frame_buffer->stride[2] =
+        static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 2));
+    frame_buffer->plane[1] = static_cast<uint8_t*>(
+        CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 1));
+    frame_buffer->plane[2] = static_cast<uint8_t*>(
+        CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 2));
+  }
+  frame_buffer->private_data = pixel_buffer;
+
+  return libgav1::kStatusOk;
+}
+
+void Gav1DecodeCVPixelBufferPool::ReleaseCVPixelBuffer(
+    void* buffer_private_data) {
+  auto const pixel_buffer = static_cast<CVPixelBufferRef>(buffer_private_data);
+  CVReturn ret =
+      CVPixelBufferUnlockBaseAddress(pixel_buffer, /*unlockFlags=*/0);
+  if (ret != kCVReturnSuccess) {
+    fprintf(stderr, "%s:%d: CVPixelBufferUnlockBaseAddress failed: %d.\n",
+            __FILE__, __LINE__, static_cast<int>(ret));
+    abort();
+  }
+  CFRelease(pixel_buffer);
+}
diff --git a/examples/gav1_decode_cv_pixel_buffer_pool.h b/examples/gav1_decode_cv_pixel_buffer_pool.h
new file mode 100644 (file)
index 0000000..7aee324
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+#define LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+
+#include <CoreVideo/CoreVideo.h>
+
+#include <cstddef>
+#include <memory>
+
+#include "gav1/frame_buffer.h"
+
+extern "C" libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment);
+
+extern "C" libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment,
+    libgav1::FrameBuffer* frame_buffer);
+
+extern "C" void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+                                               void* buffer_private_data);
+
+class Gav1DecodeCVPixelBufferPool {
+ public:
+  static std::unique_ptr<Gav1DecodeCVPixelBufferPool> Create(
+      size_t num_buffers);
+
+  // Not copyable or movable.
+  Gav1DecodeCVPixelBufferPool(const Gav1DecodeCVPixelBufferPool&) = delete;
+  Gav1DecodeCVPixelBufferPool& operator=(const Gav1DecodeCVPixelBufferPool&) =
+      delete;
+
+  ~Gav1DecodeCVPixelBufferPool();
+
+  libgav1::StatusCode OnCVPixelBufferSizeChanged(
+      int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+      int left_border, int right_border, int top_border, int bottom_border,
+      int stride_alignment);
+
+  libgav1::StatusCode GetCVPixelBuffer(int bitdepth,
+                                       libgav1::ImageFormat image_format,
+                                       int width, int height, int left_border,
+                                       int right_border, int top_border,
+                                       int bottom_border, int stride_alignment,
+                                       libgav1::FrameBuffer* frame_buffer);
+  void ReleaseCVPixelBuffer(void* buffer_private_data);
+
+ private:
+  explicit Gav1DecodeCVPixelBufferPool(size_t num_buffers);
+
+  CVPixelBufferPoolRef pool_ = nullptr;
+  const int num_buffers_;
+};
+
+#endif  // LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
diff --git a/examples/ivf_parser.cc b/examples/ivf_parser.cc
new file mode 100644 (file)
index 0000000..f8adb14
--- /dev/null
@@ -0,0 +1,96 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/ivf_parser.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include "examples/file_reader_constants.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+size_t ReadLittleEndian16(const uint8_t* const buffer) {
+  size_t value = buffer[1] << 8;
+  value |= buffer[0];
+  return value;
+}
+
+size_t ReadLittleEndian32(const uint8_t* const buffer) {
+  size_t value = buffer[3] << 24;
+  value |= buffer[2] << 16;
+  value |= buffer[1] << 8;
+  value |= buffer[0];
+  return value;
+}
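+
+// For example, bytes {0x34, 0x12} decode to 0x1234, and bytes
+// {0x78, 0x56, 0x34, 0x12} decode to 0x12345678.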
+
+}  // namespace
+
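+// Layout of the 32-byte IVF file header (all multi-byte fields little
+// endian):
+//   bytes 0-3   signature "DKIF"
+//   bytes 4-5   version (expected to be kIvfHeaderVersion)
+//   bytes 6-7   header size (expected to be kIvfFileHeaderSize, i.e. 32)
+//   bytes 8-11  codec FourCC ("AV01" or "av01")
+//   bytes 12-13 width, bytes 14-15 height
+//   bytes 16-19 frame rate numerator, bytes 20-23 frame rate denominator
+//   bytes 24-27 frame count, bytes 28-31 unused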
+bool ParseIvfFileHeader(const uint8_t* const header_buffer,
+                        IvfFileHeader* const ivf_file_header) {
+  if (header_buffer == nullptr || ivf_file_header == nullptr) return false;
+
+  if (memcmp(kIvfSignature, header_buffer, 4) != 0) {
+    return false;
+  }
+
+  // Verify header version and length.
+  const size_t ivf_header_version = ReadLittleEndian16(&header_buffer[4]);
+  if (ivf_header_version != kIvfHeaderVersion) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unexpected IVF version");
+  }
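+  // Note that a version mismatch is only logged, not treated as fatal.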
+
+  const size_t ivf_header_size = ReadLittleEndian16(&header_buffer[6]);
+  if (ivf_header_size != kIvfFileHeaderSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Invalid IVF file header size");
+    return false;
+  }
+
+  if (memcmp(kAv1FourCcLower, &header_buffer[8], 4) != 0 &&
+      memcmp(kAv1FourCcUpper, &header_buffer[8], 4) != 0) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported codec 4CC");
+    return false;
+  }
+
+  ivf_file_header->width = ReadLittleEndian16(&header_buffer[12]);
+  ivf_file_header->height = ReadLittleEndian16(&header_buffer[14]);
+  ivf_file_header->frame_rate_numerator =
+      ReadLittleEndian32(&header_buffer[16]);
+  ivf_file_header->frame_rate_denominator =
+      ReadLittleEndian32(&header_buffer[20]);
+
+  return true;
+}
+
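+// Layout of the 12-byte IVF frame header: bytes 0-3 hold the frame size
+// (little endian) and bytes 4-11 hold a 64-bit timestamp.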
+bool ParseIvfFrameHeader(const uint8_t* const header_buffer,
+                         IvfFrameHeader* const ivf_frame_header) {
+  if (header_buffer == nullptr || ivf_frame_header == nullptr) return false;
+
+  ivf_frame_header->frame_size = ReadLittleEndian32(header_buffer);
+  if (ivf_frame_header->frame_size > kMaxTemporalUnitSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Temporal Unit size exceeds maximum");
+    return false;
+  }
+
+  ivf_frame_header->timestamp = ReadLittleEndian32(&header_buffer[4]);
+  const uint64_t timestamp_hi =
+      static_cast<uint64_t>(ReadLittleEndian32(&header_buffer[8])) << 32;
+  ivf_frame_header->timestamp |= timestamp_hi;
+
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/examples/ivf_parser.h b/examples/ivf_parser.h
new file mode 100644 (file)
index 0000000..b6bbc59
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_IVF_PARSER_H_
+#define LIBGAV1_EXAMPLES_IVF_PARSER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+
+struct IvfFileHeader {
+  IvfFileHeader() = default;
+  IvfFileHeader(const IvfFileHeader& rhs) = default;
+  IvfFileHeader& operator=(const IvfFileHeader& rhs) = default;
+  IvfFileHeader(IvfFileHeader&& rhs) = default;
+  IvfFileHeader& operator=(IvfFileHeader&& rhs) = default;
+
+  size_t width = 0;
+  size_t height = 0;
+  size_t frame_rate_numerator = 0;
+  size_t frame_rate_denominator = 0;
+};
+
+struct IvfFrameHeader {
+  IvfFrameHeader() = default;
+  IvfFrameHeader(const IvfFrameHeader& rhs) = default;
+  IvfFrameHeader& operator=(const IvfFrameHeader& rhs) = default;
+  IvfFrameHeader(IvfFrameHeader&& rhs) = default;
+  IvfFrameHeader& operator=(IvfFrameHeader&& rhs) = default;
+
+  size_t frame_size = 0;
+  int64_t timestamp = 0;
+};
+
+bool ParseIvfFileHeader(const uint8_t* header_buffer,
+                        IvfFileHeader* ivf_file_header);
+
+bool ParseIvfFrameHeader(const uint8_t* header_buffer,
+                         IvfFrameHeader* ivf_frame_header);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_IVF_PARSER_H_
diff --git a/examples/libgav1_examples.cmake b/examples/libgav1_examples.cmake
new file mode 100644 (file)
index 0000000..a3ec156
--- /dev/null
@@ -0,0 +1,70 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_)
+  return()
+endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_
+set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1)
+
+if(NOT LIBGAV1_ENABLE_EXAMPLES)
+  macro(libgav1_add_examples_targets)
+
+  endmacro()
+  return()
+endif()
+
+set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc"
+                                "${libgav1_examples}/file_reader.h"
+                                "${libgav1_examples}/file_reader_constants.cc"
+                                "${libgav1_examples}/file_reader_constants.h"
+                                "${libgav1_examples}/file_reader_factory.cc"
+                                "${libgav1_examples}/file_reader_factory.h"
+                                "${libgav1_examples}/file_reader_interface.h"
+                                "${libgav1_examples}/ivf_parser.cc"
+                                "${libgav1_examples}/ivf_parser.h"
+                                "${libgav1_examples}/logging.h")
+
+set(libgav1_file_writer_sources "${libgav1_examples}/file_writer.cc"
+                                "${libgav1_examples}/file_writer.h"
+                                "${libgav1_examples}/logging.h")
+
+set(libgav1_decode_sources "${libgav1_examples}/gav1_decode.cc")
+
+macro(libgav1_add_examples_targets)
+  libgav1_add_library(NAME libgav1_file_reader TYPE OBJECT SOURCES
+                      ${libgav1_file_reader_sources} DEFINES ${libgav1_defines}
+                      INCLUDES ${libgav1_include_paths})
+
+  libgav1_add_library(NAME libgav1_file_writer TYPE OBJECT SOURCES
+                      ${libgav1_file_writer_sources} DEFINES ${libgav1_defines}
+                      INCLUDES ${libgav1_include_paths})
+
+  libgav1_add_executable(NAME
+                         gav1_decode
+                         SOURCES
+                         ${libgav1_decode_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_include_paths}
+                         ${libgav1_gtest_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_file_reader
+                         libgav1_file_writer
+                         LIB_DEPS
+                         absl::strings
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_dependency})
+endmacro()
diff --git a/examples/logging.h b/examples/logging.h
new file mode 100644 (file)
index 0000000..cf5a09f
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_LOGGING_H_
+#define LIBGAV1_EXAMPLES_LOGGING_H_
+
+#include <cstddef>
+#include <cstdio>
+
+namespace libgav1 {
+namespace examples {
+
+#if !defined(LIBGAV1_EXAMPLES_ENABLE_LOGGING)
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+  return (offset == 0 || file_name[offset - 1] == '/' ||
+          file_name[offset - 1] == '\\')
+             ? file_name + offset
+             : Basename(file_name, offset - 1);
+}
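+
+// For example, Basename("foo/bar/baz.cc", 14) evaluates to "baz.cc" at
+// compile time; callers pass sizeof(__FILE__) - 1 as the offset, as in the
+// macro below.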
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string)                              \
+  do {                                                                        \
+    constexpr const char* libgav1_examples_basename =                         \
+        libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1);          \
+    fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \
+            __func__, error_string);                                          \
+  } while (false)
+
+#else  // !LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+  do {                                           \
+  } while (false)
+
+#endif  // LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+}  // namespace examples
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_LOGGING_H_
diff --git a/src/buffer_pool.cc b/src/buffer_pool.cc
new file mode 100644 (file)
index 0000000..582f13c
--- /dev/null
@@ -0,0 +1,214 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/buffer_pool.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+namespace {
+
+// Copies the feature_enabled, feature_data, segment_id_pre_skip, and
+// last_active_segment_id fields of Segmentation.
+void CopySegmentationParameters(const Segmentation& from, Segmentation* to) {
+  memcpy(to->feature_enabled, from.feature_enabled,
+         sizeof(to->feature_enabled));
+  memcpy(to->feature_data, from.feature_data, sizeof(to->feature_data));
+  to->segment_id_pre_skip = from.segment_id_pre_skip;
+  to->last_active_segment_id = from.last_active_segment_id;
+}
+
+}  // namespace
+
+RefCountedBuffer::RefCountedBuffer() = default;
+
+RefCountedBuffer::~RefCountedBuffer() = default;
+
+bool RefCountedBuffer::Realloc(int bitdepth, bool is_monochrome, int width,
+                               int height, int subsampling_x, int subsampling_y,
+                               int left_border, int right_border,
+                               int top_border, int bottom_border) {
+  // YuvBuffer::Realloc() may invoke the get-frame-buffer callback, which must
+  // be thread safe, so we ensure that only one call to Realloc() is in
+  // progress at any given time.
+  std::lock_guard<std::mutex> lock(pool_->mutex_);
+  assert(!buffer_private_data_valid_);
+  if (!yuv_buffer_.Realloc(
+          bitdepth, is_monochrome, width, height, subsampling_x, subsampling_y,
+          left_border, right_border, top_border, bottom_border,
+          pool_->get_frame_buffer_, pool_->callback_private_data_,
+          &buffer_private_data_)) {
+    return false;
+  }
+  buffer_private_data_valid_ = true;
+  return true;
+}
+
+bool RefCountedBuffer::SetFrameDimensions(const ObuFrameHeader& frame_header) {
+  upscaled_width_ = frame_header.upscaled_width;
+  frame_width_ = frame_header.width;
+  frame_height_ = frame_header.height;
+  render_width_ = frame_header.render_width;
+  render_height_ = frame_header.render_height;
+  rows4x4_ = frame_header.rows4x4;
+  columns4x4_ = frame_header.columns4x4;
+  if (frame_header.refresh_frame_flags != 0 &&
+      !IsIntraFrame(frame_header.frame_type)) {
+    const int rows4x4_half = DivideBy2(rows4x4_);
+    const int columns4x4_half = DivideBy2(columns4x4_);
+    if (!reference_info_.Reset(rows4x4_half, columns4x4_half)) {
+      return false;
+    }
+  }
+  return segmentation_map_.Allocate(rows4x4_, columns4x4_);
+}
+
+void RefCountedBuffer::SetGlobalMotions(
+    const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions) {
+  for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+    static_assert(sizeof(global_motion_[ref].params) ==
+                      sizeof(global_motions[ref].params),
+                  "");
+    memcpy(global_motion_[ref].params, global_motions[ref].params,
+           sizeof(global_motion_[ref].params));
+  }
+}
+
+void RefCountedBuffer::SetFrameContext(const SymbolDecoderContext& context) {
+  frame_context_ = context;
+  frame_context_.ResetIntraFrameYModeCdf();
+  frame_context_.ResetCounters();
+}
+
+void RefCountedBuffer::GetSegmentationParameters(
+    Segmentation* segmentation) const {
+  CopySegmentationParameters(/*from=*/segmentation_, /*to=*/segmentation);
+}
+
+void RefCountedBuffer::SetSegmentationParameters(
+    const Segmentation& segmentation) {
+  CopySegmentationParameters(/*from=*/segmentation, /*to=*/&segmentation_);
+}
+
+void RefCountedBuffer::SetBufferPool(BufferPool* pool) { pool_ = pool; }
+
+void RefCountedBuffer::ReturnToBufferPool(RefCountedBuffer* ptr) {
+  ptr->pool_->ReturnUnusedBuffer(ptr);
+}
+
+BufferPool::BufferPool(
+    FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+    GetFrameBufferCallback get_frame_buffer,
+    ReleaseFrameBufferCallback release_frame_buffer,
+    void* callback_private_data) {
+  if (get_frame_buffer != nullptr) {
+    // on_frame_buffer_size_changed may be null.
+    assert(release_frame_buffer != nullptr);
+    on_frame_buffer_size_changed_ = on_frame_buffer_size_changed;
+    get_frame_buffer_ = get_frame_buffer;
+    release_frame_buffer_ = release_frame_buffer;
+    callback_private_data_ = callback_private_data;
+  } else {
+    on_frame_buffer_size_changed_ = OnInternalFrameBufferSizeChanged;
+    get_frame_buffer_ = GetInternalFrameBuffer;
+    release_frame_buffer_ = ReleaseInternalFrameBuffer;
+    callback_private_data_ = &internal_frame_buffers_;
+  }
+}
+
+BufferPool::~BufferPool() {
+  for (const auto* buffer : buffers_) {
+    if (buffer->in_use_) {
+      assert(false && "RefCountedBuffer still in use at destruction time.");
+      LIBGAV1_DLOG(ERROR, "RefCountedBuffer still in use at destruction time.");
+    }
+    delete buffer;
+  }
+}
+
+bool BufferPool::OnFrameBufferSizeChanged(int bitdepth,
+                                          Libgav1ImageFormat image_format,
+                                          int width, int height,
+                                          int left_border, int right_border,
+                                          int top_border, int bottom_border) {
+  if (on_frame_buffer_size_changed_ == nullptr) return true;
+  return on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+                                       image_format, width, height, left_border,
+                                       right_border, top_border, bottom_border,
+                                       /*stride_alignment=*/16) == kStatusOk;
+}
+
+RefCountedBufferPtr BufferPool::GetFreeBuffer() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  for (auto buffer : buffers_) {
+    if (!buffer->in_use_) {
+      buffer->in_use_ = true;
+      buffer->progress_row_ = -1;
+      buffer->frame_state_ = kFrameStateUnknown;
+      buffer->hdr_cll_set_ = false;
+      buffer->hdr_mdcv_set_ = false;
+      buffer->itut_t35_set_ = false;
+      lock.unlock();
+      return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+    }
+  }
+  lock.unlock();
+  auto* const buffer = new (std::nothrow) RefCountedBuffer();
+  if (buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate a new reference counted buffer.");
+    return RefCountedBufferPtr();
+  }
+  buffer->SetBufferPool(this);
+  buffer->in_use_ = true;
+  buffer->progress_row_ = -1;
+  buffer->frame_state_ = kFrameStateUnknown;
+  lock.lock();
+  const bool ok = buffers_.push_back(buffer);
+  lock.unlock();
+  if (!ok) {
+    LIBGAV1_DLOG(
+        ERROR,
+        "Failed to push the new reference counted buffer into the vector.");
+    delete buffer;
+    return RefCountedBufferPtr();
+  }
+  return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+}
+
+void BufferPool::Abort() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  for (auto buffer : buffers_) {
+    if (buffer->in_use_) {
+      buffer->Abort();
+    }
+  }
+}
+
+void BufferPool::ReturnUnusedBuffer(RefCountedBuffer* buffer) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  assert(buffer->in_use_);
+  buffer->in_use_ = false;
+  if (buffer->buffer_private_data_valid_) {
+    release_frame_buffer_(callback_private_data_, buffer->buffer_private_data_);
+    buffer->buffer_private_data_valid_ = false;
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/buffer_pool.h b/src/buffer_pool.h
new file mode 100644
index 0000000..d4e50e0
--- /dev/null
+++ b/src/buffer_pool.h
@@ -0,0 +1,441 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_BUFFER_POOL_H_
+#define LIBGAV1_SRC_BUFFER_POOL_H_
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/symbol_decoder_context.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+class BufferPool;
+
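+// Decoding progress of a frame. The state only advances; see
+// RefCountedBuffer::MarkFrameAsStarted() and
+// RefCountedBuffer::SetFrameState() for the transitions.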
+enum FrameState : uint8_t {
+  kFrameStateUnknown,
+  kFrameStateStarted,
+  kFrameStateParsed,
+  kFrameStateDecoded
+};
+
+// A reference-counted frame buffer. Clients should access it via
+// RefCountedBufferPtr, which manages reference counting transparently.
+// The alignment requirement is due to the SymbolDecoderContext member
+// frame_context_.
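+//
+// A minimal usage sketch (illustrative only; error handling elided):
+//   RefCountedBufferPtr frame = pool->GetFreeBuffer();
+//   if (frame != nullptr) {
+//     frame->Realloc(...);  // Allocate the YUV planes.
+//     // The buffer returns to the pool when the last RefCountedBufferPtr
+//     // referring to it is destroyed.
+//   }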
+class RefCountedBuffer : public MaxAlignedAllocable {
+ public:
+  // Not copyable or movable.
+  RefCountedBuffer(const RefCountedBuffer&) = delete;
+  RefCountedBuffer& operator=(const RefCountedBuffer&) = delete;
+
+  // Allocates the YUV buffer. Returns true on success. Returns false on
+  // failure. This function ensures the thread safety of the
+  // |get_frame_buffer_| call, i.e., only one |get_frame_buffer_| call will
+  // happen at a given time.
+  // TODO(b/142583029): In frame parallel mode, we can require the callbacks to
+  // be thread safe so that we can remove the thread safety of this function and
+  // applications can have fine grained locks.
+  //
+  // * |width| and |height| are the image dimensions in pixels.
+  // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+  //   subsampling of the width and height of the chroma planes, respectively.
+  // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+  //   the sizes (in pixels) of the borders on the left, right, top, and
+  //   bottom sides, respectively.
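+  //
+  //   For example, with 4:2:0 subsampling (|subsampling_x| ==
+  //   |subsampling_y| == 1), a 1920x1080 frame has 960x540 chroma planes.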
+  //
+  // NOTE: Each stride is a multiple of 16. Since the first row in each plane
+  // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+  bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+               int subsampling_x, int subsampling_y, int left_border,
+               int right_border, int top_border, int bottom_border);
+
+  YuvBuffer* buffer() { return &yuv_buffer_; }
+
+  // Returns the buffer private data set by the get frame buffer callback when
+  // it allocated the YUV buffer.
+  void* buffer_private_data() const {
+    assert(buffer_private_data_valid_);
+    return buffer_private_data_;
+  }
+
+  // NOTE: In the current frame, this is the frame_type syntax element in the
+  // frame header. In a reference frame, this implements the RefFrameType array
+  // in the spec.
+  FrameType frame_type() const { return frame_type_; }
+  void set_frame_type(FrameType frame_type) { frame_type_ = frame_type; }
+
+  // The sample position for subsampled streams. This is the
+  // chroma_sample_position syntax element in the sequence header.
+  //
+  // NOTE: The decoder does not use chroma_sample_position, but it needs to be
+  // passed on to the client in DecoderBuffer.
+  ChromaSamplePosition chroma_sample_position() const {
+    return chroma_sample_position_;
+  }
+  void set_chroma_sample_position(ChromaSamplePosition chroma_sample_position) {
+    chroma_sample_position_ = chroma_sample_position;
+  }
+
+  // Whether the frame can be used as a show_existing_frame in the future.
+  bool showable_frame() const { return showable_frame_; }
+  void set_showable_frame(bool value) { showable_frame_ = value; }
+
+  // Sets upscaled_width_, frame_width_, frame_height_, render_width_,
+  // render_height_, rows4x4_ and columns4x4_ from the corresponding fields
+  // in frame_header. Allocates reference_info_.motion_field_reference_frame,
+  // reference_info_.motion_field_mv_, and segmentation_map_. Returns true on
+  // success, false on failure.
+  bool SetFrameDimensions(const ObuFrameHeader& frame_header);
+
+  int32_t upscaled_width() const { return upscaled_width_; }
+  int32_t frame_width() const { return frame_width_; }
+  int32_t frame_height() const { return frame_height_; }
+  // render_width() and render_height() return the render size, which is a
+  // hint to the application about the desired display size.
+  int32_t render_width() const { return render_width_; }
+  int32_t render_height() const { return render_height_; }
+  int32_t rows4x4() const { return rows4x4_; }
+  int32_t columns4x4() const { return columns4x4_; }
+
+  int spatial_id() const { return spatial_id_; }
+  void set_spatial_id(int value) { spatial_id_ = value; }
+  int temporal_id() const { return temporal_id_; }
+  void set_temporal_id(int value) { temporal_id_ = value; }
+
+  ObuMetadataHdrCll hdr_cll() const { return hdr_cll_; }
+  void set_hdr_cll(const ObuMetadataHdrCll& hdr_cll) {
+    hdr_cll_set_ = true;
+    hdr_cll_ = hdr_cll;
+  }
+  bool hdr_cll_set() const { return hdr_cll_set_; }
+
+  ObuMetadataHdrMdcv hdr_mdcv() const { return hdr_mdcv_; }
+  void set_hdr_mdcv(const ObuMetadataHdrMdcv& hdr_mdcv) {
+    hdr_mdcv_set_ = true;
+    hdr_mdcv_ = hdr_mdcv;
+  }
+  bool hdr_mdcv_set() const { return hdr_mdcv_set_; }
+
+  ObuMetadataItutT35 itut_t35() const { return itut_t35_; }
+  bool set_itut_t35(const ObuMetadataItutT35& itut_t35,
+                    const uint8_t* const payload) {
+    itut_t35_ = itut_t35;
+    if (itut_t35.payload_size > 0) {
+      if (!itut_t35_payload_.Resize(itut_t35.payload_size)) return false;
+      memcpy(itut_t35_payload_.get(), payload, itut_t35.payload_size);
+      itut_t35_.payload_bytes = itut_t35_payload_.get();
+    } else {
+      itut_t35_.payload_bytes = nullptr;
+    }
+    itut_t35_set_ = true;
+    return true;
+  }
+  bool itut_t35_set() const { return itut_t35_set_; }
+
+  SegmentationMap* segmentation_map() { return &segmentation_map_; }
+  const SegmentationMap* segmentation_map() const { return &segmentation_map_; }
+
+  // Only the |params| field of each GlobalMotion struct should be used.
+  const std::array<GlobalMotion, kNumReferenceFrameTypes>& GlobalMotions()
+      const {
+    return global_motion_;
+  }
+  // Saves the GlobalMotion array. Only the |params| field of each GlobalMotion
+  // struct is saved.
+  void SetGlobalMotions(
+      const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions);
+
+  // Returns the saved CDF tables.
+  const SymbolDecoderContext& FrameContext() const { return frame_context_; }
+  // Saves the CDF tables. The intra_frame_y_mode_cdf table is reset to the
+  // default. The last entry in each table, representing the symbol count for
+  // that context, is set to 0.
+  void SetFrameContext(const SymbolDecoderContext& context);
+
+  const std::array<int8_t, kNumReferenceFrameTypes>& loop_filter_ref_deltas()
+      const {
+    return loop_filter_ref_deltas_;
+  }
+  const std::array<int8_t, kLoopFilterMaxModeDeltas>& loop_filter_mode_deltas()
+      const {
+    return loop_filter_mode_deltas_;
+  }
+  // Saves the ref_deltas and mode_deltas arrays in loop_filter.
+  void SetLoopFilterDeltas(const LoopFilter& loop_filter) {
+    loop_filter_ref_deltas_ = loop_filter.ref_deltas;
+    loop_filter_mode_deltas_ = loop_filter.mode_deltas;
+  }
+
+  // Copies the saved values of the following fields to the Segmentation
+  // struct: feature_enabled, feature_data, segment_id_pre_skip, and
+  // last_active_segment_id. The other fields are left unchanged.
+  void GetSegmentationParameters(Segmentation* segmentation) const;
+  // Saves the feature_enabled, feature_data, segment_id_pre_skip, and
+  // last_active_segment_id fields of the Segmentation struct.
+  void SetSegmentationParameters(const Segmentation& segmentation);
+
+  const FilmGrainParams& film_grain_params() const {
+    return film_grain_params_;
+  }
+  void set_film_grain_params(const FilmGrainParams& params) {
+    film_grain_params_ = params;
+  }
+
+  const ReferenceInfo* reference_info() const { return &reference_info_; }
+  ReferenceInfo* reference_info() { return &reference_info_; }
+
+  // This will wake up the WaitUntil*() functions and make them return false.
+  void Abort() {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      abort_ = true;
+    }
+    parsed_condvar_.notify_all();
+    decoded_condvar_.notify_all();
+    progress_row_condvar_.notify_all();
+  }
+
+  void SetFrameState(FrameState frame_state) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      frame_state_ = frame_state;
+    }
+    if (frame_state == kFrameStateParsed) {
+      parsed_condvar_.notify_all();
+    } else if (frame_state == kFrameStateDecoded) {
+      decoded_condvar_.notify_all();
+      progress_row_condvar_.notify_all();
+    }
+  }
+
+  // Sets the progress of this frame to |progress_row| and notifies any threads
+  // that may be waiting on rows <= |progress_row|.
+  void SetProgress(int progress_row) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (progress_row_ >= progress_row) return;
+      progress_row_ = progress_row;
+    }
+    progress_row_condvar_.notify_all();
+  }
+
+  void MarkFrameAsStarted() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (frame_state_ != kFrameStateUnknown) return;
+    frame_state_ = kFrameStateStarted;
+  }
+
+  // All the WaitUntil* functions return true if the desired wait state was
+  // reached successfully. If the return value is false, the caller must
+  // assume that the wait was not successful and stop whatever it is doing as
+  // early as possible.
+
+  // Waits until the frame has been parsed.
+  bool WaitUntilParsed() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (frame_state_ < kFrameStateParsed && !abort_) {
+      parsed_condvar_.wait(lock);
+    }
+    return !abort_;
+  }
+
+  // Waits until the |progress_row| has been decoded (as indicated either by
+  // |progress_row_| or |frame_state_|). |progress_row_cache| must not be
+  // nullptr and will be populated with the value of |progress_row_| after the
+  // wait.
+  //
+  // Typical usage of |progress_row_cache| is as follows:
+  //  * Initialize |*progress_row_cache| to INT_MIN.
+  //  * Call WaitUntil only if |*progress_row_cache| < |progress_row|.
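+  //
+  // A minimal sketch of that pattern (illustrative caller code only;
+  // |needed_row| is a hypothetical name for the row being depended on):
+  //   int progress_row_cache = INT_MIN;
+  //   if (progress_row_cache < needed_row &&
+  //       !buffer->WaitUntil(needed_row, &progress_row_cache)) {
+  //     return;  // Aborted; stop whatever work is in progress.
+  //   }
+  //   // Rows up to |needed_row| (or the whole frame) are now decoded.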
+  bool WaitUntil(int progress_row, int* progress_row_cache) {
+    // If |progress_row| is negative, it means that the wait is on the top
+    // border to be available. The top border will be available when row 0 has
+    // been decoded. So we can simply wait on row 0 instead.
+    progress_row = std::max(progress_row, 0);
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (progress_row_ < progress_row && frame_state_ != kFrameStateDecoded &&
+           !abort_) {
+      progress_row_condvar_.wait(lock);
+    }
+    // Once |frame_state_| reaches kFrameStateDecoded, |progress_row_| may no
+    // longer be updated. So we set |*progress_row_cache| to INT_MAX in that
+    // case.
+    *progress_row_cache =
+        (frame_state_ != kFrameStateDecoded) ? progress_row_ : INT_MAX;
+    return !abort_;
+  }
+
+  // Waits until the entire frame has been decoded.
+  bool WaitUntilDecoded() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (frame_state_ != kFrameStateDecoded && !abort_) {
+      decoded_condvar_.wait(lock);
+    }
+    return !abort_;
+  }
+
+ private:
+  friend class BufferPool;
+
+  // Methods for BufferPool:
+  RefCountedBuffer();
+  ~RefCountedBuffer();
+  void SetBufferPool(BufferPool* pool);
+  static void ReturnToBufferPool(RefCountedBuffer* ptr);
+
+  BufferPool* pool_ = nullptr;
+  bool buffer_private_data_valid_ = false;
+  void* buffer_private_data_ = nullptr;
+  YuvBuffer yuv_buffer_;
+  bool in_use_ = false;  // Only used by BufferPool.
+
+  std::mutex mutex_;
+  FrameState frame_state_ LIBGAV1_GUARDED_BY(mutex_) = kFrameStateUnknown;
+  int progress_row_ LIBGAV1_GUARDED_BY(mutex_) = -1;
+  // Signaled when progress_row_ is updated or when frame_state_ is set to
+  // kFrameStateDecoded.
+  std::condition_variable progress_row_condvar_;
+  // Signaled when the frame state is set to kFrameStateParsed.
+  std::condition_variable parsed_condvar_;
+  // Signaled when the frame state is set to kFrameStateDecoded.
+  std::condition_variable decoded_condvar_;
+  bool abort_ LIBGAV1_GUARDED_BY(mutex_) = false;
+
+  FrameType frame_type_ = kFrameKey;
+  ChromaSamplePosition chroma_sample_position_ = kChromaSamplePositionUnknown;
+  bool showable_frame_ = false;
+
+  int32_t upscaled_width_ = 0;
+  int32_t frame_width_ = 0;
+  int32_t frame_height_ = 0;
+  int32_t render_width_ = 0;
+  int32_t render_height_ = 0;
+  int32_t columns4x4_ = 0;
+  int32_t rows4x4_ = 0;
+  int spatial_id_ = 0;
+  int temporal_id_ = 0;
+
+  ObuMetadataHdrCll hdr_cll_ = {};
+  bool hdr_cll_set_ = false;  // Set to true when set_hdr_cll() is called.
+  ObuMetadataHdrMdcv hdr_mdcv_ = {};
+  bool hdr_mdcv_set_ = false;  // Set to true when set_hdr_mdcv() is called.
+  ObuMetadataItutT35 itut_t35_ = {};
+  DynamicBuffer<uint8_t> itut_t35_payload_;
+  bool itut_t35_set_ = false;  // Set to true when set_itut_t35() is called.
+
+  // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array.
+  SegmentationMap segmentation_map_;
+
+  // Only the |params| field of each GlobalMotion struct is used.
+  // global_motion_[0] (for kReferenceFrameIntra) is not used.
+  std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion_ = {};
+  SymbolDecoderContext frame_context_;
+  std::array<int8_t, kNumReferenceFrameTypes> loop_filter_ref_deltas_;
+  std::array<int8_t, kLoopFilterMaxModeDeltas> loop_filter_mode_deltas_;
+  // Only the feature_enabled, feature_data, segment_id_pre_skip, and
+  // last_active_segment_id fields of the Segmentation struct are used.
+  //
+  // Note: The spec only requires that we save feature_enabled and
+  // feature_data. Since segment_id_pre_skip and last_active_segment_id depend
+  // on feature_enabled only, we also save their values as an optimization.
+  Segmentation segmentation_ = {};
+  FilmGrainParams film_grain_params_ = {};
+  ReferenceInfo reference_info_;
+};
+
+// RefCountedBufferPtr contains a reference to a RefCountedBuffer.
+//
+// Note: For simplicity, RefCountedBufferPtr is implemented as a
+// std::shared_ptr<RefCountedBuffer>. This requires a heap allocation of the
+// control block for std::shared_ptr. To avoid that heap allocation, we can
+// add a |ref_count_| field to RefCountedBuffer and implement a custom
+// RefCountedBufferPtr class.
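+//
+// BufferPool::GetFreeBuffer() constructs the pointer with a custom deleter so
+// that the buffer is returned to the pool, rather than destroyed, when the
+// last reference goes away:
+//   RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool)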
+using RefCountedBufferPtr = std::shared_ptr<RefCountedBuffer>;
+
+// BufferPool maintains a pool of RefCountedBuffers.
+class BufferPool {
+ public:
+  BufferPool(FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+             GetFrameBufferCallback get_frame_buffer,
+             ReleaseFrameBufferCallback release_frame_buffer,
+             void* callback_private_data);
+
+  // Not copyable or movable.
+  BufferPool(const BufferPool&) = delete;
+  BufferPool& operator=(const BufferPool&) = delete;
+
+  ~BufferPool();
+
+  LIBGAV1_MUST_USE_RESULT bool OnFrameBufferSizeChanged(
+      int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+      int left_border, int right_border, int top_border, int bottom_border);
+
+  // Finds a free buffer in the buffer pool and returns a reference to it,
+  // allocating a new buffer if none is free. Returns a null pointer only if
+  // the allocation fails. This function is thread safe.
+  RefCountedBufferPtr GetFreeBuffer();
+
+  // Aborts all the buffers that are in use.
+  void Abort();
+
+ private:
+  friend class RefCountedBuffer;
+
+  // Returns an unused buffer to the buffer pool. Called by RefCountedBuffer
+  // only. This function is thread safe.
+  void ReturnUnusedBuffer(RefCountedBuffer* buffer);
+
+  // Used to make the following functions thread safe: GetFreeBuffer(),
+  // ReturnUnusedBuffer(), RefCountedBuffer::Realloc().
+  std::mutex mutex_;
+
+  // Storing RefCountedBuffer objects in a Vector is complicated by their
+  // copy/move semantics, so the simplest way around that is to store a list
+  // of pointers to them in the vector.
+  Vector<RefCountedBuffer*> buffers_ LIBGAV1_GUARDED_BY(mutex_);
+  InternalFrameBufferList internal_frame_buffers_;
+
+  // Frame buffer callbacks.
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed_;
+  GetFrameBufferCallback get_frame_buffer_;
+  ReleaseFrameBufferCallback release_frame_buffer_;
+  // Private data associated with the frame buffer callbacks.
+  void* callback_private_data_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_BUFFER_POOL_H_
diff --git a/src/buffer_pool_test.cc b/src/buffer_pool_test.cc
new file mode 100644
index 0000000..abe681e
--- /dev/null
+++ b/src/buffer_pool_test.cc
@@ -0,0 +1,305 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/buffer_pool.h"
+
+#include <climits>
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <tuple>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "src/frame_buffer_utils.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(BufferPoolTest, RefCountedBufferPtr) {
+  InternalFrameBufferList buffer_list;
+  BufferPool buffer_pool(OnInternalFrameBufferSizeChanged,
+                         GetInternalFrameBuffer, ReleaseInternalFrameBuffer,
+                         &buffer_list);
+  RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+  EXPECT_NE(buffer_ptr, nullptr);
+  EXPECT_EQ(buffer_ptr.use_count(), 1);
+
+  RefCountedBufferPtr buffer_ptr2 = buffer_ptr;
+  RefCountedBufferPtr buffer_ptr3 = buffer_ptr;
+  EXPECT_EQ(buffer_ptr.use_count(), 3);
+  EXPECT_EQ(buffer_ptr2.use_count(), 3);
+  EXPECT_EQ(buffer_ptr3.use_count(), 3);
+
+  buffer_ptr2 = nullptr;
+  EXPECT_EQ(buffer_ptr.use_count(), 2);
+  EXPECT_EQ(buffer_ptr2.use_count(), 0);
+  EXPECT_EQ(buffer_ptr3.use_count(), 2);
+
+  RefCountedBufferPtr buffer_ptr4 = std::move(buffer_ptr);
+  EXPECT_EQ(buffer_ptr.use_count(), 0);
+  EXPECT_EQ(buffer_ptr2.use_count(), 0);
+  EXPECT_EQ(buffer_ptr3.use_count(), 2);
+  EXPECT_EQ(buffer_ptr4.use_count(), 2);
+}
+
+TEST(RefCountedBufferTest, SetFrameDimensions) {
+  InternalFrameBufferList buffer_list;
+  BufferPool buffer_pool(OnInternalFrameBufferSizeChanged,
+                         GetInternalFrameBuffer, ReleaseInternalFrameBuffer,
+                         &buffer_list);
+  RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+  EXPECT_NE(buffer_ptr, nullptr);
+
+  // Test the undocumented default values of rows4x4() and columns4x4(). (Not
+  // sure if this is a good idea.)
+  EXPECT_EQ(buffer_ptr->rows4x4(), 0);
+  EXPECT_EQ(buffer_ptr->columns4x4(), 0);
+
+  // Test the side effects of SetFrameDimensions().
+  ObuFrameHeader frame_header = {};
+  frame_header.rows4x4 = 20;
+  frame_header.columns4x4 = 30;
+  EXPECT_TRUE(buffer_ptr->SetFrameDimensions(frame_header));
+  EXPECT_EQ(buffer_ptr->rows4x4(), 20);
+  EXPECT_EQ(buffer_ptr->columns4x4(), 30);
+}
+
+TEST(RefCountedBufferTest, WaitUntil) {
+  InternalFrameBufferList buffer_list;
+  BufferPool buffer_pool(OnInternalFrameBufferSizeChanged,
+                         GetInternalFrameBuffer, ReleaseInternalFrameBuffer,
+                         &buffer_list);
+  RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+  EXPECT_NE(buffer_ptr, nullptr);
+
+  int progress_row_cache;
+  buffer_ptr->SetProgress(10);
+  EXPECT_TRUE(buffer_ptr->WaitUntil(5, &progress_row_cache));
+  EXPECT_EQ(progress_row_cache, 10);
+
+  buffer_ptr->SetFrameState(kFrameStateDecoded);
+  EXPECT_TRUE(buffer_ptr->WaitUntil(500, &progress_row_cache));
+  EXPECT_EQ(progress_row_cache, INT_MAX);
+
+  buffer_ptr->Abort();
+  EXPECT_FALSE(buffer_ptr->WaitUntil(50, &progress_row_cache));
+}
+
+constexpr struct Params {
+  int width;
+  int height;
+  int8_t subsampling_x;
+  int8_t subsampling_y;
+  int border;
+} kParams[] = {
+    {1920, 1080, 1, 1, 96},   //
+    {1920, 1080, 1, 1, 64},   //
+    {1920, 1080, 1, 1, 32},   //
+    {1920, 1080, 1, 1, 160},  //
+    {1920, 1080, 1, 0, 160},  //
+    {1920, 1080, 0, 0, 160},  //
+};
+
+std::ostream& operator<<(std::ostream& os, const Params& param) {
+  return os << param.width << "x" << param.height
+            << ", subsampling(x/y): " << static_cast<int>(param.subsampling_x)
+            << "/" << static_cast<int>(param.subsampling_y)
+            << ", border: " << param.border;
+}
+
+class RefCountedBufferReallocTest
+    : public testing::TestWithParam<std::tuple<bool, Params>> {
+ protected:
+  const bool use_external_callbacks_ = std::get<0>(GetParam());
+  const Params& param_ = std::get<1>(GetParam());
+};
+
+TEST_P(RefCountedBufferReallocTest, 8Bit) {
+  InternalFrameBufferList buffer_list;
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+  GetFrameBufferCallback get_frame_buffer = nullptr;
+  ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+  void* callback_private_data = nullptr;
+  if (use_external_callbacks_) {
+    on_frame_buffer_size_changed = OnInternalFrameBufferSizeChanged;
+    get_frame_buffer = GetInternalFrameBuffer;
+    release_frame_buffer = ReleaseInternalFrameBuffer;
+    callback_private_data = &buffer_list;
+  }
+
+  BufferPool buffer_pool(on_frame_buffer_size_changed, get_frame_buffer,
+                         release_frame_buffer, callback_private_data);
+
+  RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+  EXPECT_NE(buffer_ptr, nullptr);
+
+  const Libgav1ImageFormat image_format = ComposeImageFormat(
+      /*is_monochrome=*/false, param_.subsampling_x, param_.subsampling_y);
+  EXPECT_TRUE(buffer_pool.OnFrameBufferSizeChanged(
+      /*bitdepth=*/8, image_format, param_.width, param_.height, param_.border,
+      param_.border, param_.border, param_.border));
+
+  EXPECT_TRUE(buffer_ptr->Realloc(
+      /*bitdepth=*/8, /*is_monochrome=*/false, param_.width, param_.height,
+      param_.subsampling_x, param_.subsampling_y, param_.border, param_.border,
+      param_.border, param_.border));
+
+  // The first row of each plane is aligned at 16-byte boundaries.
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneY)) % 16, 0);
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneU)) % 16, 0);
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneV)) % 16, 0);
+
+  // Subsequent rows are aligned at 16-byte boundaries.
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneY) % 16, 0);
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneU) % 16, 0);
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneV) % 16, 0);
+
+  // Check the borders.
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneU),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneU),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneU),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneU),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneV),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneV),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneV),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneV),
+            param_.border >> param_.subsampling_y);
+
+  // Write to the upper-left corner of the border.
+  uint8_t* y_buffer = buffer_ptr->buffer()->data(kPlaneY);
+  int y_stride = buffer_ptr->buffer()->stride(kPlaneY);
+  y_buffer[-buffer_ptr->buffer()->left_border(kPlaneY) -
+           buffer_ptr->buffer()->top_border(kPlaneY) * y_stride] = 0;
+  // Write to the lower-right corner of the border.
+  uint8_t* v_buffer = buffer_ptr->buffer()->data(kPlaneV);
+  int v_stride = buffer_ptr->buffer()->stride(kPlaneV);
+  v_buffer[(buffer_ptr->buffer()->height(kPlaneV) +
+            buffer_ptr->buffer()->bottom_border(kPlaneV) - 1) *
+               v_stride +
+           buffer_ptr->buffer()->width(kPlaneV) +
+           buffer_ptr->buffer()->right_border(kPlaneV) - 1] = 0;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+TEST_P(RefCountedBufferReallocTest, 10Bit) {
+  InternalFrameBufferList buffer_list;
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+  GetFrameBufferCallback get_frame_buffer = nullptr;
+  ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+  void* callback_private_data = nullptr;
+  if (use_external_callbacks_) {
+    on_frame_buffer_size_changed = OnInternalFrameBufferSizeChanged;
+    get_frame_buffer = GetInternalFrameBuffer;
+    release_frame_buffer = ReleaseInternalFrameBuffer;
+    callback_private_data = &buffer_list;
+  }
+
+  BufferPool buffer_pool(on_frame_buffer_size_changed, get_frame_buffer,
+                         release_frame_buffer, callback_private_data);
+
+  RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+  EXPECT_NE(buffer_ptr, nullptr);
+
+  const Libgav1ImageFormat image_format = ComposeImageFormat(
+      /*is_monochrome=*/false, param_.subsampling_x, param_.subsampling_y);
+  EXPECT_TRUE(buffer_pool.OnFrameBufferSizeChanged(
+      /*bitdepth=*/10, image_format, param_.width, param_.height, param_.border,
+      param_.border, param_.border, param_.border));
+
+  EXPECT_TRUE(buffer_ptr->Realloc(
+      /*bitdepth=*/10, /*is_monochrome=*/false, param_.width, param_.height,
+      param_.subsampling_x, param_.subsampling_y, param_.border, param_.border,
+      param_.border, param_.border));
+
+  // The first row of each plane is aligned at 16-byte boundaries.
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneY)) % 16, 0);
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneU)) % 16, 0);
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneV)) % 16, 0);
+
+  // Subsequent rows are aligned at 16-byte boundaries.
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneY) % 16, 0);
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneU) % 16, 0);
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneV) % 16, 0);
+
+  // Check the borders.
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneU),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneU),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneU),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneU),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneV),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneV),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneV),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneV),
+            param_.border >> param_.subsampling_y);
+
+  // Write to the upper-left corner of the border.
+  auto* y_buffer =
+      reinterpret_cast<uint16_t*>(buffer_ptr->buffer()->data(kPlaneY));
+  int y_stride = buffer_ptr->buffer()->stride(kPlaneY) / sizeof(uint16_t);
+  y_buffer[-buffer_ptr->buffer()->left_border(kPlaneY) -
+           buffer_ptr->buffer()->top_border(kPlaneY) * y_stride] = 0;
+  // Write to the lower-right corner of the border.
+  auto* v_buffer =
+      reinterpret_cast<uint16_t*>(buffer_ptr->buffer()->data(kPlaneV));
+  int v_stride = buffer_ptr->buffer()->stride(kPlaneV) / sizeof(uint16_t);
+  v_buffer[(buffer_ptr->buffer()->height(kPlaneV) +
+            buffer_ptr->buffer()->bottom_border(kPlaneV) - 1) *
+               v_stride +
+           buffer_ptr->buffer()->width(kPlaneV) +
+           buffer_ptr->buffer()->right_border(kPlaneV) - 1] = 0;
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+INSTANTIATE_TEST_SUITE_P(
+    Default, RefCountedBufferReallocTest,
+    testing::Combine(testing::Bool(),  // use_external_callbacks
+                     testing::ValuesIn(kParams)));
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/c_decoder_test.c b/src/c_decoder_test.c
new file mode 100644
index 0000000..7c6f8c8
--- /dev/null
+++ b/src/c_decoder_test.c
@@ -0,0 +1,479 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __cplusplus
+#error Do not compile this file with a C++ compiler
+#endif
+
+// clang-format off
+#include "src/gav1/decoder.h"
+
+// Import the test frame #defines.
+#include "src/decoder_test_data.h"
+// clang-format on
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define ASSERT_EQ(a, b)                                                      \
+  do {                                                                       \
+    if ((a) != (b)) {                                                        \
+      fprintf(stderr, "Assertion failure: (%s) == (%s), at %s:%d\n", #a, #b, \
+              __FILE__, __LINE__);                                           \
+      fprintf(stderr, "C DecoderTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+#define ASSERT_NE(a, b)                                                      \
+  do {                                                                       \
+    if ((a) == (b)) {                                                        \
+      fprintf(stderr, "Assertion failure: (%s) != (%s), at %s:%d\n", #a, #b, \
+              __FILE__, __LINE__);                                           \
+      fprintf(stderr, "C DecoderTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+#define ASSERT_TRUE(a)                                                   \
+  do {                                                                   \
+    if (!(a)) {                                                          \
+      fprintf(stderr, "Assertion failure: %s, at %s:%d\n", #a, __FILE__, \
+              __LINE__);                                                 \
+      fprintf(stderr, "C DecoderTest failed\n");                         \
+      exit(1);                                                           \
+    }                                                                    \
+  } while (0)
+
+#define ASSERT_FALSE(a)                                                     \
+  do {                                                                      \
+    if (a) {                                                                \
+      fprintf(stderr, "Assertion failure: !(%s), at %s:%d\n", #a, __FILE__, \
+              __LINE__);                                                    \
+      fprintf(stderr, "C DecoderTest failed\n");                            \
+      exit(1);                                                              \
+    }                                                                       \
+  } while (0)
+
+static const uint8_t kFrame1[] = {OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER,
+                                  OBU_FRAME_1};
+
+static const uint8_t kFrame2[] = {OBU_TEMPORAL_DELIMITER, OBU_FRAME_2};
+
+static const uint8_t kFrame1WithHdrCllAndHdrMdcv[] = {
+    OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER, OBU_METADATA_HDR_CLL,
+    OBU_METADATA_HDR_MDCV, OBU_FRAME_1};
+
+static const uint8_t kFrame2WithItutT35[] = {
+    OBU_TEMPORAL_DELIMITER, OBU_METADATA_ITUT_T35, OBU_FRAME_2};
+
+typedef struct DecoderTest {
+  Libgav1Decoder* decoder;
+  int frames_in_use;
+  void* buffer_private_data;
+  void* released_input_buffer;
+} DecoderTest;
+
+static void DecoderTestInit(DecoderTest* test) {
+  test->decoder = NULL;
+  test->frames_in_use = 0;
+  test->buffer_private_data = NULL;
+  test->released_input_buffer = NULL;
+}
+
+static void DecoderTestIncrementFramesInUse(DecoderTest* test) {
+  ++test->frames_in_use;
+}
+
+static void DecoderTestDecrementFramesInUse(DecoderTest* test) {
+  --test->frames_in_use;
+}
+
+static void DecoderTestSetReleasedInputBuffer(DecoderTest* test,
+                                              void* released_input_buffer) {
+  test->released_input_buffer = released_input_buffer;
+}
+
+static void DecoderTestSetBufferPrivateData(DecoderTest* test,
+                                            void* buffer_private_data) {
+  test->buffer_private_data = buffer_private_data;
+}
+
+typedef struct FrameBufferPrivate {
+  uint8_t* data[3];
+} FrameBufferPrivate;
+
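+/* An example get-frame-buffer callback: compute the buffer layout with
+ * Libgav1ComputeFrameBufferInfo(), allocate the Y/U/V planes, then hand them
+ * to libgav1 with Libgav1SetFrameBuffer(). */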
+static Libgav1StatusCode GetFrameBuffer(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+  Libgav1FrameBufferInfo info;
+  Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment, &info);
+  if (status != kLibgav1StatusOk) return status;
+
+  FrameBufferPrivate* buffer_private =
+      (FrameBufferPrivate*)malloc(sizeof(FrameBufferPrivate));
+  if (buffer_private == NULL) return kLibgav1StatusOutOfMemory;
+
+  for (int i = 0; i < 3; ++i) {
+    const size_t size = (i == 0) ? info.y_buffer_size : info.uv_buffer_size;
+    buffer_private->data[i] = (uint8_t*)malloc(sizeof(uint8_t) * size);
+    if (buffer_private->data[i] == NULL) {
+      for (int j = 0; j < i; j++) {
+        free(buffer_private->data[j]);
+      }
+      free(buffer_private);
+      return kLibgav1StatusOutOfMemory;
+    }
+  }
+
+  uint8_t* const y_buffer = buffer_private->data[0];
+  uint8_t* const u_buffer =
+      (info.uv_buffer_size != 0) ? buffer_private->data[1] : NULL;
+  uint8_t* const v_buffer =
+      (info.uv_buffer_size != 0) ? buffer_private->data[2] : NULL;
+
+  status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer,
+                                 buffer_private, frame_buffer);
+  if (status != kLibgav1StatusOk) return status;
+
+  DecoderTest* const decoder_test = (DecoderTest*)callback_private_data;
+  DecoderTestIncrementFramesInUse(decoder_test);
+  DecoderTestSetBufferPrivateData(decoder_test, frame_buffer->private_data);
+  return kLibgav1StatusOk;
+}
+
+static void ReleaseFrameBuffer(void* callback_private_data,
+                               void* buffer_private_data) {
+  FrameBufferPrivate* buffer_private = (FrameBufferPrivate*)buffer_private_data;
+  for (int i = 0; i < 3; ++i) {
+    free(buffer_private->data[i]);
+  }
+  free(buffer_private);
+  DecoderTest* const decoder_test = (DecoderTest*)callback_private_data;
+  DecoderTestDecrementFramesInUse(decoder_test);
+}
+
+static void ReleaseInputBuffer(void* private_data, void* input_buffer) {
+  DecoderTestSetReleasedInputBuffer((DecoderTest*)private_data, input_buffer);
+}
+
+static void DecoderTestSetUp(DecoderTest* test) {
+  Libgav1DecoderSettings settings;
+  Libgav1DecoderSettingsInitDefault(&settings);
+  settings.frame_parallel = 0;  // false
+  settings.get_frame_buffer = GetFrameBuffer;
+  settings.release_frame_buffer = ReleaseFrameBuffer;
+  settings.callback_private_data = test;
+  settings.release_input_buffer = ReleaseInputBuffer;
+  ASSERT_EQ(test->decoder, NULL);
+  ASSERT_EQ(Libgav1DecoderCreate(&settings, &test->decoder), kLibgav1StatusOk);
+  ASSERT_NE(test->decoder, NULL);
+}
+
+static void DecoderTestAPIFlowForNonFrameParallelMode(void) {
+  DecoderTest test;
+  DecoderTestInit(&test);
+  DecoderTestSetUp(&test);
+
+  Libgav1StatusCode status;
+  const Libgav1DecoderBuffer* buffer;
+
+  // Enqueue frame1 for decoding.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0,
+                                      (uint8_t*)&kFrame1);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  // In non-frame-parallel mode, decoding happens only in the DequeueFrame call.
+  // So there should be no frames in use yet.
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  // Dequeue the output of frame1.
+  status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+  ASSERT_NE(buffer, NULL);
+  ASSERT_EQ(test.released_input_buffer, &kFrame1);
+
+  // libgav1 has decoded frame1 and is holding a reference to it.
+  ASSERT_EQ(test.frames_in_use, 1);
+  ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+  // Enqueue frame2 for decoding.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0,
+                                      (uint8_t*)&kFrame2);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  ASSERT_EQ(test.frames_in_use, 1);
+
+  // Dequeue the output of frame2.
+  status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+  ASSERT_NE(buffer, NULL);
+  ASSERT_EQ(test.released_input_buffer, &kFrame2);
+
+  ASSERT_EQ(test.frames_in_use, 2);
+  ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+  // Signal end of stream (method 1). This should ensure that all the references
+  // are released.
+  status = Libgav1DecoderSignalEOS(test.decoder);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  // libgav1 should have released all the reference frames now.
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  // Now, the decoder is ready to accept a new coded video sequence.
+
+  // Enqueue frame1 for decoding.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0,
+                                      (uint8_t*)&kFrame1);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  // Dequeue the output of frame1.
+  status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+  ASSERT_NE(buffer, NULL);
+  ASSERT_EQ(test.released_input_buffer, &kFrame1);
+
+  ASSERT_EQ(test.frames_in_use, 1);
+  ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+  // Enqueue frame2 for decoding.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0,
+                                      (uint8_t*)&kFrame2);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  ASSERT_EQ(test.frames_in_use, 1);
+
+  // Dequeue the output of frame2.
+  status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+  ASSERT_NE(buffer, NULL);
+  ASSERT_EQ(test.released_input_buffer, &kFrame2);
+
+  ASSERT_EQ(test.frames_in_use, 2);
+  ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+  // Signal end of stream (method 2). This should ensure that all the references
+  // are released.
+  Libgav1DecoderDestroy(test.decoder);
+  test.decoder = NULL;
+
+  // libgav1 should have released all the frames now.
+  ASSERT_EQ(test.frames_in_use, 0);
+}
+
+static void
+DecoderTestNonFrameParallelModeEnqueueMultipleFramesWithoutDequeuing(void) {
+  DecoderTest test;
+  DecoderTestInit(&test);
+  DecoderTestSetUp(&test);
+
+  Libgav1StatusCode status;
+  const Libgav1DecoderBuffer* buffer;
+
+  // Enqueue frame1 for decoding.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0,
+                                      (uint8_t*)&kFrame1);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  // Until the output of frame1 is dequeued, no other frames can be enqueued.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0,
+                                      (uint8_t*)&kFrame2);
+  ASSERT_EQ(status, kLibgav1StatusTryAgain);
+
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  // Dequeue the output of frame1.
+  status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+  ASSERT_NE(buffer, NULL);
+  ASSERT_EQ(test.released_input_buffer, &kFrame1);
+
+  ASSERT_EQ(test.frames_in_use, 1);
+
+  // Delete the decoder instance.
+  Libgav1DecoderDestroy(test.decoder);
+  test.decoder = NULL;
+
+  ASSERT_EQ(test.frames_in_use, 0);
+}
+
+static void DecoderTestNonFrameParallelModeEOSBeforeDequeuingLastFrame(void) {
+  DecoderTest test;
+  DecoderTestInit(&test);
+  DecoderTestSetUp(&test);
+
+  Libgav1StatusCode status;
+  const Libgav1DecoderBuffer* buffer;
+
+  // Enqueue frame1 for decoding.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0,
+                                      (uint8_t*)&kFrame1);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  // Dequeue the output of frame1.
+  status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+  ASSERT_NE(buffer, NULL);
+  ASSERT_EQ(test.released_input_buffer, &kFrame1);
+
+  // Enqueue frame2 for decoding.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0,
+                                      (uint8_t*)&kFrame2);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  ASSERT_EQ(test.frames_in_use, 1);
+
+  // Signal end of stream before dequeuing the output of frame2.
+  status = Libgav1DecoderSignalEOS(test.decoder);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  // In this case, the output of the last frame that was enqueued is lost (which
+  // is intentional since end of stream was signaled without dequeuing it).
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  Libgav1DecoderDestroy(test.decoder);
+  test.decoder = NULL;
+}
+
+static void DecoderTestNonFrameParallelModeInvalidFrameAfterEOS(void) {
+  DecoderTest test;
+  DecoderTestInit(&test);
+  DecoderTestSetUp(&test);
+
+  Libgav1StatusCode status;
+  const Libgav1DecoderBuffer* buffer = NULL;
+
+  // Enqueue frame1 for decoding.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0,
+                                      (uint8_t*)&kFrame1);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  // Dequeue the output of frame1.
+  status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+  ASSERT_NE(buffer, NULL);
+  ASSERT_EQ(test.released_input_buffer, &kFrame1);
+
+  ASSERT_EQ(test.frames_in_use, 1);
+
+  // Signal end of stream.
+  status = Libgav1DecoderSignalEOS(test.decoder);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  // libgav1 should have released all the reference frames now.
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  // Now, the decoder is ready to accept a new coded video sequence. But we
+  // try to enqueue a frame that does not have a sequence header (which is not
+  // allowed).
+
+  // Enqueue frame2 for decoding.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0,
+                                      (uint8_t*)&kFrame2);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  // Dequeue the output of frame2 (this will fail since no sequence header has
+  // been seen since the last EOS signal).
+  status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+  ASSERT_EQ(status, kLibgav1StatusBitstreamError);
+  ASSERT_EQ(test.released_input_buffer, &kFrame2);
+
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  Libgav1DecoderDestroy(test.decoder);
+  test.decoder = NULL;
+}
+
+static void DecoderTestMetadataObu(void) {
+  DecoderTest test;
+  DecoderTestInit(&test);
+  DecoderTestSetUp(&test);
+
+  Libgav1StatusCode status;
+  const Libgav1DecoderBuffer* buffer;
+
+  // Enqueue frame1 for decoding.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1WithHdrCllAndHdrMdcv,
+                                      sizeof(kFrame1WithHdrCllAndHdrMdcv), 0,
+                                      (uint8_t*)&kFrame1WithHdrCllAndHdrMdcv);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  // Dequeue the output of frame1.
+  status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+  ASSERT_NE(buffer, NULL);
+  ASSERT_EQ(buffer->has_hdr_cll, 1);
+  ASSERT_EQ(buffer->has_hdr_mdcv, 1);
+  ASSERT_EQ(buffer->has_itut_t35, 0);
+  ASSERT_EQ(test.released_input_buffer, &kFrame1WithHdrCllAndHdrMdcv);
+
+  ASSERT_EQ(test.frames_in_use, 1);
+  ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+  // Enqueue frame2 for decoding.
+  status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2WithItutT35,
+                                      sizeof(kFrame2WithItutT35), 0,
+                                      (uint8_t*)&kFrame2WithItutT35);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+
+  ASSERT_EQ(test.frames_in_use, 1);
+
+  // Dequeue the output of frame2.
+  status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+  ASSERT_NE(buffer, NULL);
+  ASSERT_EQ(buffer->has_hdr_cll, 0);
+  ASSERT_EQ(buffer->has_hdr_mdcv, 0);
+  ASSERT_EQ(buffer->has_itut_t35, 1);
+  ASSERT_NE(buffer->itut_t35.payload_bytes, NULL);
+  ASSERT_NE(buffer->itut_t35.payload_size, 0);
+  ASSERT_EQ(test.released_input_buffer, &kFrame2WithItutT35);
+
+  ASSERT_EQ(test.frames_in_use, 2);
+  ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+  status = Libgav1DecoderSignalEOS(test.decoder);
+  ASSERT_EQ(status, kLibgav1StatusOk);
+  ASSERT_EQ(test.frames_in_use, 0);
+
+  Libgav1DecoderDestroy(test.decoder);
+}
+
+int main(void) {
+  fprintf(stderr, "C DecoderTest started\n");
+  DecoderTestAPIFlowForNonFrameParallelMode();
+  DecoderTestNonFrameParallelModeEnqueueMultipleFramesWithoutDequeuing();
+  DecoderTestNonFrameParallelModeEOSBeforeDequeuingLastFrame();
+  DecoderTestNonFrameParallelModeInvalidFrameAfterEOS();
+  DecoderTestMetadataObu();
+  fprintf(stderr, "C DecoderTest passed\n");
+  return 0;
+}
diff --git a/src/c_version_test.c b/src/c_version_test.c
new file mode 100644
index 0000000..e198ee7
--- /dev/null
+++ b/src/c_version_test.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __cplusplus
+#error Do not compile this file with a C++ compiler
+#endif
+
+// clang-format off
+#include "src/gav1/version.h"
+// clang-format on
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define ASSERT_EQ(a, b)                                                      \
+  do {                                                                       \
+    if ((a) != (b)) {                                                        \
+      fprintf(stderr, "Assertion failure: (%s) == (%s), at %s:%d\n", #a, #b, \
+              __FILE__, __LINE__);                                           \
+      fprintf(stderr, "C VersionTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+#define ASSERT_NE(a, b)                                                      \
+  do {                                                                       \
+    if ((a) == (b)) {                                                        \
+      fprintf(stderr, "Assertion failure: (%s) != (%s), at %s:%d\n", #a, #b, \
+              __FILE__, __LINE__);                                           \
+      fprintf(stderr, "C VersionTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+#define ASSERT_TRUE(a)                                                   \
+  do {                                                                   \
+    if (!(a)) {                                                          \
+      fprintf(stderr, "Assertion failure: %s, at %s:%d\n", #a, __FILE__, \
+              __LINE__);                                                 \
+      fprintf(stderr, "C VersionTest failed\n");                         \
+      exit(1);                                                           \
+    }                                                                    \
+  } while (0)
+
+#define ASSERT_FALSE(a)                                                     \
+  do {                                                                      \
+    if (a) {                                                                \
+      fprintf(stderr, "Assertion failure: !(%s), at %s:%d\n", #a, __FILE__, \
+              __LINE__);                                                    \
+      fprintf(stderr, "C VersionTest failed\n");                            \
+      exit(1);                                                              \
+    }                                                                       \
+  } while (0)
+
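+/* Libgav1GetVersion() packs the version as 0x00MMmmpp, with the major, minor,
+ * and patch numbers in the low three bytes. As a worked example, version
+ * 0.19.0 packs to (0 << 16) | (19 << 8) | 0 == 0x1300. */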
+static void VersionTestGetVersion(void) {
+  const int library_version = Libgav1GetVersion();
+  ASSERT_EQ((library_version >> 24) & 0xff, 0);
+  // Note that if we link against a shared object, there is a potential for a
+  // mismatch if a different library version is loaded at runtime.
+  ASSERT_EQ((library_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+  ASSERT_EQ((library_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+  ASSERT_EQ(library_version & 0xff, LIBGAV1_PATCH_VERSION);
+
+  const int header_version = LIBGAV1_VERSION;
+  ASSERT_EQ((header_version >> 24) & 0xff, 0);
+  ASSERT_EQ((header_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+  ASSERT_EQ((header_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+  ASSERT_EQ(header_version & 0xff, LIBGAV1_PATCH_VERSION);
+}
+
+static void VersionTestGetVersionString(void) {
+  const char* version = Libgav1GetVersionString();
+  ASSERT_NE(version, NULL);
+}
+
+static void VersionTestGetBuildConfiguration(void) {
+  const char* config = Libgav1GetBuildConfiguration();
+  ASSERT_NE(config, NULL);
+}
+
+int main(void) {
+  fprintf(stderr, "C VersionTest started\n");
+  VersionTestGetVersion();
+  VersionTestGetVersionString();
+  VersionTestGetBuildConfiguration();
+  fprintf(stderr, "C VersionTest passed\n");
+  return 0;
+}
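+
+// A minimal sketch (illustrative only, kept out of the build) of how an
+// application could use the packing verified above to detect a mismatch
+// between the headers it was compiled against and the library loaded at
+// runtime. The function name is hypothetical; the sketch relies only on
+// Libgav1GetVersion() and the LIBGAV1_* macros exercised in this file.
+#if 0
+static int VersionsAreCompatible(void) {
+  // Libgav1GetVersion() packs the version as
+  // (major << 16) | (minor << 8) | patch.
+  const int library_major = (Libgav1GetVersion() >> 16) & 0xff;
+  // One plausible policy: require the major version to match the headers.
+  return library_major == LIBGAV1_MAJOR_VERSION;
+}
+#endif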
diff --git a/src/decoder.cc b/src/decoder.cc
new file mode 100644 (file)
index 0000000..b9e43e0
--- /dev/null
@@ -0,0 +1,119 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder.h"
+
+#include <memory>
+#include <new>
+
+#include "src/decoder_impl.h"
+
+extern "C" {
+
+Libgav1StatusCode Libgav1DecoderCreate(const Libgav1DecoderSettings* settings,
+                                       Libgav1Decoder** decoder_out) {
+  std::unique_ptr<libgav1::Decoder> cxx_decoder(new (std::nothrow)
+                                                    libgav1::Decoder());
+  if (cxx_decoder == nullptr) return kLibgav1StatusOutOfMemory;
+
+  libgav1::DecoderSettings cxx_settings;
+  cxx_settings.threads = settings->threads;
+  cxx_settings.frame_parallel = settings->frame_parallel != 0;
+  cxx_settings.blocking_dequeue = settings->blocking_dequeue != 0;
+  cxx_settings.on_frame_buffer_size_changed =
+      settings->on_frame_buffer_size_changed;
+  cxx_settings.get_frame_buffer = settings->get_frame_buffer;
+  cxx_settings.release_frame_buffer = settings->release_frame_buffer;
+  cxx_settings.release_input_buffer = settings->release_input_buffer;
+  cxx_settings.callback_private_data = settings->callback_private_data;
+  cxx_settings.output_all_layers = settings->output_all_layers != 0;
+  cxx_settings.operating_point = settings->operating_point;
+  cxx_settings.post_filter_mask = settings->post_filter_mask;
+
+  const Libgav1StatusCode status = cxx_decoder->Init(&cxx_settings);
+  if (status == kLibgav1StatusOk) {
+    *decoder_out = reinterpret_cast<Libgav1Decoder*>(cxx_decoder.release());
+  }
+  return status;
+}
+
+void Libgav1DecoderDestroy(Libgav1Decoder* decoder) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  delete cxx_decoder;
+}
+
+Libgav1StatusCode Libgav1DecoderEnqueueFrame(Libgav1Decoder* decoder,
+                                             const uint8_t* data, size_t size,
+                                             int64_t user_private_data,
+                                             void* buffer_private_data) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  return cxx_decoder->EnqueueFrame(data, size, user_private_data,
+                                   buffer_private_data);
+}
+
+Libgav1StatusCode Libgav1DecoderDequeueFrame(
+    Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  return cxx_decoder->DequeueFrame(out_ptr);
+}
+
+Libgav1StatusCode Libgav1DecoderSignalEOS(Libgav1Decoder* decoder) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  return cxx_decoder->SignalEOS();
+}
+
+int Libgav1DecoderGetMaxBitdepth() {
+  return libgav1::Decoder::GetMaxBitdepth();
+}
+
+}  // extern "C"
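+
+// A minimal end-to-end usage sketch of the C functions defined above
+// (illustrative only, kept out of the build). The function name is
+// hypothetical, and it assumes Libgav1DecoderSettingsInitDefault() from
+// src/gav1/decoder_settings.h for settings initialization; real callers
+// should also handle kLibgav1StatusTryAgain and
+// kLibgav1StatusNothingToDequeue.
+#if 0
+Libgav1StatusCode DecodeOneTemporalUnit(const uint8_t* data, size_t size) {
+  Libgav1DecoderSettings settings;
+  Libgav1DecoderSettingsInitDefault(&settings);
+  Libgav1Decoder* decoder = NULL;
+  Libgav1StatusCode status = Libgav1DecoderCreate(&settings, &decoder);
+  if (status != kLibgav1StatusOk) return status;
+  status = Libgav1DecoderEnqueueFrame(decoder, data, size,
+                                      /*user_private_data=*/0,
+                                      /*buffer_private_data=*/NULL);
+  if (status == kLibgav1StatusOk) {
+    const Libgav1DecoderBuffer* buffer = NULL;
+    status = Libgav1DecoderDequeueFrame(decoder, &buffer);
+    // On success, |buffer| may still be NULL if the temporal unit contained
+    // no displayable frame.
+  }
+  Libgav1DecoderDestroy(decoder);
+  return status;
+}
+#endif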
+
+namespace libgav1 {
+
+Decoder::Decoder() = default;
+
+Decoder::~Decoder() = default;
+
+StatusCode Decoder::Init(const DecoderSettings* const settings) {
+  if (impl_ != nullptr) return kStatusAlready;
+  if (settings != nullptr) settings_ = *settings;
+  return DecoderImpl::Create(&settings_, &impl_);
+}
+
+StatusCode Decoder::EnqueueFrame(const uint8_t* data, const size_t size,
+                                 int64_t user_private_data,
+                                 void* buffer_private_data) {
+  if (impl_ == nullptr) return kStatusNotInitialized;
+  return impl_->EnqueueFrame(data, size, user_private_data,
+                             buffer_private_data);
+}
+
+StatusCode Decoder::DequeueFrame(const DecoderBuffer** out_ptr) {
+  if (impl_ == nullptr) return kStatusNotInitialized;
+  return impl_->DequeueFrame(out_ptr);
+}
+
+StatusCode Decoder::SignalEOS() {
+  if (impl_ == nullptr) return kStatusNotInitialized;
+  // In non-frame-parallel mode, we have to release all the references. This
+  // simply means replacing the |impl_| with a new instance so that all the
+  // existing references are released and the state is cleared.
+  impl_ = nullptr;
+  return DecoderImpl::Create(&settings_, &impl_);
+}
+
+// static.
+int Decoder::GetMaxBitdepth() { return DecoderImpl::GetMaxBitdepth(); }
+
+}  // namespace libgav1
diff --git a/src/decoder_buffer_test.cc b/src/decoder_buffer_test.cc
new file mode 100644 (file)
index 0000000..b1d8bb8
--- /dev/null
@@ -0,0 +1,38 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder_buffer.h"
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+// Tests the emulation of C++ enumerators by constexpr constants.
+TEST(DecoderBufferTest, EnumTest) {
+  ColorRange color_range = kLibgav1ColorRangeFull;
+
+  // Verify that we get the -Wswitch warning unless the switch statement
+  // handles both kColorRangeStudio and kColorRangeFull:
+  //   enumeration value 'kLibgav1ColorRangeFull' not handled in switch
+  switch (color_range) {
+    case kColorRangeStudio:
+      break;
+    case kColorRangeFull:
+      break;
+  }
+}
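+
+// For contrast, a switch like the following (deliberately kept out of the
+// build) omits kColorRangeFull and would trigger the warning quoted above:
+#if 0
+  switch (color_range) {
+    case kColorRangeStudio:
+      break;
+  }
+#endif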
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/decoder_impl.cc b/src/decoder_impl.cc
new file mode 100644 (file)
index 0000000..e8de64a
--- /dev/null
@@ -0,0 +1,1740 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/decoder_impl.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstring>
+#include <iterator>
+#include <new>
+#include <utility>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/film_grain.h"
+#include "src/frame_buffer_utils.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/prediction_mask.h"
+#include "src/threading_strategy.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kMaxBlockWidth4x4 = 32;
+constexpr int kMaxBlockHeight4x4 = 32;
+
+// Computes the bottom border size in pixels. If CDEF, loop restoration, or
+// SuperRes is enabled, adds extra border pixels so that those steps can happen
+// nearly in-place (a few extra rows instead of an entire frame buffer).
+// The logic in this function should match the corresponding logic for
+// |vertical_shift| in the PostFilter constructor.
+int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration,
+                          const bool do_superres, const int subsampling_y) {
+  int extra_border = 0;
+  if (do_cdef) {
+    extra_border += kCdefBorder;
+  } else if (do_restoration) {
+    // If CDEF is enabled, loop restoration is safe without extra border.
+    extra_border += kRestorationVerticalBorder;
+  }
+  if (do_superres) extra_border += kSuperResVerticalBorder;
+  // Double the number of extra bottom border pixels if the bottom border will
+  // be subsampled.
+  extra_border <<= subsampling_y;
+  return Align(kBorderPixels + extra_border, 2);  // Must be a multiple of 2.
+}
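+// As a worked instance of the above: with CDEF and SuperRes enabled and
+// subsampling_y == 1, the loop restoration term is skipped (the CDEF border
+// already makes restoration safe), so the result is
+// Align(kBorderPixels + ((kCdefBorder + kSuperResVerticalBorder) << 1), 2).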
+
+// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding on
+// to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first
+// |count| condition variables in
+// |frame_scratch_buffer->superblock_row_progress_condvar|.
+void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer,
+                            int count) {
+  {
+    std::lock_guard<std::mutex> lock(
+        frame_scratch_buffer->superblock_row_mutex);
+    frame_scratch_buffer->tile_decoding_failed = true;
+  }
+  std::condition_variable* const condvars =
+      frame_scratch_buffer->superblock_row_progress_condvar.get();
+  for (int i = 0; i < count; ++i) {
+    condvars[i].notify_one();
+  }
+}
+
+// Helper class that releases the frame scratch buffer in the destructor.
+class FrameScratchBufferReleaser {
+ public:
+  FrameScratchBufferReleaser(
+      FrameScratchBufferPool* frame_scratch_buffer_pool,
+      std::unique_ptr<FrameScratchBuffer>* frame_scratch_buffer)
+      : frame_scratch_buffer_pool_(frame_scratch_buffer_pool),
+        frame_scratch_buffer_(frame_scratch_buffer) {}
+  ~FrameScratchBufferReleaser() {
+    frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_));
+  }
+
+ private:
+  FrameScratchBufferPool* const frame_scratch_buffer_pool_;
+  std::unique_ptr<FrameScratchBuffer>* const frame_scratch_buffer_;
+};
+
+// Sets the |frame|'s segmentation map for two cases: segmentation disabled
+// (the map is cleared to all 0s) and update_map false (the map is copied from
+// |prev_segment_ids|). The third case (update_map true) is handled in
+// Tile::DecodeBlock().
+void SetSegmentationMap(const ObuFrameHeader& frame_header,
+                        const SegmentationMap* prev_segment_ids,
+                        RefCountedBuffer* const frame) {
+  if (!frame_header.segmentation.enabled) {
+    // All segment_id's are 0.
+    frame->segmentation_map()->Clear();
+  } else if (!frame_header.segmentation.update_map) {
+    // Copy from prev_segment_ids.
+    if (prev_segment_ids == nullptr) {
+      // Treat a null prev_segment_ids pointer as if it pointed to a
+      // segmentation map containing all 0s.
+      frame->segmentation_map()->Clear();
+    } else {
+      frame->segmentation_map()->CopyFrom(*prev_segment_ids);
+    }
+  }
+}
+
+StatusCode DecodeTilesNonFrameParallel(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter) {
+  // Decode in superblock row order.
+  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+  std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+  if (tile_scratch_buffer == nullptr) return kStatusOutOfMemory;
+  for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+       row4x4 += block_width4x4) {
+    for (const auto& tile_ptr : tiles) {
+      if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+              row4x4, tile_scratch_buffer.get())) {
+        return kStatusUnknownError;
+      }
+    }
+    post_filter->ApplyFilteringForOneSuperBlockRow(
+        row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+        /*do_deblock=*/true);
+  }
+  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+      std::move(tile_scratch_buffer));
+  return kStatusOk;
+}
+
+StatusCode DecodeTilesThreadedNonFrameParallel(
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter,
+    BlockingCounterWithStatus* const pending_tiles) {
+  ThreadingStrategy& threading_strategy =
+      frame_scratch_buffer->threading_strategy;
+  const int num_workers = threading_strategy.tile_thread_count();
+  BlockingCounterWithStatus pending_workers(num_workers);
+  std::atomic<int> tile_counter(0);
+  const int tile_count = static_cast<int>(tiles.size());
+  bool tile_decoding_failed = false;
+  // Submit tile decoding jobs to the thread pool.
+  for (int i = 0; i < num_workers; ++i) {
+    threading_strategy.tile_thread_pool()->Schedule([&tiles, tile_count,
+                                                     &tile_counter,
+                                                     &pending_workers,
+                                                     &pending_tiles]() {
+      bool failed = false;
+      int index;
+      while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+             tile_count) {
+        if (!failed) {
+          const auto& tile_ptr = tiles[index];
+          if (!tile_ptr->ParseAndDecode()) {
+            LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+            failed = true;
+          }
+        } else {
+          pending_tiles->Decrement(false);
+        }
+      }
+      pending_workers.Decrement(!failed);
+    });
+  }
+  // Have the current thread participate in tile decoding.
+  int index;
+  while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+         tile_count) {
+    if (!tile_decoding_failed) {
+      const auto& tile_ptr = tiles[index];
+      if (!tile_ptr->ParseAndDecode()) {
+        LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+        tile_decoding_failed = true;
+      }
+    } else {
+      pending_tiles->Decrement(false);
+    }
+  }
+  // Wait until all the workers are done. This ensures that all the tiles have
+  // been parsed.
+  tile_decoding_failed |= !pending_workers.Wait();
+  // Wait until all the tiles have been decoded.
+  tile_decoding_failed |= !pending_tiles->Wait();
+  if (tile_decoding_failed) return kStatusUnknownError;
+  assert(threading_strategy.post_filter_thread_pool() != nullptr);
+  post_filter->ApplyFilteringThreaded();
+  return kStatusOk;
+}
+
+StatusCode DecodeTilesFrameParallel(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    const SymbolDecoderContext& saved_symbol_decoder_context,
+    const SegmentationMap* const prev_segment_ids,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+  // Parse the frame.
+  for (const auto& tile : tiles) {
+    if (!tile->Parse()) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse tile number: %d\n", tile->number());
+      return kStatusUnknownError;
+    }
+  }
+  if (frame_header.enable_frame_end_update_cdf) {
+    frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+  }
+  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+  // Mark frame as parsed.
+  current_frame->SetFrameState(kFrameStateParsed);
+  std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+  if (tile_scratch_buffer == nullptr) {
+    return kStatusOutOfMemory;
+  }
+  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+  // Decode in superblock row order (inter prediction in the Tile class will
+  // block until the required superblocks in the reference frame are decoded).
+  for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+       row4x4 += block_width4x4) {
+    for (const auto& tile_ptr : tiles) {
+      if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+              row4x4, tile_scratch_buffer.get())) {
+        LIBGAV1_DLOG(ERROR, "Failed to decode tile number: %d\n",
+                     tile_ptr->number());
+        return kStatusUnknownError;
+      }
+    }
+    const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+        row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+        /*do_deblock=*/true);
+    if (progress_row >= 0) {
+      current_frame->SetProgress(progress_row);
+    }
+  }
+  // Mark frame as decoded (we no longer care about row-level progress since the
+  // entire frame has been decoded).
+  current_frame->SetFrameState(kFrameStateDecoded);
+  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+      std::move(tile_scratch_buffer));
+  return kStatusOk;
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Applies the
+// deblocking filter for tile boundaries for the superblock row at |row4x4|.
+void ApplyDeblockingFilterForTileBoundaries(
+    PostFilter* const post_filter, const std::unique_ptr<Tile>* tile_row_base,
+    const ObuFrameHeader& frame_header, int row4x4, int block_width4x4,
+    int tile_columns, bool decode_entire_tiles_in_worker_threads) {
+  // Apply vertical deblock filtering for the first 64 columns of each tile.
+  for (int tile_column = 0; tile_column < tile_columns; ++tile_column) {
+    const Tile& tile = *tile_row_base[tile_column];
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeVertical, row4x4, tile.column4x4_start(),
+        tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+  }
+  if (decode_entire_tiles_in_worker_threads &&
+      row4x4 == tile_row_base[0]->row4x4_start()) {
+    // This is the first superblock row of a tile row. In this case, apply
+    // horizontal deblock filtering for the entire superblock row.
+    post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0,
+                                    frame_header.columns4x4, block_width4x4);
+  } else {
+    // Apply horizontal deblock filtering for the first 64 columns of the
+    // first tile.
+    const Tile& first_tile = *tile_row_base[0];
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(),
+        first_tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+    // Apply horizontal deblock filtering for the last 64 columns of the
+    // previous tile and the first 64 columns of the current tile.
+    for (int tile_column = 1; tile_column < tile_columns; ++tile_column) {
+      const Tile& tile = *tile_row_base[tile_column];
+      // If the previous tile has more than 64 columns, then include those
+      // for the horizontal deblock.
+      const Tile& previous_tile = *tile_row_base[tile_column - 1];
+      const int column4x4_start =
+          tile.column4x4_start() -
+          ((tile.column4x4_start() - kNum4x4InLoopFilterUnit !=
+            previous_tile.column4x4_start())
+               ? kNum4x4InLoopFilterUnit
+               : 0);
+      post_filter->ApplyDeblockFilter(
+          kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+          tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+    }
+    // Apply horizontal deblock filtering for the last 64 columns of the
+    // last tile.
+    const Tile& last_tile = *tile_row_base[tile_columns - 1];
+    // Identify the last column4x4 value and do horizontal filtering for
+    // that column4x4. The value of last column4x4 is the nearest multiple
+    // of 16 that is before tile.column4x4_end().
+    const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15;
+    // If column4x4_start is the same as tile.column4x4_start() then it
+    // means that the last tile has <= 64 columns. So there is nothing left
+    // to deblock (since it was already deblocked in the loop above).
+    if (column4x4_start != last_tile.column4x4_start()) {
+      post_filter->ApplyDeblockFilter(
+          kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+          last_tile.column4x4_end(), block_width4x4);
+    }
+  }
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Decodes the
+// superblock row starting at |row4x4| for tile at index |tile_index| in the
+// list of tiles |tiles|. If the decoding is successful, then it does the
+// following:
+//   * Schedule the next superblock row in the current tile column for decoding
+//     (the next superblock row may be in a different tile than the current
+//     one).
+//   * If an entire superblock row of the frame has been decoded, it notifies
+//     the waiters (if there are any).
+void DecodeSuperBlockRowInTile(
+    const Vector<std::unique_ptr<Tile>>& tiles, size_t tile_index, int row4x4,
+    const int superblock_size4x4, const int tile_columns,
+    const int superblock_rows, FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter, BlockingCounter* const pending_jobs) {
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+  if (scratch_buffer == nullptr) {
+    SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+    return;
+  }
+  Tile& tile = *tiles[tile_index];
+  const bool ok = tile.ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+      row4x4, scratch_buffer.get());
+  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+      std::move(scratch_buffer));
+  if (!ok) {
+    SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+    return;
+  }
+  if (post_filter->DoDeblock()) {
+    // Apply vertical deblock filtering for all the columns in this tile except
+    // for the first 64 columns.
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeVertical, row4x4,
+        tile.column4x4_start() + kNum4x4InLoopFilterUnit, tile.column4x4_end(),
+        superblock_size4x4);
+    // Apply horizontal deblock filtering for all the columns in this tile
+    // except for the first and the last 64 columns.
+    // Note about the last tile of each row: For the last tile, column4x4_end
+    // may not be a multiple of 16. In that case it is still okay to simply
+    // subtract 16 since ApplyDeblockFilter() will only do the filters in
+    // increments of 64 columns (or 32 columns for chroma with subsampling).
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeHorizontal, row4x4,
+        tile.column4x4_start() + kNum4x4InLoopFilterUnit,
+        tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4);
+  }
+  const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4);
+  const int index = row4x4 >> superblock_size4x4_log2;
+  int* const superblock_row_progress =
+      frame_scratch_buffer->superblock_row_progress.get();
+  std::condition_variable* const superblock_row_progress_condvar =
+      frame_scratch_buffer->superblock_row_progress_condvar.get();
+  bool notify;
+  {
+    std::lock_guard<std::mutex> lock(
+        frame_scratch_buffer->superblock_row_mutex);
+    notify = ++superblock_row_progress[index] == tile_columns;
+  }
+  if (notify) {
+    // We are done decoding this superblock row. Notify the post filtering
+    // thread.
+    superblock_row_progress_condvar[index].notify_one();
+  }
+  // Schedule the next superblock row (if one exists).
+  ThreadPool& thread_pool =
+      *frame_scratch_buffer->threading_strategy.thread_pool();
+  const int next_row4x4 = row4x4 + superblock_size4x4;
+  if (!tile.IsRow4x4Inside(next_row4x4)) {
+    tile_index += tile_columns;
+  }
+  if (tile_index >= tiles.size()) return;
+  pending_jobs->IncrementBy(1);
+  thread_pool.Schedule([&tiles, tile_index, next_row4x4, superblock_size4x4,
+                        tile_columns, superblock_rows, frame_scratch_buffer,
+                        post_filter, pending_jobs]() {
+    DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4,
+                              superblock_size4x4, tile_columns, superblock_rows,
+                              frame_scratch_buffer, post_filter, pending_jobs);
+    pending_jobs->Decrement();
+  });
+}
+
+StatusCode DecodeTilesThreadedFrameParallel(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    const SymbolDecoderContext& saved_symbol_decoder_context,
+    const SegmentationMap* const prev_segment_ids,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+  // Parse the frame.
+  ThreadPool& thread_pool =
+      *frame_scratch_buffer->threading_strategy.thread_pool();
+  std::atomic<int> tile_counter(0);
+  const int tile_count = static_cast<int>(tiles.size());
+  const int num_workers = thread_pool.num_threads();
+  BlockingCounterWithStatus parse_workers(num_workers);
+  // Submit tile parsing jobs to the thread pool.
+  for (int i = 0; i < num_workers; ++i) {
+    thread_pool.Schedule([&tiles, tile_count, &tile_counter, &parse_workers]() {
+      bool failed = false;
+      int index;
+      while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+             tile_count) {
+        if (!failed) {
+          const auto& tile_ptr = tiles[index];
+          if (!tile_ptr->Parse()) {
+            LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+            failed = true;
+          }
+        }
+      }
+      parse_workers.Decrement(!failed);
+    });
+  }
+
+  // Have the current thread participate in parsing.
+  bool failed = false;
+  int index;
+  while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+         tile_count) {
+    if (!failed) {
+      const auto& tile_ptr = tiles[index];
+      if (!tile_ptr->Parse()) {
+        LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+        failed = true;
+      }
+    }
+  }
+
+  // Wait until all the parse workers are done. This ensures that all the tiles
+  // have been parsed.
+  if (!parse_workers.Wait() || failed) {
+    return kStatusUnknownError;
+  }
+  if (frame_header.enable_frame_end_update_cdf) {
+    frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+  }
+  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+  current_frame->SetFrameState(kFrameStateParsed);
+
+  // Decode the frame.
+  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+  const int block_width4x4_log2 =
+      sequence_header.use_128x128_superblock ? 5 : 4;
+  const int superblock_rows =
+      (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2;
+  if (!frame_scratch_buffer->superblock_row_progress.Resize(superblock_rows) ||
+      !frame_scratch_buffer->superblock_row_progress_condvar.Resize(
+          superblock_rows)) {
+    return kStatusOutOfMemory;
+  }
+  int* const superblock_row_progress =
+      frame_scratch_buffer->superblock_row_progress.get();
+  memset(superblock_row_progress, 0,
+         superblock_rows * sizeof(superblock_row_progress[0]));
+  frame_scratch_buffer->tile_decoding_failed = false;
+  const int tile_columns = frame_header.tile_info.tile_columns;
+  const bool decode_entire_tiles_in_worker_threads =
+      num_workers >= tile_columns;
+  BlockingCounter pending_jobs(
+      decode_entire_tiles_in_worker_threads ? num_workers : tile_columns);
+  if (decode_entire_tiles_in_worker_threads) {
+    // Submit tile decoding jobs to the thread pool.
+    tile_counter = 0;
+    for (int i = 0; i < num_workers; ++i) {
+      thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs,
+                            frame_scratch_buffer, superblock_rows]() {
+        bool failed = false;
+        int index;
+        while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+               tile_count) {
+          if (failed) continue;
+          const auto& tile_ptr = tiles[index];
+          if (!tile_ptr->Decode(
+                  &frame_scratch_buffer->superblock_row_mutex,
+                  frame_scratch_buffer->superblock_row_progress.get(),
+                  frame_scratch_buffer->superblock_row_progress_condvar
+                      .get())) {
+            LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+            failed = true;
+            SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+          }
+        }
+        pending_jobs.Decrement();
+      });
+    }
+  } else {
+    // Schedule the jobs for first tile row.
+    for (int tile_index = 0; tile_index < tile_columns; ++tile_index) {
+      thread_pool.Schedule([&tiles, tile_index, block_width4x4, tile_columns,
+                            superblock_rows, frame_scratch_buffer, post_filter,
+                            &pending_jobs]() {
+        DecodeSuperBlockRowInTile(
+            tiles, tile_index, 0, block_width4x4, tile_columns, superblock_rows,
+            frame_scratch_buffer, post_filter, &pending_jobs);
+        pending_jobs.Decrement();
+      });
+    }
+  }
+
+  // Current thread will do the post filters.
+  std::condition_variable* const superblock_row_progress_condvar =
+      frame_scratch_buffer->superblock_row_progress_condvar.get();
+  const std::unique_ptr<Tile>* tile_row_base = &tiles[0];
+  for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4;
+       row4x4 += block_width4x4, ++index) {
+    if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) {
+      tile_row_base += tile_columns;
+    }
+    {
+      std::unique_lock<std::mutex> lock(
+          frame_scratch_buffer->superblock_row_mutex);
+      while (superblock_row_progress[index] != tile_columns &&
+             !frame_scratch_buffer->tile_decoding_failed) {
+        superblock_row_progress_condvar[index].wait(lock);
+      }
+      if (frame_scratch_buffer->tile_decoding_failed) break;
+    }
+    if (post_filter->DoDeblock()) {
+      // Apply the deblocking filter for the tile boundaries of this superblock
+      // row. The deblocking filter for the internal blocks is applied in the
+      // tile worker threads, so this thread only has to handle the tile
+      // boundaries.
+      ApplyDeblockingFilterForTileBoundaries(
+          post_filter, tile_row_base, frame_header, row4x4, block_width4x4,
+          tile_columns, decode_entire_tiles_in_worker_threads);
+    }
+    // Apply all the post filters other than deblocking.
+    const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+        row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+        /*do_deblock=*/false);
+    if (progress_row >= 0) {
+      current_frame->SetProgress(progress_row);
+    }
+  }
+  // Wait until all the pending jobs are done. This ensures that all the tiles
+  // have been decoded and wrapped up.
+  pending_jobs.Wait();
+  {
+    std::lock_guard<std::mutex> lock(
+        frame_scratch_buffer->superblock_row_mutex);
+    if (frame_scratch_buffer->tile_decoding_failed) {
+      return kStatusUnknownError;
+    }
+  }
+
+  current_frame->SetFrameState(kFrameStateDecoded);
+  return kStatusOk;
+}
+
+}  // namespace
+
+// static
+StatusCode DecoderImpl::Create(const DecoderSettings* settings,
+                               std::unique_ptr<DecoderImpl>* output) {
+  if (settings->threads <= 0) {
+    LIBGAV1_DLOG(ERROR, "Invalid settings->threads: %d.", settings->threads);
+    return kStatusInvalidArgument;
+  }
+  if (settings->frame_parallel) {
+    if (settings->release_input_buffer == nullptr) {
+      LIBGAV1_DLOG(ERROR,
+                   "release_input_buffer callback must not be null when "
+                   "frame_parallel is true.");
+      return kStatusInvalidArgument;
+    }
+  }
+  std::unique_ptr<DecoderImpl> impl(new (std::nothrow) DecoderImpl(settings));
+  if (impl == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate DecoderImpl.");
+    return kStatusOutOfMemory;
+  }
+  const StatusCode status = impl->Init();
+  if (status != kStatusOk) return status;
+  *output = std::move(impl);
+  return kStatusOk;
+}
+
+DecoderImpl::DecoderImpl(const DecoderSettings* settings)
+    : buffer_pool_(settings->on_frame_buffer_size_changed,
+                   settings->get_frame_buffer, settings->release_frame_buffer,
+                   settings->callback_private_data),
+      settings_(*settings) {
+  dsp::DspInit();
+}
+
+DecoderImpl::~DecoderImpl() {
+  // Clean up and wait until all the threads have stopped. We just have to pass
+  // in a dummy status that is not kStatusOk or kStatusTryAgain to trigger the
+  // path that clears all the threads and structs.
+  SignalFailure(kStatusUnknownError);
+  // Release any other frame buffer references that we may be holding on to.
+  ReleaseOutputFrame();
+  output_frame_queue_.Clear();
+  for (auto& reference_frame : state_.reference_frame) {
+    reference_frame = nullptr;
+  }
+}
+
+StatusCode DecoderImpl::Init() {
+  if (!output_frame_queue_.Init(kMaxLayers)) {
+    LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed.");
+    return kStatusOutOfMemory;
+  }
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue(
+    const uint8_t* data, size_t size) {
+  is_frame_parallel_ = false;
+  if (settings_.frame_parallel) {
+    DecoderState state;
+    std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+        data, size, settings_.operating_point, &buffer_pool_, &state));
+    if (obu == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+      return kStatusOutOfMemory;
+    }
+    RefCountedBufferPtr current_frame;
+    const StatusCode status = obu->ParseOneFrame(&current_frame);
+    if (status != kStatusOk) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+      return status;
+    }
+    current_frame = nullptr;
+    // We assume that the first frame that was parsed will contain the frame
+    // header. This assumption is usually true in practice. So we will simply
+    // not use frame parallel mode if this is not the case.
+    if (settings_.threads > 1 &&
+        !InitializeThreadPoolsForFrameParallel(
+            settings_.threads, obu->frame_header().tile_info.tile_count,
+            obu->frame_header().tile_info.tile_columns, &frame_thread_pool_,
+            &frame_scratch_buffer_pool_)) {
+      return kStatusOutOfMemory;
+    }
+  }
+  const int max_allowed_frames =
+      (frame_thread_pool_ != nullptr) ? frame_thread_pool_->num_threads() : 1;
+  assert(max_allowed_frames > 0);
+  if (!temporal_units_.Init(max_allowed_frames)) {
+    LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed.");
+    return kStatusOutOfMemory;
+  }
+  is_frame_parallel_ = frame_thread_pool_ != nullptr;
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size,
+                                     int64_t user_private_data,
+                                     void* buffer_private_data) {
+  if (data == nullptr || size == 0) return kStatusInvalidArgument;
+  if (HasFailure()) return kStatusUnknownError;
+  if (!seen_first_frame_) {
+    seen_first_frame_ = true;
+    const StatusCode status =
+        InitializeFrameThreadPoolAndTemporalUnitQueue(data, size);
+    if (status != kStatusOk) {
+      return SignalFailure(status);
+    }
+  }
+  if (temporal_units_.Full()) {
+    return kStatusTryAgain;
+  }
+  if (is_frame_parallel_) {
+    return ParseAndSchedule(data, size, user_private_data, buffer_private_data);
+  }
+  TemporalUnit temporal_unit(data, size, user_private_data,
+                             buffer_private_data);
+  temporal_units_.Push(std::move(temporal_unit));
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::SignalFailure(StatusCode status) {
+  if (status == kStatusOk || status == kStatusTryAgain) return status;
+  // Set the |failure_status_| first so that any pending jobs in
+  // |frame_thread_pool_| will exit right away when the thread pool is being
+  // released below.
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    failure_status_ = status;
+  }
+  // Make sure all waiting threads exit.
+  buffer_pool_.Abort();
+  frame_thread_pool_ = nullptr;
+  while (!temporal_units_.Empty()) {
+    if (settings_.release_input_buffer != nullptr) {
+      settings_.release_input_buffer(
+          settings_.callback_private_data,
+          temporal_units_.Front().buffer_private_data);
+    }
+    temporal_units_.Pop();
+  }
+  return status;
+}
+
+// DequeueFrame() follows this policy to avoid holding unnecessary frame
+// buffer references in |output_frame_|: |output_frame_| must be null unless
+// DequeueFrame() returns kStatusOk and sets |*out_ptr| to a non-null frame.
+StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) {
+  if (out_ptr == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Invalid argument: out_ptr == nullptr.");
+    return kStatusInvalidArgument;
+  }
+  // We assume a call to DequeueFrame() indicates that the caller is no longer
+  // using the previous output frame, so we can release it.
+  ReleaseOutputFrame();
+  if (temporal_units_.Empty()) {
+    // No input frames to decode.
+    *out_ptr = nullptr;
+    return kStatusNothingToDequeue;
+  }
+  TemporalUnit& temporal_unit = temporal_units_.Front();
+  if (!is_frame_parallel_) {
+    // If |output_frame_queue_| is not empty, then return the first frame from
+    // that queue.
+    if (!output_frame_queue_.Empty()) {
+      RefCountedBufferPtr frame = std::move(output_frame_queue_.Front());
+      output_frame_queue_.Pop();
+      buffer_.user_private_data = temporal_unit.user_private_data;
+      if (output_frame_queue_.Empty()) {
+        temporal_units_.Pop();
+      }
+      const StatusCode status = CopyFrameToOutputBuffer(frame);
+      if (status != kStatusOk) {
+        return status;
+      }
+      *out_ptr = &buffer_;
+      return kStatusOk;
+    }
+    // Decode the next available temporal unit and return.
+    const StatusCode status = DecodeTemporalUnit(temporal_unit, out_ptr);
+    if (status != kStatusOk) {
+      // In case of failure, discard all the output frames that we may be
+      // holding on references to.
+      output_frame_queue_.Clear();
+    }
+    if (settings_.release_input_buffer != nullptr) {
+      settings_.release_input_buffer(settings_.callback_private_data,
+                                     temporal_unit.buffer_private_data);
+    }
+    if (output_frame_queue_.Empty()) {
+      temporal_units_.Pop();
+    }
+    return status;
+  }
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (settings_.blocking_dequeue) {
+      while (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+        decoded_condvar_.wait(lock);
+      }
+    } else {
+      if (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+        return kStatusTryAgain;
+      }
+    }
+    if (failure_status_ != kStatusOk) {
+      const StatusCode failure_status = failure_status_;
+      lock.unlock();
+      return SignalFailure(failure_status);
+    }
+  }
+  if (settings_.release_input_buffer != nullptr &&
+      !temporal_unit.released_input_buffer) {
+    temporal_unit.released_input_buffer = true;
+    settings_.release_input_buffer(settings_.callback_private_data,
+                                   temporal_unit.buffer_private_data);
+  }
+  if (temporal_unit.status != kStatusOk) {
+    temporal_units_.Pop();
+    return SignalFailure(temporal_unit.status);
+  }
+  if (!temporal_unit.has_displayable_frame) {
+    *out_ptr = nullptr;
+    temporal_units_.Pop();
+    return kStatusOk;
+  }
+  assert(temporal_unit.output_layer_count > 0);
+  StatusCode status = CopyFrameToOutputBuffer(
+      temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame);
+  temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame =
+      nullptr;
+  if (status != kStatusOk) {
+    temporal_units_.Pop();
+    return SignalFailure(status);
+  }
+  buffer_.user_private_data = temporal_unit.user_private_data;
+  *out_ptr = &buffer_;
+  if (--temporal_unit.output_layer_count == 0) {
+    temporal_units_.Pop();
+  }
+  return kStatusOk;
+}
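+
+// A caller-side sketch (illustrative only, kept out of the build) of the
+// dequeue contract implemented above, written against the public
+// libgav1::Decoder API from src/gav1/decoder.h. In non-blocking frame
+// parallel mode, kStatusTryAgain means the next temporal unit is still being
+// decoded; kStatusNothingToDequeue means nothing is queued at all.
+#if 0
+StatusCode DrainOneFrame(Decoder* decoder, const DecoderBuffer** buffer) {
+  StatusCode status;
+  do {
+    status = decoder->DequeueFrame(buffer);
+    // A real application would do useful work here instead of spinning.
+  } while (status == kStatusTryAgain);
+  // On kStatusOk, |*buffer| may still be nullptr if the temporal unit had no
+  // displayable frame.
+  return status;
+}
+#endif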
+
+StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size,
+                                         int64_t user_private_data,
+                                         void* buffer_private_data) {
+  TemporalUnit temporal_unit(data, size, user_private_data,
+                             buffer_private_data);
+  std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+      temporal_unit.data, temporal_unit.size, settings_.operating_point,
+      &buffer_pool_, &state_));
+  if (obu == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+    return kStatusOutOfMemory;
+  }
+  if (has_sequence_header_) {
+    obu->set_sequence_header(sequence_header_);
+  }
+  StatusCode status;
+  int position_in_temporal_unit = 0;
+  while (obu->HasData()) {
+    RefCountedBufferPtr current_frame;
+    status = obu->ParseOneFrame(&current_frame);
+    if (status != kStatusOk) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+      return status;
+    }
+    if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+      LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+      return kStatusOutOfMemory;
+    }
+    if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+      LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+      return kStatusOutOfMemory;
+    }
+    if (IsNewSequenceHeader(*obu)) {
+      const ObuSequenceHeader& sequence_header = obu->sequence_header();
+      const Libgav1ImageFormat image_format =
+          ComposeImageFormat(sequence_header.color_config.is_monochrome,
+                             sequence_header.color_config.subsampling_x,
+                             sequence_header.color_config.subsampling_y);
+      const int max_bottom_border = GetBottomBorderPixels(
+          /*do_cdef=*/true, /*do_restoration=*/true,
+          /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+      // TODO(vigneshv): This may not be the right place to call this callback
+      // for the frame parallel case. Investigate and fix it.
+      if (!buffer_pool_.OnFrameBufferSizeChanged(
+              sequence_header.color_config.bitdepth, image_format,
+              sequence_header.max_frame_width, sequence_header.max_frame_height,
+              kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+        LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+        return kStatusUnknownError;
+      }
+    }
+    // This can happen when there are multiple spatial/temporal layers and if
+    // all the layers are outside the current operating point.
+    if (current_frame == nullptr) {
+      continue;
+    }
+    // Note that we cannot set EncodedFrame.temporal_unit here. It will be set
+    // in the code below after |temporal_unit| is std::move'd into the
+    // |temporal_units_| queue.
+    if (!temporal_unit.frames.emplace_back(obu.get(), state_, current_frame,
+                                           position_in_temporal_unit++)) {
+      LIBGAV1_DLOG(ERROR, "temporal_unit.frames.emplace_back failed.");
+      return kStatusOutOfMemory;
+    }
+    state_.UpdateReferenceFrames(current_frame,
+                                 obu->frame_header().refresh_frame_flags);
+  }
+  // This function cannot fail after this point. So it is okay to move the
+  // |temporal_unit| into the |temporal_units_| queue.
+  temporal_units_.Push(std::move(temporal_unit));
+  if (temporal_units_.Back().frames.empty()) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    temporal_units_.Back().has_displayable_frame = false;
+    temporal_units_.Back().decoded = true;
+    return kStatusOk;
+  }
+  for (auto& frame : temporal_units_.Back().frames) {
+    EncodedFrame* const encoded_frame = &frame;
+    encoded_frame->temporal_unit = &temporal_units_.Back();
+    frame_thread_pool_->Schedule([this, encoded_frame]() {
+      if (HasFailure()) return;
+      const StatusCode status = DecodeFrame(encoded_frame);
+      encoded_frame->state = {};
+      encoded_frame->frame = nullptr;
+      TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (failure_status_ != kStatusOk) return;
+      // |temporal_unit|'s status defaults to kStatusOk. So we need to set it
+      // only on error. If |failure_status_| is not kStatusOk at this point, it
+      // means that there has already been a failure, so we don't care about
+      // this subsequent failure; we will simply return the error code of the
+      // first failure.
+      if (status != kStatusOk) {
+        temporal_unit.status = status;
+        if (failure_status_ == kStatusOk) {
+          failure_status_ = status;
+        }
+      }
+      temporal_unit.decoded =
+          ++temporal_unit.decoded_count == temporal_unit.frames.size();
+      if (temporal_unit.decoded && settings_.output_all_layers &&
+          temporal_unit.output_layer_count > 1) {
+        std::sort(
+            temporal_unit.output_layers,
+            temporal_unit.output_layers + temporal_unit.output_layer_count);
+      }
+      if (temporal_unit.decoded || failure_status_ != kStatusOk) {
+        decoded_condvar_.notify_one();
+      }
+    });
+  }
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) {
+  const ObuSequenceHeader& sequence_header = encoded_frame->sequence_header;
+  const ObuFrameHeader& frame_header = encoded_frame->frame_header;
+  RefCountedBufferPtr current_frame = std::move(encoded_frame->frame);
+
+  std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+      frame_scratch_buffer_pool_.Get();
+  if (frame_scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+    return kStatusOutOfMemory;
+  }
+  // |frame_scratch_buffer| will be released when this local variable goes out
+  // of scope (i.e., on any return path in this function).
+  FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+      &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+  StatusCode status;
+  if (!frame_header.show_existing_frame) {
+    if (encoded_frame->tile_buffers.empty()) {
+      // This means that the last call to ParseOneFrame() did not actually
+      // have any tile groups. This could happen in rare cases (for example,
+      // if there is a Metadata OBU after the TileGroup OBU). We currently do
+      // not have a reason to handle those cases, so we simply return kStatusOk.
+      return kStatusOk;
+    }
+    status = DecodeTiles(sequence_header, frame_header,
+                         encoded_frame->tile_buffers, encoded_frame->state,
+                         frame_scratch_buffer.get(), current_frame.get());
+    if (status != kStatusOk) {
+      return status;
+    }
+  } else {
+    if (!current_frame->WaitUntilDecoded()) {
+      return kStatusUnknownError;
+    }
+  }
+  if (!frame_header.show_frame && !frame_header.show_existing_frame) {
+    // This frame is not displayable. Not an error.
+    return kStatusOk;
+  }
+  RefCountedBufferPtr film_grain_frame;
+  status = ApplyFilmGrain(
+      sequence_header, frame_header, current_frame, &film_grain_frame,
+      frame_scratch_buffer->threading_strategy.thread_pool());
+  if (status != kStatusOk) {
+    return status;
+  }
+
+  TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (temporal_unit.has_displayable_frame && !settings_.output_all_layers) {
+    assert(temporal_unit.output_frame_position >= 0);
+    // A displayable frame was already found in this temporal unit. This can
+    // happen if there are multiple spatial/temporal layers. Since
+    // |settings_.output_all_layers| is false, we will output only the last
+    // displayable frame.
+    if (temporal_unit.output_frame_position >
+        encoded_frame->position_in_temporal_unit) {
+      return kStatusOk;
+    }
+    // Replace any output frame that we may have seen before with the current
+    // frame.
+    assert(temporal_unit.output_layer_count == 1);
+    --temporal_unit.output_layer_count;
+  }
+  temporal_unit.has_displayable_frame = true;
+  temporal_unit.output_layers[temporal_unit.output_layer_count].frame =
+      std::move(film_grain_frame);
+  temporal_unit.output_layers[temporal_unit.output_layer_count]
+      .position_in_temporal_unit = encoded_frame->position_in_temporal_unit;
+  ++temporal_unit.output_layer_count;
+  temporal_unit.output_frame_position =
+      encoded_frame->position_in_temporal_unit;
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+                                           const DecoderBuffer** out_ptr) {
+  std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+      temporal_unit.data, temporal_unit.size, settings_.operating_point,
+      &buffer_pool_, &state_));
+  if (obu == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+    return kStatusOutOfMemory;
+  }
+  if (has_sequence_header_) {
+    obu->set_sequence_header(sequence_header_);
+  }
+  StatusCode status;
+  std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+      frame_scratch_buffer_pool_.Get();
+  if (frame_scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+    return kStatusOutOfMemory;
+  }
+  // |frame_scratch_buffer| will be released when this local variable goes out
+  // of scope (i.e., on any return path in this function).
+  FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+      &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+  while (obu->HasData()) {
+    RefCountedBufferPtr current_frame;
+    status = obu->ParseOneFrame(&current_frame);
+    if (status != kStatusOk) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+      return status;
+    }
+    if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+      LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+      return kStatusOutOfMemory;
+    }
+    if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+      LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+      return kStatusOutOfMemory;
+    }
+    if (IsNewSequenceHeader(*obu)) {
+      const ObuSequenceHeader& sequence_header = obu->sequence_header();
+      const Libgav1ImageFormat image_format =
+          ComposeImageFormat(sequence_header.color_config.is_monochrome,
+                             sequence_header.color_config.subsampling_x,
+                             sequence_header.color_config.subsampling_y);
+      const int max_bottom_border = GetBottomBorderPixels(
+          /*do_cdef=*/true, /*do_restoration=*/true,
+          /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+      if (!buffer_pool_.OnFrameBufferSizeChanged(
+              sequence_header.color_config.bitdepth, image_format,
+              sequence_header.max_frame_width, sequence_header.max_frame_height,
+              kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+        LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+        return kStatusUnknownError;
+      }
+    }
+    if (!obu->frame_header().show_existing_frame) {
+      if (obu->tile_buffers().empty()) {
+        // This means that the last call to ParseOneFrame() did not actually
+        // have any tile groups. This could happen in rare cases (for example,
+        // if there is a Metadata OBU after the TileGroup OBU). We currently do
+        // not have a reason to handle those cases, so we simply continue.
+        continue;
+      }
+      status = DecodeTiles(obu->sequence_header(), obu->frame_header(),
+                           obu->tile_buffers(), state_,
+                           frame_scratch_buffer.get(), current_frame.get());
+      if (status != kStatusOk) {
+        return status;
+      }
+    }
+    state_.UpdateReferenceFrames(current_frame,
+                                 obu->frame_header().refresh_frame_flags);
+    if (obu->frame_header().show_frame ||
+        obu->frame_header().show_existing_frame) {
+      if (!output_frame_queue_.Empty() && !settings_.output_all_layers) {
+        // There is more than one displayable frame in the current operating
+        // point and |settings_.output_all_layers| is false. In this case, we
+        // simply return the last displayable frame as the output frame and
+        // ignore the rest.
+        assert(output_frame_queue_.Size() == 1);
+        output_frame_queue_.Pop();
+      }
+      RefCountedBufferPtr film_grain_frame;
+      status = ApplyFilmGrain(
+          obu->sequence_header(), obu->frame_header(), current_frame,
+          &film_grain_frame,
+          frame_scratch_buffer->threading_strategy.film_grain_thread_pool());
+      if (status != kStatusOk) return status;
+      output_frame_queue_.Push(std::move(film_grain_frame));
+    }
+  }
+  if (output_frame_queue_.Empty()) {
+    // No displayable frame in the temporal unit. Not an error.
+    *out_ptr = nullptr;
+    return kStatusOk;
+  }
+  status = CopyFrameToOutputBuffer(output_frame_queue_.Front());
+  output_frame_queue_.Pop();
+  if (status != kStatusOk) {
+    return status;
+  }
+  buffer_.user_private_data = temporal_unit.user_private_data;
+  *out_ptr = &buffer_;
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::CopyFrameToOutputBuffer(
+    const RefCountedBufferPtr& frame) {
+  YuvBuffer* yuv_buffer = frame->buffer();
+
+  buffer_.chroma_sample_position = frame->chroma_sample_position();
+
+  if (yuv_buffer->is_monochrome()) {
+    buffer_.image_format = kImageFormatMonochrome400;
+  } else {
+    if (yuv_buffer->subsampling_x() == 0 && yuv_buffer->subsampling_y() == 0) {
+      buffer_.image_format = kImageFormatYuv444;
+    } else if (yuv_buffer->subsampling_x() == 1 &&
+               yuv_buffer->subsampling_y() == 0) {
+      buffer_.image_format = kImageFormatYuv422;
+    } else if (yuv_buffer->subsampling_x() == 1 &&
+               yuv_buffer->subsampling_y() == 1) {
+      buffer_.image_format = kImageFormatYuv420;
+    } else {
+      LIBGAV1_DLOG(ERROR,
+                   "Invalid chroma subsampling values: cannot determine buffer "
+                   "image format.");
+      return kStatusInvalidArgument;
+    }
+  }
+  buffer_.color_range = sequence_header_.color_config.color_range;
+  buffer_.color_primary = sequence_header_.color_config.color_primary;
+  buffer_.transfer_characteristics =
+      sequence_header_.color_config.transfer_characteristics;
+  buffer_.matrix_coefficients =
+      sequence_header_.color_config.matrix_coefficients;
+
+  buffer_.bitdepth = yuv_buffer->bitdepth();
+  const int num_planes =
+      yuv_buffer->is_monochrome() ? kMaxPlanesMonochrome : kMaxPlanes;
+  int plane = kPlaneY;
+  for (; plane < num_planes; ++plane) {
+    buffer_.stride[plane] = yuv_buffer->stride(plane);
+    buffer_.plane[plane] = yuv_buffer->data(plane);
+    buffer_.displayed_width[plane] = yuv_buffer->width(plane);
+    buffer_.displayed_height[plane] = yuv_buffer->height(plane);
+  }
+  for (; plane < kMaxPlanes; ++plane) {
+    buffer_.stride[plane] = 0;
+    buffer_.plane[plane] = nullptr;
+    buffer_.displayed_width[plane] = 0;
+    buffer_.displayed_height[plane] = 0;
+  }
+  buffer_.spatial_id = frame->spatial_id();
+  buffer_.temporal_id = frame->temporal_id();
+  buffer_.buffer_private_data = frame->buffer_private_data();
+  if (frame->hdr_cll_set()) {
+    buffer_.has_hdr_cll = 1;
+    buffer_.hdr_cll = frame->hdr_cll();
+  } else {
+    buffer_.has_hdr_cll = 0;
+  }
+  if (frame->hdr_mdcv_set()) {
+    buffer_.has_hdr_mdcv = 1;
+    buffer_.hdr_mdcv = frame->hdr_mdcv();
+  } else {
+    buffer_.has_hdr_mdcv = 0;
+  }
+  if (frame->itut_t35_set()) {
+    buffer_.has_itut_t35 = 1;
+    buffer_.itut_t35 = frame->itut_t35();
+  } else {
+    buffer_.has_itut_t35 = 0;
+  }
+  output_frame_ = frame;
+  return kStatusOk;
+}
+
+void DecoderImpl::ReleaseOutputFrame() {
+  for (auto& plane : buffer_.plane) {
+    plane = nullptr;
+  }
+  output_frame_ = nullptr;
+}
+
+StatusCode DecoderImpl::DecodeTiles(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header, const Vector<TileBuffer>& tile_buffers,
+    const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
+    RefCountedBuffer* const current_frame) {
+  frame_scratch_buffer->tile_scratch_buffer_pool.Reset(
+      sequence_header.color_config.bitdepth);
+  if (!frame_scratch_buffer->loop_restoration_info.Reset(
+          &frame_header.loop_restoration, frame_header.upscaled_width,
+          frame_header.height, sequence_header.color_config.subsampling_x,
+          sequence_header.color_config.subsampling_y,
+          sequence_header.color_config.is_monochrome)) {
+    LIBGAV1_DLOG(ERROR,
+                 "Failed to allocate memory for loop restoration info units.");
+    return kStatusOutOfMemory;
+  }
+  ThreadingStrategy& threading_strategy =
+      frame_scratch_buffer->threading_strategy;
+  if (!is_frame_parallel_ &&
+      !threading_strategy.Reset(frame_header, settings_.threads)) {
+    return kStatusOutOfMemory;
+  }
+  const bool do_cdef =
+      PostFilter::DoCdef(frame_header, settings_.post_filter_mask);
+  const int num_planes = sequence_header.color_config.is_monochrome
+                             ? kMaxPlanesMonochrome
+                             : kMaxPlanes;
+  const bool do_restoration = PostFilter::DoRestoration(
+      frame_header.loop_restoration, settings_.post_filter_mask, num_planes);
+  const bool do_superres =
+      PostFilter::DoSuperRes(frame_header, settings_.post_filter_mask);
+  // Use kBorderPixels for the left, right, and top borders. Only the bottom
+  // border may need to be bigger. Cdef border is needed only if we apply Cdef
+  // without multithreading.
+  const int bottom_border = GetBottomBorderPixels(
+      do_cdef && threading_strategy.post_filter_thread_pool() == nullptr,
+      do_restoration, do_superres, sequence_header.color_config.subsampling_y);
+  current_frame->set_chroma_sample_position(
+      sequence_header.color_config.chroma_sample_position);
+  if (!current_frame->Realloc(sequence_header.color_config.bitdepth,
+                              sequence_header.color_config.is_monochrome,
+                              frame_header.upscaled_width, frame_header.height,
+                              sequence_header.color_config.subsampling_x,
+                              sequence_header.color_config.subsampling_y,
+                              /*left_border=*/kBorderPixels,
+                              /*right_border=*/kBorderPixels,
+                              /*top_border=*/kBorderPixels, bottom_border)) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate memory for the decoder buffer.");
+    return kStatusOutOfMemory;
+  }
+  if (frame_header.cdef.bits > 0) {
+    if (!frame_scratch_buffer->cdef_index.Reset(
+            DivideBy16(frame_header.rows4x4 + kMaxBlockHeight4x4),
+            DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
+            /*zero_initialize=*/false)) {
+      LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef index.");
+      return kStatusOutOfMemory;
+    }
+  }
+  if (do_cdef) {
+    if (!frame_scratch_buffer->cdef_skip.Reset(
+            DivideBy2(frame_header.rows4x4 + kMaxBlockHeight4x4),
+            DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
+            /*zero_initialize=*/true)) {
+      LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef skip.");
+      return kStatusOutOfMemory;
+    }
+  }
+  if (!frame_scratch_buffer->inter_transform_sizes.Reset(
+          frame_header.rows4x4 + kMaxBlockHeight4x4,
+          frame_header.columns4x4 + kMaxBlockWidth4x4,
+          /*zero_initialize=*/false)) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate memory for inter_transform_sizes.");
+    return kStatusOutOfMemory;
+  }
+  if (frame_header.use_ref_frame_mvs) {
+    if (!frame_scratch_buffer->motion_field.mv.Reset(
+            DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+            /*zero_initialize=*/false) ||
+        !frame_scratch_buffer->motion_field.reference_offset.Reset(
+            DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+            /*zero_initialize=*/false)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Failed to allocate memory for temporal motion vectors.");
+      return kStatusOutOfMemory;
+    }
+
+    // For each motion vector, only mv[0] needs to be initialized to
+    // kInvalidMvValue; mv[1] need not be initialized and can be set to an
+    // arbitrary value. For simplicity, mv[1] is set to 0.
+    // The following initialization of contiguous memory is very fast. Making
+    // it multi-threaded is not recommended unless the memory initialized by
+    // each thread is still contiguous.
+    MotionVector invalid_mv;
+    invalid_mv.mv[0] = kInvalidMvValue;
+    invalid_mv.mv[1] = 0;
+    MotionVector* const motion_field_mv =
+        &frame_scratch_buffer->motion_field.mv[0][0];
+    std::fill(motion_field_mv,
+              motion_field_mv + frame_scratch_buffer->motion_field.mv.size(),
+              invalid_mv);
+  }
+
+  // The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so
+  // that the block parameters cache can be filled in for the last row/column
+  // without having to check for boundary conditions.
+  if (!frame_scratch_buffer->block_parameters_holder.Reset(
+          frame_header.rows4x4 + kMaxBlockHeight4x4,
+          frame_header.columns4x4 + kMaxBlockWidth4x4)) {
+    return kStatusOutOfMemory;
+  }
+  const dsp::Dsp* const dsp =
+      dsp::GetDspTable(sequence_header.color_config.bitdepth);
+  if (dsp == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to get the dsp table for bitdepth %d.",
+                 sequence_header.color_config.bitdepth);
+    return kStatusInternalError;
+  }
+
+  const int tile_count = frame_header.tile_info.tile_count;
+  assert(tile_count >= 1);
+  Vector<std::unique_ptr<Tile>> tiles;
+  if (!tiles.reserve(tile_count)) {
+    LIBGAV1_DLOG(ERROR, "tiles.reserve(%d) failed.\n", tile_count);
+    return kStatusOutOfMemory;
+  }
+
+  if (threading_strategy.row_thread_pool(0) != nullptr || is_frame_parallel_) {
+    if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+      frame_scratch_buffer->residual_buffer_pool.reset(
+          new (std::nothrow) ResidualBufferPool(
+              sequence_header.use_128x128_superblock,
+              sequence_header.color_config.subsampling_x,
+              sequence_header.color_config.subsampling_y,
+              sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+                                                         : sizeof(int32_t)));
+      if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+        LIBGAV1_DLOG(ERROR, "Failed to allocate residual buffer.\n");
+        return kStatusOutOfMemory;
+      }
+    } else {
+      frame_scratch_buffer->residual_buffer_pool->Reset(
+          sequence_header.use_128x128_superblock,
+          sequence_header.color_config.subsampling_x,
+          sequence_header.color_config.subsampling_y,
+          sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+                                                     : sizeof(int32_t));
+    }
+  }
+
+  if (threading_strategy.post_filter_thread_pool() != nullptr && do_cdef) {
+    // We need to store 4 rows per 64x64 unit.
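+    // (|rows4x4| counts 4x4 block rows, so RightShiftWithCeiling(rows4x4, 4)
+    // is the number of 64x64 rows, and multiplying by 4 gives the total rows
+    // to store.)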
+    const int num_units =
+        MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_units| rows of the cdef
+    // border pixels.
+    if (!frame_scratch_buffer->cdef_border.Realloc(
+            sequence_header.color_config.bitdepth,
+            sequence_header.color_config.is_monochrome,
+            MultiplyBy4(frame_header.columns4x4), num_units,
+            sequence_header.color_config.subsampling_x,
+            /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+            kBorderPixels, nullptr, nullptr, nullptr)) {
+      return kStatusOutOfMemory;
+    }
+  }
+
+  if (do_restoration &&
+      (do_cdef || threading_strategy.post_filter_thread_pool() != nullptr)) {
+    // We need to store 4 rows per 64x64 unit.
+    const int num_units =
+        MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_units| rows of the loop
+    // restoration border pixels.
+    if (!frame_scratch_buffer->loop_restoration_border.Realloc(
+            sequence_header.color_config.bitdepth,
+            sequence_header.color_config.is_monochrome,
+            frame_header.upscaled_width, num_units,
+            sequence_header.color_config.subsampling_x,
+            /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+            kBorderPixels, nullptr, nullptr, nullptr)) {
+      return kStatusOutOfMemory;
+    }
+  }
+
+  if (do_superres) {
+    const int pixel_size = sequence_header.color_config.bitdepth == 8
+                               ? sizeof(uint8_t)
+                               : sizeof(uint16_t);
+    const int coefficients_size = kSuperResFilterTaps *
+                                  Align(frame_header.upscaled_width, 16) *
+                                  pixel_size;
+    if (!frame_scratch_buffer->superres_coefficients[kPlaneTypeY].Resize(
+            coefficients_size)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Failed to Resize superres_coefficients[kPlaneTypeY].");
+      return kStatusOutOfMemory;
+    }
+#if LIBGAV1_MSAN
+    // Quiet SuperRes_NEON() msan warnings.
+    memset(frame_scratch_buffer->superres_coefficients[kPlaneTypeY].get(), 0,
+           coefficients_size);
+#endif
+    const int uv_coefficients_size =
+        kSuperResFilterTaps *
+        Align(SubsampledValue(frame_header.upscaled_width, 1), 16) * pixel_size;
+    if (!sequence_header.color_config.is_monochrome &&
+        sequence_header.color_config.subsampling_x != 0 &&
+        !frame_scratch_buffer->superres_coefficients[kPlaneTypeUV].Resize(
+            uv_coefficients_size)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Failed to Resize superres_coefficients[kPlaneTypeUV].");
+      return kStatusOutOfMemory;
+    }
+#if LIBGAV1_MSAN
+    if (!sequence_header.color_config.is_monochrome &&
+        sequence_header.color_config.subsampling_x != 0) {
+      // Quiet SuperRes_NEON() msan warnings.
+      memset(frame_scratch_buffer->superres_coefficients[kPlaneTypeUV].get(), 0,
+             uv_coefficients_size);
+    }
+#endif
+  }
+
+  if (do_superres && threading_strategy.post_filter_thread_pool() != nullptr) {
+    const int num_threads =
+        threading_strategy.post_filter_thread_pool()->num_threads() + 1;
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_threads| rows of the
+    // down-scaled pixels.
+    // Left and right borders are for line extension. They are doubled for the Y
+    // plane to make sure the U and V planes have enough space after possible
+    // subsampling.
+    if (!frame_scratch_buffer->superres_line_buffer.Realloc(
+            sequence_header.color_config.bitdepth,
+            sequence_header.color_config.is_monochrome,
+            MultiplyBy4(frame_header.columns4x4), num_threads,
+            sequence_header.color_config.subsampling_x,
+            /*subsampling_y=*/0, 2 * kSuperResHorizontalBorder,
+            2 * (kSuperResHorizontalBorder + kSuperResHorizontalPadding), 0, 0,
+            nullptr, nullptr, nullptr)) {
+      LIBGAV1_DLOG(ERROR, "Failed to resize superres line buffer.\n");
+      return kStatusOutOfMemory;
+    }
+  }
+
+  if (is_frame_parallel_ && !IsIntraFrame(frame_header.frame_type)) {
+    // We can parse the current frame if all the reference frames have been
+    // parsed.
+    for (const int index : frame_header.reference_frame_index) {
+      if (!state.reference_frame[index]->WaitUntilParsed()) {
+        return kStatusUnknownError;
+      }
+    }
+  }
+
+  // If prev_segment_ids is a null pointer, it is treated as if it pointed to
+  // a segmentation map containing all 0s.
+  const SegmentationMap* prev_segment_ids = nullptr;
+  if (frame_header.primary_reference_frame == kPrimaryReferenceNone) {
+    frame_scratch_buffer->symbol_decoder_context.Initialize(
+        frame_header.quantizer.base_index);
+  } else {
+    const int index =
+        frame_header
+            .reference_frame_index[frame_header.primary_reference_frame];
+    assert(index != -1);
+    const RefCountedBuffer* prev_frame = state.reference_frame[index].get();
+    frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext();
+    if (frame_header.segmentation.enabled &&
+        prev_frame->columns4x4() == frame_header.columns4x4 &&
+        prev_frame->rows4x4() == frame_header.rows4x4) {
+      prev_segment_ids = prev_frame->segmentation_map();
+    }
+  }
+
+  // The Tile class must make use of a separate buffer to store the unfiltered
+  // pixels for the intra prediction of the next superblock row. This is done
+  // only when one of the following conditions is true:
+  //   * is_frame_parallel_ is true.
+  //   * settings_.threads == 1.
+  // In the non-frame-parallel multi-threaded case, we do not run the post
+  // filters in the decode loop. So this buffer need not be used.
+  const bool use_intra_prediction_buffer =
+      is_frame_parallel_ || settings_.threads == 1;
+  if (use_intra_prediction_buffer) {
+    if (!frame_scratch_buffer->intra_prediction_buffers.Resize(
+            frame_header.tile_info.tile_rows)) {
+      LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers.");
+      return kStatusOutOfMemory;
+    }
+    IntraPredictionBuffer* const intra_prediction_buffers =
+        frame_scratch_buffer->intra_prediction_buffers.get();
+    for (int plane = kPlaneY; plane < num_planes; ++plane) {
+      const int subsampling =
+          (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
+      const size_t intra_prediction_buffer_size =
+          ((MultiplyBy4(frame_header.columns4x4) >> subsampling) *
+           (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+                                                       : sizeof(uint16_t)));
+      for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows;
+           ++tile_row) {
+        if (!intra_prediction_buffers[tile_row][plane].Resize(
+                intra_prediction_buffer_size)) {
+          LIBGAV1_DLOG(ERROR,
+                       "Failed to allocate intra prediction buffer for tile "
+                       "row %d plane %d.\n",
+                       tile_row, plane);
+          return kStatusOutOfMemory;
+        }
+      }
+    }
+  }
+
+  PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer,
+                         current_frame->buffer(), dsp,
+                         settings_.post_filter_mask);
+  SymbolDecoderContext saved_symbol_decoder_context;
+  BlockingCounterWithStatus pending_tiles(tile_count);
+  for (int tile_number = 0; tile_number < tile_count; ++tile_number) {
+    std::unique_ptr<Tile> tile = Tile::Create(
+        tile_number, tile_buffers[tile_number].data,
+        tile_buffers[tile_number].size, sequence_header, frame_header,
+        current_frame, state, frame_scratch_buffer, wedge_masks_,
+        quantizer_matrix_, &saved_symbol_decoder_context, prev_segment_ids,
+        &post_filter, dsp, threading_strategy.row_thread_pool(tile_number),
+        &pending_tiles, is_frame_parallel_, use_intra_prediction_buffer);
+    if (tile == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to create tile.");
+      return kStatusOutOfMemory;
+    }
+    tiles.push_back_unchecked(std::move(tile));
+  }
+  assert(tiles.size() == static_cast<size_t>(tile_count));
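+  // Dispatch to one of four decoding paths based on |is_frame_parallel_| and
+  // the threading configuration.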
+  if (is_frame_parallel_) {
+    if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) {
+      return DecodeTilesFrameParallel(
+          sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+          prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+    }
+    return DecodeTilesThreadedFrameParallel(
+        sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+        prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+  }
+  StatusCode status;
+  if (settings_.threads == 1) {
+    status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles,
+                                         frame_scratch_buffer, &post_filter);
+  } else {
+    status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer,
+                                                 &post_filter, &pending_tiles);
+  }
+  if (status != kStatusOk) return status;
+  if (frame_header.enable_frame_end_update_cdf) {
+    frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+  }
+  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::ApplyFilmGrain(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const RefCountedBufferPtr& displayable_frame,
+    RefCountedBufferPtr* film_grain_frame, ThreadPool* thread_pool) {
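+  // Bit 4 (0x10) of |settings_.post_filter_mask| controls film grain
+  // synthesis (the default mask 0x1f enables all post filters).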
+  if (!sequence_header.film_grain_params_present ||
+      !displayable_frame->film_grain_params().apply_grain ||
+      (settings_.post_filter_mask & 0x10) == 0) {
+    *film_grain_frame = displayable_frame;
+    return kStatusOk;
+  }
+  if (!frame_header.show_existing_frame &&
+      frame_header.refresh_frame_flags == 0) {
+    // If show_existing_frame is true, then the current frame is a previously
+    // saved reference frame. If refresh_frame_flags is nonzero, then the
+    // state_.UpdateReferenceFrames() call above has saved the current frame as
+    // a reference frame. Therefore, if both of these conditions are false, then
+    // the current frame is not saved as a reference frame. displayable_frame
+    // should hold the only reference to the current frame.
+    assert(displayable_frame.use_count() == 1);
+    // Add film grain noise in place.
+    *film_grain_frame = displayable_frame;
+  } else {
+    *film_grain_frame = buffer_pool_.GetFreeBuffer();
+    if (*film_grain_frame == nullptr) {
+      LIBGAV1_DLOG(ERROR,
+                   "Could not get film_grain_frame from the buffer pool.");
+      return kStatusResourceExhausted;
+    }
+    if (!(*film_grain_frame)
+             ->Realloc(displayable_frame->buffer()->bitdepth(),
+                       displayable_frame->buffer()->is_monochrome(),
+                       displayable_frame->upscaled_width(),
+                       displayable_frame->frame_height(),
+                       displayable_frame->buffer()->subsampling_x(),
+                       displayable_frame->buffer()->subsampling_y(),
+                       kBorderPixelsFilmGrain, kBorderPixelsFilmGrain,
+                       kBorderPixelsFilmGrain, kBorderPixelsFilmGrain)) {
+      LIBGAV1_DLOG(ERROR, "film_grain_frame->Realloc() failed.");
+      return kStatusOutOfMemory;
+    }
+    (*film_grain_frame)
+        ->set_chroma_sample_position(
+            displayable_frame->chroma_sample_position());
+    (*film_grain_frame)->set_spatial_id(displayable_frame->spatial_id());
+    (*film_grain_frame)->set_temporal_id(displayable_frame->temporal_id());
+  }
+  const bool color_matrix_is_identity =
+      sequence_header.color_config.matrix_coefficients ==
+      kMatrixCoefficientsIdentity;
+  assert(displayable_frame->buffer()->stride(kPlaneU) ==
+         displayable_frame->buffer()->stride(kPlaneV));
+  const int input_stride_uv = displayable_frame->buffer()->stride(kPlaneU);
+  assert((*film_grain_frame)->buffer()->stride(kPlaneU) ==
+         (*film_grain_frame)->buffer()->stride(kPlaneV));
+  const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (displayable_frame->buffer()->bitdepth() == 10) {
+    FilmGrain<10> film_grain(displayable_frame->film_grain_params(),
+                             displayable_frame->buffer()->is_monochrome(),
+                             color_matrix_is_identity,
+                             displayable_frame->buffer()->subsampling_x(),
+                             displayable_frame->buffer()->subsampling_y(),
+                             displayable_frame->upscaled_width(),
+                             displayable_frame->frame_height(), thread_pool);
+    if (!film_grain.AddNoise(
+            displayable_frame->buffer()->data(kPlaneY),
+            displayable_frame->buffer()->stride(kPlaneY),
+            displayable_frame->buffer()->data(kPlaneU),
+            displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+            (*film_grain_frame)->buffer()->data(kPlaneY),
+            (*film_grain_frame)->buffer()->stride(kPlaneY),
+            (*film_grain_frame)->buffer()->data(kPlaneU),
+            (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+      LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+      return kStatusOutOfMemory;
+    }
+    return kStatusOk;
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+  if (displayable_frame->buffer()->bitdepth() == 12) {
+    FilmGrain<12> film_grain(displayable_frame->film_grain_params(),
+                             displayable_frame->buffer()->is_monochrome(),
+                             color_matrix_is_identity,
+                             displayable_frame->buffer()->subsampling_x(),
+                             displayable_frame->buffer()->subsampling_y(),
+                             displayable_frame->upscaled_width(),
+                             displayable_frame->frame_height(), thread_pool);
+    if (!film_grain.AddNoise(
+            displayable_frame->buffer()->data(kPlaneY),
+            displayable_frame->buffer()->stride(kPlaneY),
+            displayable_frame->buffer()->data(kPlaneU),
+            displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+            (*film_grain_frame)->buffer()->data(kPlaneY),
+            (*film_grain_frame)->buffer()->stride(kPlaneY),
+            (*film_grain_frame)->buffer()->data(kPlaneU),
+            (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+      LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+      return kStatusOutOfMemory;
+    }
+    return kStatusOk;
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+  FilmGrain<8> film_grain(displayable_frame->film_grain_params(),
+                          displayable_frame->buffer()->is_monochrome(),
+                          color_matrix_is_identity,
+                          displayable_frame->buffer()->subsampling_x(),
+                          displayable_frame->buffer()->subsampling_y(),
+                          displayable_frame->upscaled_width(),
+                          displayable_frame->frame_height(), thread_pool);
+  if (!film_grain.AddNoise(
+          displayable_frame->buffer()->data(kPlaneY),
+          displayable_frame->buffer()->stride(kPlaneY),
+          displayable_frame->buffer()->data(kPlaneU),
+          displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+          (*film_grain_frame)->buffer()->data(kPlaneY),
+          (*film_grain_frame)->buffer()->stride(kPlaneY),
+          (*film_grain_frame)->buffer()->data(kPlaneU),
+          (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+    LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+    return kStatusOutOfMemory;
+  }
+  return kStatusOk;
+}
+
+bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) {
+  if (std::find_if(obu.obu_headers().begin(), obu.obu_headers().end(),
+                   [](const ObuHeader& obu_header) {
+                     return obu_header.type == kObuSequenceHeader;
+                   }) == obu.obu_headers().end()) {
+    return false;
+  }
+  const ObuSequenceHeader sequence_header = obu.sequence_header();
+  const bool sequence_header_changed =
+      !has_sequence_header_ ||
+      sequence_header_.color_config.bitdepth !=
+          sequence_header.color_config.bitdepth ||
+      sequence_header_.color_config.is_monochrome !=
+          sequence_header.color_config.is_monochrome ||
+      sequence_header_.color_config.subsampling_x !=
+          sequence_header.color_config.subsampling_x ||
+      sequence_header_.color_config.subsampling_y !=
+          sequence_header.color_config.subsampling_y ||
+      sequence_header_.max_frame_width != sequence_header.max_frame_width ||
+      sequence_header_.max_frame_height != sequence_header.max_frame_height;
+  sequence_header_ = sequence_header;
+  has_sequence_header_ = true;
+  return sequence_header_changed;
+}
+
+bool DecoderImpl::MaybeInitializeWedgeMasks(FrameType frame_type) {
+  if (IsIntraFrame(frame_type) || wedge_masks_initialized_) {
+    return true;
+  }
+  if (!GenerateWedgeMask(&wedge_masks_)) {
+    return false;
+  }
+  wedge_masks_initialized_ = true;
+  return true;
+}
+
+bool DecoderImpl::MaybeInitializeQuantizerMatrix(
+    const ObuFrameHeader& frame_header) {
+  if (quantizer_matrix_initialized_ || !frame_header.quantizer.use_matrix) {
+    return true;
+  }
+  if (!InitializeQuantizerMatrix(&quantizer_matrix_)) {
+    return false;
+  }
+  quantizer_matrix_initialized_ = true;
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/decoder_impl.h b/src/decoder_impl.h
new file mode 100644 (file)
index 0000000..b75417d
--- /dev/null
@@ -0,0 +1,272 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_IMPL_H_
+#define LIBGAV1_SRC_DECODER_IMPL_H_
+
+#include <array>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/constants.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/decoder_settings.h"
+#include "src/gav1/status_code.h"
+#include "src/obu_parser.h"
+#include "src/quantizer.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/queue.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+struct TemporalUnit;
+
+struct EncodedFrame {
+  EncodedFrame(ObuParser* const obu, const DecoderState& state,
+               const RefCountedBufferPtr& frame, int position_in_temporal_unit)
+      : sequence_header(obu->sequence_header()),
+        frame_header(obu->frame_header()),
+        state(state),
+        temporal_unit(nullptr),
+        frame(frame),
+        position_in_temporal_unit(position_in_temporal_unit) {
+    obu->MoveTileBuffers(&tile_buffers);
+    frame->MarkFrameAsStarted();
+  }
+
+  const ObuSequenceHeader sequence_header;
+  const ObuFrameHeader frame_header;
+  Vector<TileBuffer> tile_buffers;
+  DecoderState state;
+  TemporalUnit* temporal_unit;
+  RefCountedBufferPtr frame;
+  const int position_in_temporal_unit;
+};
+
+struct TemporalUnit : public Allocable {
+  // The default constructor is invoked by the Queue<TemporalUnit>::Init()
+  // method. Queue<> does not use the default-constructed elements, so it is
+  // safe for the default constructor to not initialize the members.
+  TemporalUnit() = default;
+  TemporalUnit(const uint8_t* data, size_t size, int64_t user_private_data,
+               void* buffer_private_data)
+      : data(data),
+        size(size),
+        user_private_data(user_private_data),
+        buffer_private_data(buffer_private_data),
+        decoded(false),
+        status(kStatusOk),
+        has_displayable_frame(false),
+        output_frame_position(-1),
+        decoded_count(0),
+        output_layer_count(0),
+        released_input_buffer(false) {}
+
+  const uint8_t* data;
+  size_t size;
+  int64_t user_private_data;
+  void* buffer_private_data;
+
+  // The following members are used only in frame parallel mode.
+  bool decoded;
+  StatusCode status;
+  bool has_displayable_frame;
+  int output_frame_position;
+
+  Vector<EncodedFrame> frames;
+  size_t decoded_count;
+
+  // The struct and the counter below are used to support output of multiple
+  // layers within a single temporal unit. The decoding process stores the
+  // output frames in |output_layers| in the order they finish decoding. At the
+  // end of the decoding process, this array will be sorted in reverse order of
+  // |position_in_temporal_unit|. DequeueFrame() will then return the frames in
+  // reverse order (so that the entire process can run with a single counter
+  // variable).
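+  // For example, if layers at positions 0, 1, and 2 finish decoding in the
+  // order 2, 0, 1, sorting yields positions {2, 1, 0}; walking the array
+  // backwards with a single decrementing counter then returns positions
+  // 0, 1, 2.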
+  struct OutputLayer {
+    // Used by std::sort to sort |output_layers| in reverse order of
+    // |position_in_temporal_unit|.
+    bool operator<(const OutputLayer& rhs) const {
+      return position_in_temporal_unit > rhs.position_in_temporal_unit;
+    }
+
+    RefCountedBufferPtr frame;
+    int position_in_temporal_unit = 0;
+  } output_layers[kMaxLayers];
+  // Number of entries in |output_layers|.
+  int output_layer_count;
+  // Flag to ensure that we release the input buffer only once if there are
+  // multiple output layers.
+  bool released_input_buffer;
+};
+
+class DecoderImpl : public Allocable {
+ public:
+  // The constructor saves a const reference to |*settings|. Therefore
+  // |*settings| must outlive the DecoderImpl object. On success, |*output|
+  // contains a pointer to the newly-created DecoderImpl object. On failure,
+  // |*output| is not modified.
+  static StatusCode Create(const DecoderSettings* settings,
+                           std::unique_ptr<DecoderImpl>* output);
+  ~DecoderImpl();
+  StatusCode EnqueueFrame(const uint8_t* data, size_t size,
+                          int64_t user_private_data, void* buffer_private_data);
+  StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
+  static constexpr int GetMaxBitdepth() {
+    static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10 ||
+                      LIBGAV1_MAX_BITDEPTH == 12,
+                  "LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12.");
+    return LIBGAV1_MAX_BITDEPTH;
+  }
+
+ private:
+  explicit DecoderImpl(const DecoderSettings* settings);
+  StatusCode Init();
+  // Called when the first frame is enqueued. It does the OBU parsing for one
+  // temporal unit to retrieve the tile configuration and sets up the frame
+  // threading if frame parallel mode is allowed. It also initializes the
+  // |temporal_units_| queue based on the number of frame threads.
+  //
+  // The following are the limitations of the current implementation:
+  //  * It assumes that all frames in the video have the same tile
+  //    configuration. The frame parallel threading model will not be updated
+  //    based on tile configuration changes mid-stream.
+  //  * The above assumption holds true even when there is a new coded video
+  //    sequence (i.e., a new sequence header).
+  StatusCode InitializeFrameThreadPoolAndTemporalUnitQueue(const uint8_t* data,
+                                                           size_t size);
+  // Used only in frame parallel mode. Signals failure and waits until the
+  // worker threads are aborted if |status| is a failure status. If |status| is
+  // equal to kStatusOk or kStatusTryAgain, this function does not do anything.
+  // Always returns the input parameter |status| as the return value.
+  //
+  // This function is called only from the application thread (from
+  // EnqueueFrame() and DequeueFrame()).
+  StatusCode SignalFailure(StatusCode status);
+
+  void ReleaseOutputFrame();
+
+  // Decodes all the frames contained in the given temporal unit. Used only in
+  // non frame parallel mode.
+  StatusCode DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+                                const DecoderBuffer** out_ptr);
+  // Used only in frame parallel mode. Does the OBU parsing for |data| and
+  // schedules the individual frames for decoding in the |frame_thread_pool_|.
+  StatusCode ParseAndSchedule(const uint8_t* data, size_t size,
+                              int64_t user_private_data,
+                              void* buffer_private_data);
+  // Decodes the |encoded_frame| and updates the
+  // |encoded_frame->temporal_unit|'s parameters if the decoded frame is a
+  // displayable frame. Used only in frame parallel mode.
+  StatusCode DecodeFrame(EncodedFrame* encoded_frame);
+
+  // Populates |buffer_| with values from |frame|. Adds a reference to |frame|
+  // in |output_frame_|.
+  StatusCode CopyFrameToOutputBuffer(const RefCountedBufferPtr& frame);
+  StatusCode DecodeTiles(const ObuSequenceHeader& sequence_header,
+                         const ObuFrameHeader& frame_header,
+                         const Vector<TileBuffer>& tile_buffers,
+                         const DecoderState& state,
+                         FrameScratchBuffer* frame_scratch_buffer,
+                         RefCountedBuffer* current_frame);
+  // Applies film grain synthesis to the |displayable_frame| and stores the
+  // film-grain-applied frame in |film_grain_frame|. Returns kStatusOk on
+  // success.
+  StatusCode ApplyFilmGrain(const ObuSequenceHeader& sequence_header,
+                            const ObuFrameHeader& frame_header,
+                            const RefCountedBufferPtr& displayable_frame,
+                            RefCountedBufferPtr* film_grain_frame,
+                            ThreadPool* thread_pool);
+
+  bool IsNewSequenceHeader(const ObuParser& obu);
+
+  bool HasFailure() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return failure_status_ != kStatusOk;
+  }
+
+  // Initializes the |quantizer_matrix_| if necessary and sets
+  // |quantizer_matrix_initialized_| to true.
+  bool MaybeInitializeQuantizerMatrix(const ObuFrameHeader& frame_header);
+
+  // Allocates and generates the |wedge_masks_| if necessary and sets
+  // |wedge_masks_initialized_| to true.
+  bool MaybeInitializeWedgeMasks(FrameType frame_type);
+
+  // Elements in this queue cannot be moved with std::move since the
+  // |EncodedFrame.temporal_unit| stores a pointer to elements in this queue.
+  Queue<TemporalUnit> temporal_units_;
+  DecoderState state_;
+
+  DecoderBuffer buffer_ = {};
+  // |output_frame_| holds a reference to the output frame on behalf of
+  // |buffer_|.
+  RefCountedBufferPtr output_frame_;
+
+  // Queue of output frames that are to be returned in the DequeueFrame() calls.
+  // If |settings_.output_all_layers| is false, this queue will never contain
+  // more than 1 element. This queue is used only when |is_frame_parallel_| is
+  // false.
+  Queue<RefCountedBufferPtr> output_frame_queue_;
+
+  BufferPool buffer_pool_;
+  WedgeMaskArray wedge_masks_;
+  bool wedge_masks_initialized_ = false;
+  QuantizerMatrix quantizer_matrix_;
+  bool quantizer_matrix_initialized_ = false;
+  FrameScratchBufferPool frame_scratch_buffer_pool_;
+
+  // Used to synchronize the accesses into |temporal_units_| in order to update
+  // the "decoded" state of an temporal unit.
+  std::mutex mutex_;
+  std::condition_variable decoded_condvar_;
+  bool is_frame_parallel_;
+  std::unique_ptr<ThreadPool> frame_thread_pool_;
+
+  // In frame parallel mode, there are two primary points of failure:
+  //  1) ParseAndSchedule()
+  //  2) DecodeTiles()
+  // Both of these functions have to respond to the other one failing by
+  // aborting whatever they are doing. This variable is used to accomplish that.
+  // If |failure_status_| is not kStatusOk, then the two functions will try to
+  // abort as early as they can.
+  StatusCode failure_status_ = kStatusOk LIBGAV1_GUARDED_BY(mutex_);
+
+  ObuSequenceHeader sequence_header_ = {};
+  // If true, |sequence_header_| is valid.
+  bool has_sequence_header_ = false;
+
+  const DecoderSettings& settings_;
+  bool seen_first_frame_ = false;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DECODER_IMPL_H_
diff --git a/src/decoder_settings.cc b/src/decoder_settings.cc
new file mode 100644 (file)
index 0000000..9399073
--- /dev/null
@@ -0,0 +1,33 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder_settings.h"
+
+extern "C" {
+
+void Libgav1DecoderSettingsInitDefault(Libgav1DecoderSettings* settings) {
+  settings->threads = 1;
+  settings->frame_parallel = 0;    // false
+  settings->blocking_dequeue = 0;  // false
+  settings->on_frame_buffer_size_changed = nullptr;
+  settings->get_frame_buffer = nullptr;
+  settings->release_frame_buffer = nullptr;
+  settings->release_input_buffer = nullptr;
+  settings->callback_private_data = nullptr;
+  settings->output_all_layers = 0;  // false
+  settings->operating_point = 0;
+  settings->post_filter_mask = 0x1f;
+}
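+
+// Illustrative usage sketch (not part of the library): a C caller would
+// typically initialize the defaults and then override individual fields, e.g.
+//   Libgav1DecoderSettings settings;
+//   Libgav1DecoderSettingsInitDefault(&settings);
+//   settings.threads = 4;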
+
+}  // extern "C"
diff --git a/src/decoder_state.h b/src/decoder_state.h
new file mode 100644 (file)
index 0000000..ea5c792
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_STATE_H_
+#define LIBGAV1_SRC_DECODER_STATE_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+struct DecoderState {
+  // Section 7.20. Updates frames in the reference_frame array with
+  // |current_frame|, based on the |refresh_frame_flags| bitmask.
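+  // For example, |refresh_frame_flags| == 0x05 (binary 101) replaces
+  // reference frames 0 and 2 with |current_frame|.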
+  void UpdateReferenceFrames(const RefCountedBufferPtr& current_frame,
+                             int refresh_frame_flags) {
+    for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
+         ++ref_index, mask >>= 1) {
+      if ((mask & 1) != 0) {
+        reference_frame_id[ref_index] = current_frame_id;
+        reference_frame[ref_index] = current_frame;
+        reference_order_hint[ref_index] = order_hint;
+      }
+    }
+  }
+
+  // Clears all the reference frames.
+  void ClearReferenceFrames() {
+    reference_frame_id = {};
+    reference_order_hint = {};
+    for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
+      reference_frame[ref_index] = nullptr;
+    }
+  }
+
+  // reference_frame_id and current_frame_id have meaningful values and are used
+  // in checks only if sequence_header_.frame_id_numbers_present is true. If
+  // sequence_header_.frame_id_numbers_present is false, reference_frame_id and
+  // current_frame_id are assigned the default value 0 and are not used in
+  // checks.
+  std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {};
+  // A valid value of current_frame_id is an unsigned integer of at most 16
+  // bits. -1 indicates current_frame_id is not initialized.
+  int current_frame_id = -1;
+  // The RefOrderHint array variable in the spec.
+  std::array<uint8_t, kNumReferenceFrameTypes> reference_order_hint = {};
+  // The OrderHint variable in the spec. Its value comes from either the
+  // order_hint syntax element in the uncompressed header (if
+  // show_existing_frame is false) or RefOrderHint[ frame_to_show_map_idx ]
+  // (if show_existing_frame is true and frame_type is KEY_FRAME). See Section
+  // 5.9.2 and Section 7.4.
+  //
+  // NOTE: When show_existing_frame is false, it is often more convenient to
+  // just use the order_hint field of the frame header as OrderHint. So this
+  // field is mainly used to update the reference_order_hint array in
+  // UpdateReferenceFrames().
+  uint8_t order_hint = 0;
+  // reference_frame_sign_bias[i] (a boolean) specifies the intended direction
+  // of the motion vector in time for each reference frame.
+  // * |false| indicates that the reference frame is a forwards reference (i.e.
+  //   the reference frame is expected to be output before the current frame);
+  // * |true| indicates that the reference frame is a backwards reference.
+  // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used.
+  std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
+  // The RefValid[i] variable in the spec does not need to be stored explicitly.
+  // If the RefValid[i] variable in the spec is 0, then reference_frame[i] is a
+  // null pointer. (Whenever the spec sets the RefValid[i] variable to 0, we set
+  // reference_frame[i] to a null pointer.) If the RefValid[i] variable in the
+  // spec is 1, then reference_frame[i] contains a frame buffer pointer.
+  std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DECODER_STATE_H_
diff --git a/src/decoder_test.cc b/src/decoder_test.cc
new file mode 100644 (file)
index 0000000..52ec5cc
--- /dev/null
@@ -0,0 +1,382 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "gtest/gtest.h"
+#include "src/decoder_test_data.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr uint8_t kFrame1[] = {OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER,
+                               OBU_FRAME_1};
+
+constexpr uint8_t kFrame2[] = {OBU_TEMPORAL_DELIMITER, OBU_FRAME_2};
+
+constexpr uint8_t kFrame1WithHdrCllAndHdrMdcv[] = {
+    OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER, OBU_METADATA_HDR_CLL,
+    OBU_METADATA_HDR_MDCV, OBU_FRAME_1};
+
+constexpr uint8_t kFrame2WithItutT35[] = {OBU_TEMPORAL_DELIMITER,
+                                          OBU_METADATA_ITUT_T35, OBU_FRAME_2};
+
+class DecoderTest : public testing::Test {
+ public:
+  void SetUp() override;
+  void IncrementFramesInUse() { ++frames_in_use_; }
+  void DecrementFramesInUse() { --frames_in_use_; }
+  void SetBufferPrivateData(void* buffer_private_data) {
+    buffer_private_data_ = buffer_private_data;
+  }
+  void SetReleasedInputBuffer(void* released_input_buffer) {
+    released_input_buffer_ = released_input_buffer;
+  }
+
+ protected:
+  std::unique_ptr<Decoder> decoder_;
+  int frames_in_use_ = 0;
+  void* buffer_private_data_ = nullptr;
+  void* released_input_buffer_ = nullptr;
+};
+
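+// Holds the per-plane test allocations: data[0] is the Y plane; data[1] and
+// data[2] are the U and V planes (unused when |uv_buffer_size| is 0).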
+struct FrameBufferPrivate {
+  uint8_t* data[3];
+};
+
+extern "C" {
+
+static Libgav1StatusCode GetFrameBuffer(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+  Libgav1FrameBufferInfo info;
+  Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment, &info);
+  if (status != kLibgav1StatusOk) return status;
+
+  std::unique_ptr<FrameBufferPrivate> buffer_private(new (std::nothrow)
+                                                         FrameBufferPrivate);
+  if (buffer_private == nullptr) return kLibgav1StatusOutOfMemory;
+
+  for (int i = 0; i < 3; ++i) {
+    const size_t size = (i == 0) ? info.y_buffer_size : info.uv_buffer_size;
+    buffer_private->data[i] = new (std::nothrow) uint8_t[size];
+    if (buffer_private->data[i] == nullptr) {
+      // Free the planes allocated so far so they are not leaked on failure.
+      for (int j = 0; j < i; ++j) delete[] buffer_private->data[j];
+      return kLibgav1StatusOutOfMemory;
+    }
+  }
+
+  uint8_t* const y_buffer = buffer_private->data[0];
+  uint8_t* const u_buffer =
+      (info.uv_buffer_size != 0) ? buffer_private->data[1] : nullptr;
+  uint8_t* const v_buffer =
+      (info.uv_buffer_size != 0) ? buffer_private->data[2] : nullptr;
+
+  status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer,
+                                 buffer_private.release(), frame_buffer);
+  if (status != kLibgav1StatusOk) return status;
+
+  auto* const decoder_test = static_cast<DecoderTest*>(callback_private_data);
+  decoder_test->IncrementFramesInUse();
+  decoder_test->SetBufferPrivateData(frame_buffer->private_data);
+  return kLibgav1StatusOk;
+}
+
+static void ReleaseFrameBuffer(void* callback_private_data,
+                               void* buffer_private_data) {
+  auto* buffer_private = static_cast<FrameBufferPrivate*>(buffer_private_data);
+  for (auto& data : buffer_private->data) {
+    delete[] data;
+  }
+  delete buffer_private;
+  auto* const decoder_test = static_cast<DecoderTest*>(callback_private_data);
+  decoder_test->DecrementFramesInUse();
+}
+
+static void ReleaseInputBuffer(void* private_data, void* input_buffer) {
+  auto* const decoder_test = static_cast<DecoderTest*>(private_data);
+  decoder_test->SetReleasedInputBuffer(input_buffer);
+}
+
+}  // extern "C"
+
+void DecoderTest::SetUp() {
+  decoder_.reset(new (std::nothrow) Decoder());
+  ASSERT_NE(decoder_, nullptr);
+  DecoderSettings settings = {};
+  settings.frame_parallel = false;
+  settings.get_frame_buffer = GetFrameBuffer;
+  settings.release_frame_buffer = ReleaseFrameBuffer;
+  settings.callback_private_data = this;
+  settings.release_input_buffer = ReleaseInputBuffer;
+  ASSERT_EQ(decoder_->Init(&settings), kStatusOk);
+}
+
+TEST_F(DecoderTest, APIFlowForNonFrameParallelMode) {
+  StatusCode status;
+  const DecoderBuffer* buffer;
+
+  // Enqueue frame1 for decoding.
+  status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+                                  const_cast<uint8_t*>(kFrame1));
+  ASSERT_EQ(status, kStatusOk);
+
+  // In non-frame-parallel mode, decoding happens only in the DequeueFrame call.
+  // So there should be no frames in use yet.
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame1.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+  // libgav1 has decoded frame1 and is holding a reference to it.
+  EXPECT_EQ(frames_in_use_, 1);
+  EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+  // Enqueue frame2 for decoding.
+  status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+                                  const_cast<uint8_t*>(kFrame2));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 1);
+
+  // Dequeue the output of frame2.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame2);
+
+  EXPECT_EQ(frames_in_use_, 2);
+  EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+  // Signal end of stream (method 1). This should ensure that all the references
+  // are released.
+  status = decoder_->SignalEOS();
+  EXPECT_EQ(status, kStatusOk);
+
+  // libgav1 should have released all the reference frames now.
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Now, the decoder is ready to accept a new coded video sequence.
+
+  // Enqueue frame1 for decoding.
+  status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+                                  const_cast<uint8_t*>(kFrame1));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame1.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+  EXPECT_EQ(frames_in_use_, 1);
+  EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+  // Enqueue frame2 for decoding.
+  status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+                                  const_cast<uint8_t*>(kFrame2));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 1);
+
+  // Dequeue the output of frame2.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame2);
+
+  EXPECT_EQ(frames_in_use_, 2);
+  EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+  // Signal end of stream (method 2). This should ensure that all the references
+  // are released.
+  decoder_ = nullptr;
+
+  // libgav1 should have released all the frames now.
+  EXPECT_EQ(frames_in_use_, 0);
+}
+
+TEST_F(DecoderTest, NonFrameParallelModeEnqueueMultipleFramesWithoutDequeuing) {
+  StatusCode status;
+  const DecoderBuffer* buffer;
+
+  // Enqueue frame1 for decoding.
+  status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+                                  const_cast<uint8_t*>(kFrame1));
+  ASSERT_EQ(status, kStatusOk);
+
+  // Until the output of frame1 is dequeued, no other frames can be enqueued.
+  status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+                                  const_cast<uint8_t*>(kFrame2));
+  ASSERT_EQ(status, kStatusTryAgain);
+
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame1.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+  EXPECT_EQ(frames_in_use_, 1);
+
+  // Delete the decoder instance.
+  decoder_ = nullptr;
+
+  EXPECT_EQ(frames_in_use_, 0);
+}
+
+TEST_F(DecoderTest, NonFrameParallelModeEOSBeforeDequeuingLastFrame) {
+  StatusCode status;
+  const DecoderBuffer* buffer;
+
+  // Enqueue frame1 for decoding.
+  status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+                                  const_cast<uint8_t*>(kFrame1));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame1.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+  // Enqueue frame2 for decoding.
+  status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+                                  const_cast<uint8_t*>(kFrame2));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 1);
+
+  // Signal end of stream before dequeuing the output of frame2.
+  status = decoder_->SignalEOS();
+  ASSERT_EQ(status, kStatusOk);
+
+  // In this case, the output of the last frame that was enqueued is lost (which
+  // is intentional since end of stream was signaled without dequeuing it).
+  EXPECT_EQ(frames_in_use_, 0);
+}
+
+TEST_F(DecoderTest, NonFrameParallelModeInvalidFrameAfterEOS) {
+  StatusCode status;
+  const DecoderBuffer* buffer = nullptr;
+
+  // Enqueue frame1 for decoding.
+  status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+                                  const_cast<uint8_t*>(kFrame1));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame1.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+  EXPECT_EQ(frames_in_use_, 1);
+
+  // Signal end of stream.
+  status = decoder_->SignalEOS();
+  EXPECT_EQ(status, kStatusOk);
+
+  // libgav1 should have released all the reference frames now.
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Now, the decoder is ready to accept a new coded video sequence. But we
+  // try to enqueue a frame that does not have a sequence header (which is not
+  // allowed).
+
+  // Enqueue frame2 for decoding.
+  status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+                                  const_cast<uint8_t*>(kFrame2));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame2 (this will fail since no sequence header has
+  // been seen since the last EOS signal).
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusBitstreamError);
+  EXPECT_EQ(released_input_buffer_, &kFrame2);
+
+  EXPECT_EQ(frames_in_use_, 0);
+}
+
+TEST_F(DecoderTest, MetadataObu) {
+  StatusCode status;
+  const DecoderBuffer* buffer;
+
+  // Enqueue frame1 for decoding.
+  status = decoder_->EnqueueFrame(
+      kFrame1WithHdrCllAndHdrMdcv, sizeof(kFrame1WithHdrCllAndHdrMdcv), 0,
+      const_cast<uint8_t*>(kFrame1WithHdrCllAndHdrMdcv));
+  ASSERT_EQ(status, kStatusOk);
+
+  // Dequeue the output of frame1.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(buffer->has_hdr_cll, 1);
+  EXPECT_EQ(buffer->has_hdr_mdcv, 1);
+  EXPECT_EQ(buffer->has_itut_t35, 0);
+  EXPECT_EQ(released_input_buffer_, &kFrame1WithHdrCllAndHdrMdcv);
+
+  // libgav1 has decoded frame1 and is holding a reference to it.
+  EXPECT_EQ(frames_in_use_, 1);
+  EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+  // Enqueue frame2 for decoding.
+  status =
+      decoder_->EnqueueFrame(kFrame2WithItutT35, sizeof(kFrame2WithItutT35), 0,
+                             const_cast<uint8_t*>(kFrame2WithItutT35));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 1);
+
+  // Dequeue the output of frame2.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(buffer->has_hdr_cll, 0);
+  EXPECT_EQ(buffer->has_hdr_mdcv, 0);
+  EXPECT_EQ(buffer->has_itut_t35, 1);
+  EXPECT_NE(buffer->itut_t35.payload_bytes, nullptr);
+  EXPECT_GT(buffer->itut_t35.payload_size, 0);
+  EXPECT_EQ(released_input_buffer_, &kFrame2WithItutT35);
+
+  EXPECT_EQ(frames_in_use_, 2);
+  EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+  status = decoder_->SignalEOS();
+  EXPECT_EQ(status, kStatusOk);
+  EXPECT_EQ(frames_in_use_, 0);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/decoder_test_data.h b/src/decoder_test_data.h
new file mode 100644
index 0000000..78b6b46
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2022 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_TEST_DATA_H_
+#define LIBGAV1_SRC_DECODER_TEST_DATA_H_
+
+// The bytes for these two frames come from the libaom test vector
+// av1-1-b8-01-size-32x32.ivf
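+// In each macro below, the leading byte is the OBU header: bits 3-6 hold the
+// OBU type (1 = sequence header, 2 = temporal delimiter, 5 = metadata,
+// 6 = frame) and bit 1 indicates that a size field follows.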
+#define OBU_TEMPORAL_DELIMITER 0x12, 0x0
+#define OBU_SEQUENCE_HEADER \
+  0xa, 0xa, 0x0, 0x0, 0x0, 0x2, 0x27, 0xfe, 0xff, 0xfc, 0xc0, 0x20
+#define OBU_FRAME_1                                                           \
+  0x32, 0x93, 0x2, 0x10, 0x0, 0xa8, 0x80, 0x0, 0x3, 0x0, 0x10, 0x10, 0x30,    \
+      0x0, 0xd3, 0xc6, 0xc6, 0x82, 0xaa, 0x5e, 0xbf, 0x82, 0xf2, 0xa4, 0xa4,  \
+      0x29, 0xab, 0xda, 0xd7, 0x1, 0x5, 0x0, 0xb3, 0xde, 0xa8, 0x6f, 0x8d,    \
+      0xbf, 0x1b, 0xa8, 0x25, 0xc3, 0x84, 0x7c, 0x1a, 0x2b, 0x8b, 0x0, 0xff,  \
+      0x19, 0x1f, 0x45, 0x7e, 0xe0, 0xbe, 0xe1, 0x3a, 0x63, 0xc2, 0xc6, 0x6e, \
+      0xf4, 0xc8, 0xce, 0x11, 0xe1, 0x9f, 0x48, 0x64, 0x72, 0xeb, 0xbb, 0x4f, \
+      0xf3, 0x94, 0xb4, 0xb6, 0x9d, 0x4f, 0x4, 0x18, 0x5e, 0x5e, 0x1b, 0x65,  \
+      0x49, 0x74, 0x90, 0x13, 0x50, 0xef, 0x8c, 0xb8, 0xe8, 0xd9, 0x8e, 0x9c, \
+      0xc9, 0x4d, 0xda, 0x60, 0x6a, 0xa, 0xf9, 0x75, 0xd0, 0x62, 0x69, 0xd,   \
+      0xf5, 0xdc, 0xa9, 0xb9, 0x4c, 0x8, 0x9e, 0x33, 0x15, 0xa3, 0xe1, 0x42,  \
+      0x0, 0xe2, 0xb0, 0x46, 0xd0, 0xf7, 0xad, 0x55, 0xbc, 0x75, 0xe9, 0xe3,  \
+      0x1f, 0xa3, 0x41, 0x11, 0xba, 0xaa, 0x81, 0xf3, 0xcb, 0x82, 0x87, 0x71, \
+      0x0, 0xe6, 0xb9, 0x8c, 0xe1, 0xe9, 0xd3, 0x21, 0xcc, 0xcd, 0xe7, 0x12,  \
+      0xb9, 0xe, 0x43, 0x6a, 0xa3, 0x76, 0x5c, 0x35, 0x90, 0x45, 0x36, 0x52,  \
+      0xb4, 0x2d, 0xa3, 0x55, 0xde, 0x20, 0xf8, 0x80, 0xe1, 0x26, 0x46, 0x1b, \
+      0x3f, 0x59, 0xc7, 0x2e, 0x5b, 0x4a, 0x73, 0xf8, 0xb3, 0xf4, 0x62, 0xf4, \
+      0xf5, 0xa4, 0xc2, 0xae, 0x9e, 0xa6, 0x9c, 0x10, 0xbb, 0xe1, 0xd6, 0x88, \
+      0x75, 0xb9, 0x85, 0x48, 0xe5, 0x7, 0x12, 0xf3, 0x11, 0x85, 0x8e, 0xa2,  \
+      0x95, 0x9d, 0xed, 0x50, 0xfb, 0x6, 0x5a, 0x1, 0x37, 0xc4, 0x8e, 0x9e,   \
+      0x73, 0x9b, 0x96, 0x64, 0xbd, 0x42, 0xb, 0x80, 0xde, 0x57, 0x86, 0xcb,  \
+      0x7d, 0xab, 0x12, 0xb2, 0xcc, 0xe6, 0xea, 0xb5, 0x89, 0xeb, 0x91, 0xb3, \
+      0x93, 0xb2, 0x4f, 0x2f, 0x5b, 0xf3, 0x72, 0x12, 0x51, 0x56, 0x75, 0xb3, \
+      0xdd, 0x49, 0xb6, 0x5b, 0x77, 0xbe, 0xc5, 0xd7, 0xd4, 0xaf, 0xd6, 0x6b, \
+      0x38
+#define OBU_FRAME_2                                                          \
+  0x32, 0x33, 0x30, 0x3, 0xc3, 0x0, 0xa7, 0x2e, 0x46, 0xa8, 0x80, 0x0, 0x3,  \
+      0x0, 0x10, 0x1, 0x0, 0xa0, 0x0, 0xed, 0xb1, 0x51, 0x15, 0x58, 0xc7,    \
+      0x69, 0x3, 0x26, 0x35, 0xeb, 0x5a, 0x2d, 0x7a, 0x53, 0x24, 0x26, 0x20, \
+      0xa6, 0x11, 0x7, 0x49, 0x76, 0xa3, 0xc7, 0x62, 0xf8, 0x3, 0x32, 0xb0,  \
+      0x98, 0x17, 0x3d, 0x80
+#define OBU_METADATA_HDR_CLL 0x2a, 0x06, 0x01, 0x27, 0x10, 0x0d, 0xdf, 0x80
+#define OBU_METADATA_HDR_MDCV                                                 \
+  0x2a, 0x1a, 0x02, 0xae, 0x14, 0x51, 0xec, 0x43, 0xd7, 0xb0, 0xa4, 0x26,     \
+      0x66, 0x0f, 0x5c, 0x50, 0x0d, 0x54, 0x39, 0x00, 0x0f, 0xa0, 0x00, 0x00, \
+      0x00, 0x00, 0x52, 0x80
+#define OBU_METADATA_ITUT_T35                                                  \
+  0x2a, 0xf, 0x04, 0xa6, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, \
+      0x00, 0x80, 0x00, 0x00
+
+#endif  // LIBGAV1_SRC_DECODER_TEST_DATA_H_
diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc
new file mode 100644
index 0000000..3603750
--- /dev/null
@@ -0,0 +1,284 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
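+// The compound vertical pass keeps extra precision relative to the regular
+// vertical pass; this is the number of bits to remove. The +1 added to the
+// shift at each use also folds in the divide-by-2 of the two-way average.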
+constexpr int kInterPostRoundBit =
+    kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline uint8x8_t AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                                  const int16_t* LIBGAV1_RESTRICT
+                                      prediction_1) {
+  const int16x8_t pred0 = vld1q_s16(prediction_0);
+  const int16x8_t pred1 = vld1q_s16(prediction_1);
+  const int16x8_t res = vaddq_s16(pred0, pred1);
+  return vqrshrun_n_s16(res, kInterPostRoundBit + 1);
+}
+
+inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                                 const int16_t* LIBGAV1_RESTRICT prediction_1,
+                                 const int width,
+                                 uint8_t* LIBGAV1_RESTRICT dest) {
+  int x = width;
+  do {
+    const int16x8_t pred_00 = vld1q_s16(prediction_0);
+    const int16x8_t pred_01 = vld1q_s16(prediction_1);
+    prediction_0 += 8;
+    prediction_1 += 8;
+    const int16x8_t res0 = vaddq_s16(pred_00, pred_01);
+    const uint8x8_t res_out0 = vqrshrun_n_s16(res0, kInterPostRoundBit + 1);
+    const int16x8_t pred_10 = vld1q_s16(prediction_0);
+    const int16x8_t pred_11 = vld1q_s16(prediction_1);
+    prediction_0 += 8;
+    prediction_1 += 8;
+    const int16x8_t res1 = vaddq_s16(pred_10, pred_11);
+    const uint8x8_t res_out1 = vqrshrun_n_s16(res1, kInterPostRoundBit + 1);
+    vst1q_u8(dest, vcombine_u8(res_out0, res_out1));
+    dest += 16;
+    x -= 16;
+  } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                       const void* LIBGAV1_RESTRICT prediction_1,
+                       const int width, const int height,
+                       void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y = height;
+
+  if (width == 4) {
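+    // Each 8-lane result holds two 4-pixel rows, so blend two rows per
+    // iteration.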
+    do {
+      const uint8x8_t result = AverageBlend8Row(pred_0, pred_1);
+      pred_0 += 8;
+      pred_1 += 8;
+
+      StoreLo4(dst, result);
+      dst += dest_stride;
+      StoreHi4(dst, result);
+      dst += dest_stride;
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  if (width == 8) {
+    do {
+      vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
+      dst += dest_stride;
+      pred_0 += 8;
+      pred_1 += 8;
+
+      vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
+      dst += dest_stride;
+      pred_0 += 8;
+      pred_1 += 8;
+
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  do {
+    AverageBlendLargeRow(pred_0, pred_1, width, dst);
+    dst += dest_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    AverageBlendLargeRow(pred_0, pred_1, width, dst);
+    dst += dest_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    y -= 2;
+  } while (y != 0);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->average_blend = AverageBlend_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x8_t AverageBlend8Row(
+    const uint16_t* LIBGAV1_RESTRICT prediction_0,
+    const uint16_t* LIBGAV1_RESTRICT prediction_1,
+    const int32x4_t compound_offset, const uint16x8_t v_bitdepth) {
+  const uint16x8_t pred0 = vld1q_u16(prediction_0);
+  const uint16x8_t pred1 = vld1q_u16(prediction_1);
+  const uint32x4_t pred_lo =
+      vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1));
+  const uint32x4_t pred_hi =
+      vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1));
+  const int32x4_t offset_lo =
+      vsubq_s32(vreinterpretq_s32_u32(pred_lo), compound_offset);
+  const int32x4_t offset_hi =
+      vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset);
+  const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1);
+  const uint16x4_t res_hi = vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1);
+  return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth);
+}
+
+inline void AverageBlendLargeRow(const uint16_t* LIBGAV1_RESTRICT prediction_0,
+                                 const uint16_t* LIBGAV1_RESTRICT prediction_1,
+                                 const int width,
+                                 uint16_t* LIBGAV1_RESTRICT dest,
+                                 const int32x4_t compound_offset,
+                                 const uint16x8_t v_bitdepth) {
+  int x = width;
+  do {
+    vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+                                     compound_offset, v_bitdepth));
+    prediction_0 += 8;
+    prediction_1 += 8;
+    dest += 8;
+
+    vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+                                     compound_offset, v_bitdepth));
+    prediction_0 += 8;
+    prediction_1 += 8;
+    dest += 8;
+
+    x -= 16;
+  } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                       const void* LIBGAV1_RESTRICT prediction_1,
+                       const int width, const int height,
+                       void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y = height;
+
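+  // |dest_stride| is in bytes; convert to uint16_t units for pointer math.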
+  const ptrdiff_t dst_stride = dest_stride >> 1;
+  const int32x4_t compound_offset =
+      vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset));
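+  // Each compound prediction carries a kCompoundOffset bias, so their sum
+  // carries twice that; remove it before the rounding shift.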
+  const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  if (width == 4) {
+    do {
+      const uint16x8_t result =
+          AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth);
+      pred_0 += 8;
+      pred_1 += 8;
+
+      vst1_u16(dst, vget_low_u16(result));
+      dst += dst_stride;
+      vst1_u16(dst, vget_high_u16(result));
+      dst += dst_stride;
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  if (width == 8) {
+    do {
+      vst1q_u16(dst,
+                AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+      dst += dst_stride;
+      pred_0 += 8;
+      pred_1 += 8;
+
+      vst1q_u16(dst,
+                AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+      dst += dst_stride;
+      pred_0 += 8;
+      pred_1 += 8;
+
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  do {
+    AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+                         v_bitdepth);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+                         v_bitdepth);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    y -= 2;
+  } while (y != 0);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->average_blend = AverageBlend_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void AverageBlendInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void AverageBlendInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/average_blend_neon.h b/src/dsp/arm/average_blend_neon.h
new file mode 100644
index 0000000..d13bcd6
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc
new file mode 100644
index 0000000..da271f2
--- /dev/null
@@ -0,0 +1,804 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+//   for (int j = 0; j < 8; ++j) {
+//     const int x = 1;
+//     partial[0][i + j] += x;
+//     partial[1][i + j / 2] += x;
+//     partial[2][i] += x;
+//     partial[3][3 + i - j / 2] += x;
+//     partial[4][7 + i - j] += x;
+//     partial[5][3 - i / 2 + j] += x;
+//     partial[6][j] += x;
+//     partial[7][i / 2 + j] += x;
+//   }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16  17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25  26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34  35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43  44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52  53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61  62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70  71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(uint8x8_t* v_src,
+                                            uint16x8_t* partial_lo,
+                                            uint16x8_t* partial_hi) {
+  const uint8x8_t v_zero = vdup_n_u8(0);
+  // 00 01 02 03 04 05 06 07
+  // 00 10 11 12 13 14 15 16
+  *partial_lo = vaddl_u8(v_src[0], vext_u8(v_zero, v_src[1], 7));
+
+  // 00 00 20 21 22 23 24 25
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[2], 6));
+  // 17 00 00 00 00 00 00 00
+  // 26 27 00 00 00 00 00 00
+  *partial_hi =
+      vaddl_u8(vext_u8(v_src[1], v_zero, 7), vext_u8(v_src[2], v_zero, 6));
+
+  // 00 00 00 30 31 32 33 34
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[3], 5));
+  // 35 36 37 00 00 00 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[3], v_zero, 5));
+
+  // 00 00 00 00 40 41 42 43
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[4], 4));
+  // 44 45 46 47 00 00 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[4], v_zero, 4));
+
+  // 00 00 00 00 00 50 51 52
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[5], 3));
+  // 53 54 55 56 57 00 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[5], v_zero, 3));
+
+  // 00 00 00 00 00 00 60 61
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[6], 2));
+  // 62 63 64 65 66 67 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[6], v_zero, 2));
+
+  // 00 00 00 00 00 00 00 70
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[7], 1));
+  // 71 72 73 74 75 76 77 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[7], v_zero, 1));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00  00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00  00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00  00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00  00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3  00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2  F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1  G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0  H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(uint8x8_t* v_src,
+                                            uint16x8_t* partial_lo,
+                                            uint16x8_t* partial_hi) {
+  uint8x16_t v_d1_temp[8];
+  const uint8x8_t v_zero = vdup_n_u8(0);
+  const uint8x16_t v_zero_16 = vdupq_n_u8(0);
+
+  for (int i = 0; i < 8; ++i) {
+    v_d1_temp[i] = vcombine_u8(v_src[i], v_zero);
+  }
+
+  *partial_lo = *partial_hi = vdupq_n_u16(0);
+  // A0 A1 A2 A3 00 00 00 00
+  *partial_lo = vpadalq_u8(*partial_lo, v_d1_temp[0]);
+
+  // 00 B0 B1 B2 B3 00 00 00
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[1], 14));
+
+  // 00 00 C0 C1 C2 C3 00 00
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[2], 12));
+  // 00 00 00 D0 D1 D2 D3 00
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[3], 10));
+  // 00 00 00 00 E0 E1 E2 E3
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[4], 8));
+
+  // 00 00 00 00 00 F0 F1 F2
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[5], 6));
+  // F3 00 00 00 00 00 00 00
+  *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[5], v_zero_16, 6));
+
+  // 00 00 00 00 00 00 G0 G1
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[6], 4));
+  // G2 G3 00 00 00 00 00 00
+  *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[6], v_zero_16, 4));
+
+  // 00 00 00 00 00 00 00 H0
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[7], 2));
+  // H1 H2 H3 00 00 00 00 00
+  *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[7], v_zero_16, 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26  27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36  37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45  46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55  56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64  65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74  75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(uint8x8_t* v_src,
+                                            uint16x8_t* partial_lo,
+                                            uint16x8_t* partial_hi) {
+  const uint16x8_t v_zero = vdupq_n_u16(0);
+  uint16x8_t v_pair_add[4];
+  // Add vertical source pairs.
+  v_pair_add[0] = vaddl_u8(v_src[0], v_src[1]);
+  v_pair_add[1] = vaddl_u8(v_src[2], v_src[3]);
+  v_pair_add[2] = vaddl_u8(v_src[4], v_src[5]);
+  v_pair_add[3] = vaddl_u8(v_src[6], v_src[7]);
+
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  *partial_lo = v_pair_add[0];
+  // 00 00 00 00 00 00 00 00
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = vdupq_n_u16(0);
+
+  // 00 20 21 22 23 24 25 26
+  // 00 30 31 32 33 34 35 36
+  *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[1], 7));
+  // 27 00 00 00 00 00 00 00
+  // 37 00 00 00 00 00 00 00
+  *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[1], v_zero, 7));
+
+  // 00 00 40 41 42 43 44 45
+  // 00 00 50 51 52 53 54 55
+  *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[2], 6));
+  // 46 47 00 00 00 00 00 00
+  // 56 57 00 00 00 00 00 00
+  *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[2], v_zero, 6));
+
+  // 00 00 00 60 61 62 63 64
+  // 00 00 00 70 71 72 73 74
+  *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[3], 5));
+  // 65 66 67 00 00 00 00 00
+  // 75 76 77 00 00 00 00 00
+  *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[3], v_zero, 5));
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void AddPartial(const void* LIBGAV1_RESTRICT const source,
+                                      ptrdiff_t stride, uint16x8_t* partial_lo,
+                                      uint16x8_t* partial_hi) {
+  const auto* src = static_cast<const uint8_t*>(source);
+
+  // 8x8 input
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // 40 41 42 43 44 45 46 47
+  // 50 51 52 53 54 55 56 57
+  // 60 61 62 63 64 65 66 67
+  // 70 71 72 73 74 75 76 77
+  uint8x8_t v_src[8];
+  if (bitdepth == kBitdepth8) {
+    for (auto& v : v_src) {
+      v = vld1_u8(src);
+      src += stride;
+    }
+  } else {
+    // Shift the samples down by bitdepth - 8 so they fit in 8 bits.
+    constexpr int src_shift = (bitdepth == kBitdepth10) ? 2 : 4;
+    for (auto& v : v_src) {
+      v = vshrn_n_u16(vld1q_u16(reinterpret_cast<const uint16_t*>(src)),
+                      src_shift);
+      src += stride;
+    }
+  }
+  // partial for direction 2
+  // --------------------------------------------------------------------------
+  // partial[2][i] += x;
+  // 00 10 20 30 40 50 60 70  00 00 00 00 00 00 00 00
+  // 01 11 21 31 41 51 61 71  00 00 00 00 00 00 00 00
+  // 02 12 22 32 42 52 62 72  00 00 00 00 00 00 00 00
+  // 03 13 23 33 43 53 63 73  00 00 00 00 00 00 00 00
+  // 04 14 24 34 44 54 64 74  00 00 00 00 00 00 00 00
+  // 05 15 25 35 45 55 65 75  00 00 00 00 00 00 00 00
+  // 06 16 26 36 46 56 66 76  00 00 00 00 00 00 00 00
+  // 07 17 27 37 47 57 67 77  00 00 00 00 00 00 00 00
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[4]), partial_lo[2], 4);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[5]), partial_lo[2], 5);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[6]), partial_lo[2], 6);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[7]), partial_lo[2], 7);
+
+  // partial for direction 6
+  // --------------------------------------------------------------------------
+  // partial[6][j] += x;
+  // 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00 00
+  // 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00 00
+  // 20 21 22 23 24 25 26 27  00 00 00 00 00 00 00 00
+  // 30 31 32 33 34 35 36 37  00 00 00 00 00 00 00 00
+  // 40 41 42 43 44 45 46 47  00 00 00 00 00 00 00 00
+  // 50 51 52 53 54 55 56 57  00 00 00 00 00 00 00 00
+  // 60 61 62 63 64 65 66 67  00 00 00 00 00 00 00 00
+  // 70 71 72 73 74 75 76 77  00 00 00 00 00 00 00 00
+  partial_lo[6] = vaddl_u8(v_src[0], v_src[1]);
+  for (int i = 2; i < 8; ++i) {
+    partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
+  }
+
+  // partial for direction 0
+  AddPartial_D0_D4(v_src, &partial_lo[0], &partial_hi[0]);
+
+  // partial for direction 1
+  AddPartial_D1_D3(v_src, &partial_lo[1], &partial_hi[1]);
+
+  // partial for direction 7
+  AddPartial_D5_D7(v_src, &partial_lo[7], &partial_hi[7]);
+
+  uint8x8_t v_src_reverse[8];
+  for (int i = 0; i < 8; ++i) {
+    v_src_reverse[i] = vrev64_u8(v_src[i]);
+  }
+
+  // partial for direction 4
+  AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+  // partial for direction 3
+  AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+  // partial for direction 5
+  AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
+uint32x4_t Square(uint16x4_t a) { return vmull_u16(a, a); }
+
+uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) {
+  return vmlal_u16(a, b, b);
+}
+
+// |cost[0]| and |cost[4]| square the input and sum it with the corresponding
+// element from the other end of the vector, weighted by the matching
+// |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+//             kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+// Because everything is being summed into a single value, the distributive
+// property allows us to mirror the division table and accumulate once.
+uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b,
+                  const uint32x4_t division_table[4]) {
+  uint32x4_t c = vmulq_u32(Square(vget_low_u16(a)), division_table[0]);
+  c = vmlaq_u32(c, Square(vget_high_u16(a)), division_table[1]);
+  c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[2]);
+  c = vmlaq_u32(c, Square(vget_high_u16(b)), division_table[3]);
+  return SumVector(c);
+}
+
+// |cost[2]| and |cost[6]| square the input and accumulate:
+// cost[2] += Square(partial[2][i])
+uint32_t SquareAccumulate(const uint16x8_t a) {
+  uint32x4_t c = Square(vget_low_u16(a));
+  c = SquareAccumulate(c, vget_high_u16(a));
+  c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+  return SumVector(c);
+}
+
+uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask,
+                 const uint32x4_t division_table[2]) {
+  // Remove elements 0-2.
+  uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a)));
+  c = vaddq_u32(c, Square(vget_high_u16(a)));
+  c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+
+  c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]);
+  c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]);
+  return SumVector(c);
+}
+
+template <int bitdepth>
+void CdefDirection_NEON(const void* LIBGAV1_RESTRICT const source,
+                        ptrdiff_t stride,
+                        uint8_t* LIBGAV1_RESTRICT const direction,
+                        int* LIBGAV1_RESTRICT const variance) {
+  assert(direction != nullptr);
+  assert(variance != nullptr);
+  const auto* src = static_cast<const uint8_t*>(source);
+
+  uint32_t cost[8];
+  uint16x8_t partial_lo[8], partial_hi[8];
+
+  AddPartial<bitdepth>(src, stride, partial_lo, partial_hi);
+
+  cost[2] = SquareAccumulate(partial_lo[2]);
+  cost[6] = SquareAccumulate(partial_lo[6]);
+
+  const uint32x4_t division_table[4] = {
+      vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4),
+      vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)};
+
+  cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+  cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+  const uint32x4_t division_table_odd[2] = {
+      vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)};
+
+  const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)};
+
+  cost[1] =
+      CostOdd(partial_lo[1], partial_hi[1], element_3_mask, division_table_odd);
+  cost[3] =
+      CostOdd(partial_lo[3], partial_hi[3], element_3_mask, division_table_odd);
+  cost[5] =
+      CostOdd(partial_lo[5], partial_hi[5], element_3_mask, division_table_odd);
+  cost[7] =
+      CostOdd(partial_lo[7], partial_hi[7], element_3_mask, division_table_odd);
+
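+  // The winning direction is the one with the largest cost; |variance|
+  // measures how decisively it beats the orthogonal direction.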
+  uint32_t best_cost = 0;
+  *direction = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      *direction = i;
+    }
+  }
+  *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+                   const ptrdiff_t stride, uint16x8_t* output,
+                   const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by also loading each offset's negation. For |direction| == 0 this
+  // gives a diagonal line from top right to bottom left. The first value is
+  // y, the second x. Negative y values move up.
+  //    a       b         c       d
+  // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+  //         c
+  //       a
+  //     0
+  //   b
+  // d
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = vld1q_u16(src + y_0 * stride + x_0);
+  output[1] = vld1q_u16(src - y_0 * stride - x_0);
+  output[2] = vld1q_u16(src + y_1 * stride + x_1);
+  output[3] = vld1q_u16(src - y_1 * stride - x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t stride, uint16x8_t* output,
+                    const int direction) {
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = vcombine_u16(vld1_u16(src + y_0 * stride + x_0),
+                           vld1_u16(src + y_0 * stride + stride + x_0));
+  output[1] = vcombine_u16(vld1_u16(src - y_0 * stride - x_0),
+                           vld1_u16(src - y_0 * stride + stride - x_0));
+  output[2] = vcombine_u16(vld1_u16(src + y_1 * stride + x_1),
+                           vld1_u16(src + y_1 * stride + stride + x_1));
+  output[3] = vcombine_u16(vld1_u16(src - y_1 * stride - x_1),
+                           vld1_u16(src - y_1 * stride + stride - x_1));
+}
+
+int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
+                    const uint16x8_t threshold, const int16x8_t damping) {
+  // If reference > pixel, the difference will be negative, so convert to 0 or
+  // -1.
+  const uint16x8_t sign = vcgtq_u16(reference, pixel);
+  const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
+  const uint16x8_t shifted_diff = vshlq_u16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+  // larger than threshold, so subtracting with saturation returns 0 when
+  // pixel == kCdefLargeValue.
+  static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+  const uint16x8_t thresh_minus_shifted_diff =
+      vqsubq_u16(threshold, shifted_diff);
+  const uint16x8_t clamp_abs_diff =
+      vminq_u16(thresh_minus_shifted_diff, abs_diff);
+  // Restore the sign.
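+  // (v ^ sign) - sign is the identity when sign is 0 and two's complement
+  // negation when sign is all ones.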
+  return vreinterpretq_s16_u16(
+      vsubq_u16(veorq_u16(clamp_abs_diff, sign), sign));
+}
+
+template <typename Pixel>
+uint16x8_t GetMaxPrimary(uint16x8_t* primary_val, uint16x8_t max,
+                         uint16x8_t cdef_large_value_mask) {
+  if (sizeof(Pixel) == 1) {
+    // The source is 16 bits; however, we only really care about the lower
+    // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
+    // primary max has been calculated, zero out the upper 8 bits.  Use this
+    // to find the "16 bit" max.
+    const uint8x16_t max_p01 = vmaxq_u8(vreinterpretq_u8_u16(primary_val[0]),
+                                        vreinterpretq_u8_u16(primary_val[1]));
+    const uint8x16_t max_p23 = vmaxq_u8(vreinterpretq_u8_u16(primary_val[2]),
+                                        vreinterpretq_u8_u16(primary_val[3]));
+    const uint16x8_t max_p = vreinterpretq_u16_u8(vmaxq_u8(max_p01, max_p23));
+    max = vmaxq_u16(max, vandq_u16(max_p, cdef_large_value_mask));
+  } else {
+    // Convert kCdefLargeValue to 0 before calculating max.
+    max = vmaxq_u16(max, vandq_u16(primary_val[0], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(primary_val[1], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(primary_val[2], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(primary_val[3], cdef_large_value_mask));
+  }
+  return max;
+}
+
+template <typename Pixel>
+uint16x8_t GetMaxSecondary(uint16x8_t* secondary_val, uint16x8_t max,
+                           uint16x8_t cdef_large_value_mask) {
+  if (sizeof(Pixel) == 1) {
+    const uint8x16_t max_s01 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[0]),
+                                        vreinterpretq_u8_u16(secondary_val[1]));
+    const uint8x16_t max_s23 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[2]),
+                                        vreinterpretq_u8_u16(secondary_val[3]));
+    const uint8x16_t max_s45 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[4]),
+                                        vreinterpretq_u8_u16(secondary_val[5]));
+    const uint8x16_t max_s67 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[6]),
+                                        vreinterpretq_u8_u16(secondary_val[7]));
+    const uint16x8_t max_s = vreinterpretq_u16_u8(
+        vmaxq_u8(vmaxq_u8(max_s01, max_s23), vmaxq_u8(max_s45, max_s67)));
+    max = vmaxq_u16(max, vandq_u16(max_s, cdef_large_value_mask));
+  } else {
+    max = vmaxq_u16(max, vandq_u16(secondary_val[0], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[1], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[2], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[3], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[4], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[5], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[6], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[7], cdef_large_value_mask));
+  }
+  return max;
+}
+
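+// Store one 8-wide row, or two 4-wide rows when width == 4.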
+template <typename Pixel, int width>
+void StorePixels(void* dest, ptrdiff_t dst_stride, int16x8_t result) {
+  auto* const dst8 = static_cast<uint8_t*>(dest);
+  if (sizeof(Pixel) == 1) {
+    const uint8x8_t dst_pixel = vqmovun_s16(result);
+    if (width == 8) {
+      vst1_u8(dst8, dst_pixel);
+    } else {
+      StoreLo4(dst8, dst_pixel);
+      StoreHi4(dst8 + dst_stride, dst_pixel);
+    }
+  } else {
+    const uint16x8_t dst_pixel = vreinterpretq_u16_s16(result);
+    auto* const dst16 = reinterpret_cast<uint16_t*>(dst8);
+    if (width == 8) {
+      vst1q_u16(dst16, dst_pixel);
+    } else {
+      auto* const dst16_next_row =
+          reinterpret_cast<uint16_t*>(dst8 + dst_stride);
+      vst1_u16(dst16, vget_low_u16(dst_pixel));
+      vst1_u16(dst16_next_row, vget_high_u16(dst_pixel));
+    }
+  }
+}
+
+template <int width, typename Pixel, bool enable_primary = true,
+          bool enable_secondary = true>
+void CdefFilter_NEON(const uint16_t* LIBGAV1_RESTRICT src,
+                     const ptrdiff_t src_stride, const int height,
+                     const int primary_strength, const int secondary_strength,
+                     const int damping, const int direction,
+                     void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
+  static_assert(width == 8 || width == 4, "");
+  static_assert(enable_primary || enable_secondary, "");
+  constexpr bool clipping_required = enable_primary && enable_secondary;
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint16x8_t cdef_large_value_mask =
+      vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue));
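+  // ANDing with ~kCdefLargeValue maps padded pixels (== kCdefLargeValue) to 0
+  // so they never win the max computations below.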
+  const uint16x8_t primary_threshold = vdupq_n_u16(primary_strength);
+  const uint16x8_t secondary_threshold = vdupq_n_u16(secondary_strength);
+
+  int16x8_t primary_damping_shift, secondary_damping_shift;
+
+  // FloorLog2() requires input to be > 0.
+  // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+  // 10-bit damping range: Y: [3, 6 + 2], UV: [2, 5 + 2].
+  if (enable_primary) {
+    // 8-bit primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is
+    // necessary for UV filtering.
+    // 10-bit primary_strength: [0, 15 << 2].
+    primary_damping_shift =
+        vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength)));
+  }
+
+  if (enable_secondary) {
+    if (sizeof(Pixel) == 1) {
+      // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+      // necessary.
+      assert(damping - FloorLog2(secondary_strength) >= 0);
+      secondary_damping_shift =
+          vdupq_n_s16(-(damping - FloorLog2(secondary_strength)));
+    } else {
+      // secondary_strength: [0, 4 << 2]
+      secondary_damping_shift =
+          vdupq_n_s16(-std::max(0, damping - FloorLog2(secondary_strength)));
+    }
+  }
+
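+  // 10-bit strengths arrive scaled by 4 (see the ranges above); shift them
+  // back down so the low bit indexes the 8-bit tap table.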
+  constexpr int coeff_shift = (sizeof(Pixel) == 1) ? 0 : kBitdepth10 - 8;
+  const int primary_tap_0 =
+      kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][0];
+  const int primary_tap_1 =
+      kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][1];
+
+  int y = height;
+  do {
+    uint16x8_t pixel;
+    if (width == 8) {
+      pixel = vld1q_u16(src);
+    } else {
+      pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride));
+    }
+
+    uint16x8_t min = pixel;
+    uint16x8_t max = pixel;
+    int16x8_t sum;
+
+    if (enable_primary) {
+      // Primary |direction|.
+      uint16x8_t primary_val[4];
+      if (width == 8) {
+        LoadDirection(src, src_stride, primary_val, direction);
+      } else {
+        LoadDirection4(src, src_stride, primary_val, direction);
+      }
+
+      if (clipping_required) {
+        min = vminq_u16(min, primary_val[0]);
+        min = vminq_u16(min, primary_val[1]);
+        min = vminq_u16(min, primary_val[2]);
+        min = vminq_u16(min, primary_val[3]);
+
+        max = GetMaxPrimary<Pixel>(primary_val, max, cdef_large_value_mask);
+      }
+
+      sum = Constrain(primary_val[0], pixel, primary_threshold,
+                      primary_damping_shift);
+      sum = vmulq_n_s16(sum, primary_tap_0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(primary_val[1], pixel, primary_threshold,
+                                  primary_damping_shift),
+                        primary_tap_0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(primary_val[2], pixel, primary_threshold,
+                                  primary_damping_shift),
+                        primary_tap_1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(primary_val[3], pixel, primary_threshold,
+                                  primary_damping_shift),
+                        primary_tap_1);
+    } else {
+      sum = vdupq_n_s16(0);
+    }
+
+    if (enable_secondary) {
+      // Secondary |direction| values (+/- 2). Clamp |direction|.
+      uint16x8_t secondary_val[8];
+      if (width == 8) {
+        LoadDirection(src, src_stride, secondary_val, direction + 2);
+        LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+      } else {
+        LoadDirection4(src, src_stride, secondary_val, direction + 2);
+        LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+      }
+
+      if (clipping_required) {
+        min = vminq_u16(min, secondary_val[0]);
+        min = vminq_u16(min, secondary_val[1]);
+        min = vminq_u16(min, secondary_val[2]);
+        min = vminq_u16(min, secondary_val[3]);
+        min = vminq_u16(min, secondary_val[4]);
+        min = vminq_u16(min, secondary_val[5]);
+        min = vminq_u16(min, secondary_val[6]);
+        min = vminq_u16(min, secondary_val[7]);
+
+        max = GetMaxSecondary<Pixel>(secondary_val, max, cdef_large_value_mask);
+      }
+
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[0], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[1], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[2], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[3], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[4], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[5], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[6], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[7], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+    }
+    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
+    const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15);
+    sum = vaddq_s16(sum, sum_lt_0);
+    int16x8_t result = vrsraq_n_s16(vreinterpretq_s16_u16(pixel), sum, 4);
+    if (clipping_required) {
+      result = vminq_s16(result, vreinterpretq_s16_u16(max));
+      result = vmaxq_s16(result, vreinterpretq_s16_u16(min));
+    }
+
+    StorePixels<Pixel, width>(dst, dst_stride, result);
+
+    src += (width == 8) ? src_stride : src_stride << 1;
+    dst += (width == 8) ? dst_stride : dst_stride << 1;
+    y -= (width == 8) ? 1 : 2;
+  } while (y != 0);
+}
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->cdef_direction = CdefDirection_NEON<kBitdepth8>;
+  dsp->cdef_filters[0][0] = CdefFilter_NEON<4, uint8_t>;
+  dsp->cdef_filters[0][1] = CdefFilter_NEON<4, uint8_t, /*enable_primary=*/true,
+                                            /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_NEON<4, uint8_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_NEON<8, uint8_t>;
+  dsp->cdef_filters[1][1] = CdefFilter_NEON<8, uint8_t, /*enable_primary=*/true,
+                                            /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_NEON<8, uint8_t, /*enable_primary=*/false>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->cdef_direction = CdefDirection_NEON<kBitdepth10>;
+  dsp->cdef_filters[0][0] = CdefFilter_NEON<4, uint16_t>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_NEON<4, uint16_t, /*enable_primary=*/true,
+                      /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_NEON<4, uint16_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_NEON<8, uint16_t>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_NEON<8, uint16_t, /*enable_primary=*/true,
+                      /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_NEON<8, uint16_t, /*enable_primary=*/false>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void CdefInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/cdef_neon.h b/src/dsp/arm/cdef_neon.h
new file mode 100644
index 0000000..ef8ed3c
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_CdefDirection LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_CdefFilters LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
new file mode 100644
index 0000000..c0af2c1
--- /dev/null
@@ -0,0 +1,1224 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/utils/compiler_attributes.h"
+
+#if 0
+#include <cstdio>
+#include <string>
+
+constexpr bool kEnablePrintRegs = true;
+
+union DebugRegister {
+  int8_t i8[8];
+  int16_t i16[4];
+  int32_t i32[2];
+  uint8_t u8[8];
+  uint16_t u16[4];
+  uint32_t u32[2];
+};
+
+union DebugRegisterQ {
+  int8_t i8[16];
+  int16_t i16[8];
+  int32_t i32[4];
+  uint8_t u8[16];
+  uint16_t u16[8];
+  uint32_t u32[4];
+};
+
+// Quite useful macro for debugging. Left here for convenience.
+inline void PrintVect(const DebugRegister r, const char* const name, int size) {
+  int n;
+  if (kEnablePrintRegs) {
+    fprintf(stderr, "%s\t: ", name);
+    if (size == 8) {
+      for (n = 0; n < 8; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
+    } else if (size == 16) {
+      for (n = 0; n < 4; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
+    } else if (size == 32) {
+      for (n = 0; n < 2; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
+    }
+    fprintf(stderr, "\n");
+  }
+}
+
+// Debugging macro for 128-bit types.
+inline void PrintVectQ(const DebugRegisterQ r, const char* const name,
+                       int size) {
+  int n;
+  if (kEnablePrintRegs) {
+    fprintf(stderr, "%s\t: ", name);
+    if (size == 8) {
+      for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
+    } else if (size == 16) {
+      for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
+    } else if (size == 32) {
+      for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
+    }
+    fprintf(stderr, "\n");
+  }
+}
+
+inline void PrintReg(const int32x4x2_t val, const std::string& name) {
+  DebugRegisterQ r;
+  vst1q_s32(r.i32, val.val[0]);
+  const std::string name0 = name + std::string(".val[0]");
+  PrintVectQ(r, name0.c_str(), 32);
+  vst1q_s32(r.i32, val.val[1]);
+  const std::string name1 = name + std::string(".val[1]");
+  PrintVectQ(r, name1.c_str(), 32);
+}
+
+inline void PrintReg(const uint32x4_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_u32(r.u32, val);
+  PrintVectQ(r, name, 32);
+}
+
+inline void PrintReg(const uint32x2_t val, const char* name) {
+  DebugRegister r;
+  vst1_u32(r.u32, val);
+  PrintVect(r, name, 32);
+}
+
+inline void PrintReg(const uint16x8_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_u16(r.u16, val);
+  PrintVectQ(r, name, 16);
+}
+
+inline void PrintReg(const uint16x4_t val, const char* name) {
+  DebugRegister r;
+  vst1_u16(r.u16, val);
+  PrintVect(r, name, 16);
+}
+
+inline void PrintReg(const uint8x16_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_u8(r.u8, val);
+  PrintVectQ(r, name, 8);
+}
+
+inline void PrintReg(const uint8x8_t val, const char* name) {
+  DebugRegister r;
+  vst1_u8(r.u8, val);
+  PrintVect(r, name, 8);
+}
+
+inline void PrintReg(const int32x4_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_s32(r.i32, val);
+  PrintVectQ(r, name, 32);
+}
+
+inline void PrintReg(const int32x2_t val, const char* name) {
+  DebugRegister r;
+  vst1_s32(r.i32, val);
+  PrintVect(r, name, 32);
+}
+
+inline void PrintReg(const int16x8_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_s16(r.i16, val);
+  PrintVectQ(r, name, 16);
+}
+
+inline void PrintReg(const int16x4_t val, const char* name) {
+  DebugRegister r;
+  vst1_s16(r.i16, val);
+  PrintVect(r, name, 16);
+}
+
+inline void PrintReg(const int8x16_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_s8(r.i8, val);
+  PrintVectQ(r, name, 8);
+}
+
+inline void PrintReg(const int8x8_t val, const char* name) {
+  DebugRegister r;
+  vst1_s8(r.i8, val);
+  PrintVect(r, name, 8);
+}
+
+// Print an individual (non-vector) value in decimal format.
+inline void PrintReg(const int x, const char* name) {
+  if (kEnablePrintRegs) {
+    fprintf(stderr, "%s: %d\n", name, x);
+  }
+}
+
+// Print an individual (non-vector) value in hexadecimal format.
+inline void PrintHex(const int x, const char* name) {
+  if (kEnablePrintRegs) {
+    fprintf(stderr, "%s: %x\n", name, x);
+  }
+}
+
+#define PR(x) PrintReg(x, #x)
+#define PD(x) PrintReg(x, #x)
+#define PX(x) PrintHex(x, #x)
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+
+inline void PrintShadow(const void* r, const char* const name,
+                        const size_t size) {
+  if (kEnablePrintRegs) {
+    fprintf(stderr, "Shadow for %s:\n", name);
+    __msan_print_shadow(r, size);
+  }
+}
+#define PS(var, N) PrintShadow(var, #var, N)
+
+#endif  // LIBGAV1_MSAN
+
+#endif  // 0
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+// Load 2 uint8_t values into lanes 0 and 1. Zeros the register before loading
+// the values. Use caution when using this in loops because it will re-zero the
+// register before loading on every iteration.
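+// The memcpy avoids an unaligned load; see ValueToMem() for the store side.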
+inline uint8x8_t Load2(const void* const buf) {
+  const uint16x4_t zero = vdup_n_u16(0);
+  uint16_t temp;
+  memcpy(&temp, buf, 2);
+  return vreinterpret_u8_u16(vld1_lane_u16(&temp, zero, 0));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
+  uint16_t temp;
+  memcpy(&temp, buf, 2);
+  return vreinterpret_u8_u16(
+      vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
+}
+
+template <int lane>
+inline uint16x4_t Load2(const void* const buf, uint16x4_t val) {
+  uint32_t temp;
+  memcpy(&temp, buf, 4);
+  return vreinterpret_u16_u32(
+      vld1_lane_u32(&temp, vreinterpret_u32_u16(val), lane));
+}
+
+// Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
+// register before loading the values. Use caution when using this in loops
+// because it will re-zero the register before loading on every iteration.
+inline uint8x8_t Load4(const void* const buf) {
+  const uint32x2_t zero = vdup_n_u32(0);
+  uint32_t temp;
+  memcpy(&temp, buf, 4);
+  return vreinterpret_u8_u32(vld1_lane_u32(&temp, zero, 0));
+}
+
+// Load 4 uint8_t values into 4 lanes starting with |lane| * 4.
+template <int lane>
+inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
+  uint32_t temp;
+  memcpy(&temp, buf, 4);
+  return vreinterpret_u8_u32(
+      vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
+}
+
+// Convenience functions for 16-bit loads from a uint8_t* source.
+inline uint16x4_t Load4U16(const void* const buf) {
+  return vld1_u16(static_cast<const uint16_t*>(buf));
+}
+
+inline uint16x8_t Load8U16(const void* const buf) {
+  return vld1q_u16(static_cast<const uint16_t*>(buf));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
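+// MaskOverreads() zeroes the lanes read past the caller's valid region so
+// deliberate overreads stay well-defined under MSan.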
+inline uint8x8_t MaskOverreads(const uint8x8_t source,
+                               const ptrdiff_t over_read_in_bytes) {
+  uint8x8_t dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes > 0) {
+    uint8x8_t mask = vdup_n_u8(0);
+    uint8x8_t valid_element_mask = vdup_n_u8(-1);
+    const int valid_bytes =
+        std::min(8, 8 - static_cast<int>(over_read_in_bytes));
+    for (int i = 0; i < valid_bytes; ++i) {
+      // Feed ff bytes into |mask| one at a time.
+      mask = vext_u8(valid_element_mask, mask, 7);
+    }
+    dst = vand_u8(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
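+
+// For example, a caller that loads 8 bytes from a row holding only
+// |valid_width| (< 8) valid bytes can zero the trailing lanes (the names here
+// are illustrative):
+//   const uint8x8_t v = MaskOverreads(vld1_u8(row), 8 - valid_width);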
+
+inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  uint8x16_t dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes > 0) {
+    uint8x16_t mask = vdupq_n_u8(0);
+    uint8x16_t valid_element_mask = vdupq_n_u8(-1);
+    const int valid_bytes =
+        std::min(16, 16 - static_cast<int>(over_read_in_bytes));
+    for (int i = 0; i < valid_bytes; ++i) {
+      // Feed ff bytes into |mask| one at a time.
+      mask = vextq_u8(valid_element_mask, mask, 15);
+    }
+    dst = vandq_u8(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
+
+inline uint16x8_t MaskOverreadsQ(const uint16x8_t source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  return vreinterpretq_u16_u8(
+      MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes));
+}
+
+inline uint8x8_t Load1MsanU8(const uint8_t* const source,
+                             const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(vld1_u8(source), over_read_in_bytes);
+}
+
+inline uint8x16_t Load1QMsanU8(const uint8_t* const source,
+                               const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreadsQ(vld1q_u8(source), over_read_in_bytes);
+}
+
+inline uint16x8_t Load1QMsanU16(const uint16_t* const source,
+                                const ptrdiff_t over_read_in_bytes) {
+  return vreinterpretq_u16_u8(MaskOverreadsQ(
+      vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes));
+}
+
+inline uint32x4_t Load1QMsanU32(const uint32_t* const source,
+                                const ptrdiff_t over_read_in_bytes) {
+  return vreinterpretq_u32_u8(MaskOverreadsQ(
+      vreinterpretq_u8_u32(vld1q_u32(source)), over_read_in_bytes));
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+// Propagate type information to the compiler. Without this the compiler may
+// assume the required alignment of the type (4 bytes in the case of uint32_t)
+// and add alignment hints to the memory access.
+template <typename T>
+inline void ValueToMem(void* const buf, T val) {
+  memcpy(buf, &val, sizeof(val));
+}
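+
+// For example, writing through static_cast<uint32_t*>(buf) would let the
+// compiler assume 4-byte alignment, which |buf| is not guaranteed to have;
+// ValueToMem<uint32_t>(buf, val) makes no such assumption.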
+
+// Store 4 int8_t values from the low half of an int8x8_t register.
+inline void StoreLo4(void* const buf, const int8x8_t val) {
+  ValueToMem<int32_t>(buf, vget_lane_s32(vreinterpret_s32_s8(val), 0));
+}
+
+// Store 4 uint8_t values from the low half of a uint8x8_t register.
+inline void StoreLo4(void* const buf, const uint8x8_t val) {
+  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
+}
+
+// Store 4 uint8_t values from the high half of a uint8x8_t register.
+inline void StoreHi4(void* const buf, const uint8x8_t val) {
+  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
+}
+
+// Store 2 uint8_t values from |lane| * 2 and |lane| * 2 + 1 of a uint8x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint8x8_t val) {
+  ValueToMem<uint16_t>(buf, vget_lane_u16(vreinterpret_u16_u8(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint16x8_t val) {
+  ValueToMem<uint32_t>(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x4_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint16x4_t val) {
+  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
+}
+
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store4(void* const buf, const uint16x4_t val) {
+  vst1_u16(static_cast<uint16_t*>(buf), val);
+}
+
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store8(void* const buf, const uint16x8_t val) {
+  vst1q_u16(static_cast<uint16_t*>(buf), val);
+}
+
+inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) {
+#if LIBGAV1_MSAN
+  // The memory shadow is incorrect for vst4q_u16, only marking the first 16
+  // bytes of the destination as initialized. To avoid missing truly
+  // uninitialized memory, check the input vectors first, before marking the
+  // whole 64 bytes initialized. If any input vector contains unused values, it
+  // should pass through MaskOverreadsQ first.
+  __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0]));
+  __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1]));
+  __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2]));
+  __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3]));
+  vst4q_s16(static_cast<int16_t*>(buf), src);
+  __msan_unpoison(buf, sizeof(int16x8x4_t));
+#else
+  vst4q_s16(static_cast<int16_t*>(buf), src);
+#endif  // LIBGAV1_MSAN
+}
+
+//------------------------------------------------------------------------------
+// Pointer helpers.
+
+// This function adds |stride|, given as a number of bytes, to a pointer to a
+// larger type, using native pointer arithmetic.
+template <typename T>
+inline T* AddByteStride(T* ptr, const ptrdiff_t stride) {
+  return reinterpret_cast<T*>(
+      const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(ptr) + stride));
+}
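+
+// For example, with a uint16_t* |dst| and a |stride| in bytes (illustrative
+// names):
+//   dst = AddByteStride(dst, stride);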
+
+//------------------------------------------------------------------------------
+// Multiply.
+
+// Shim vmull_high_u16 for armv7.
+inline uint32x4_t VMullHighU16(const uint16x8_t a, const uint16x8_t b) {
+#if defined(__aarch64__)
+  return vmull_high_u16(a, b);
+#else
+  return vmull_u16(vget_high_u16(a), vget_high_u16(b));
+#endif
+}
+
+// Shim vmull_high_s16 for armv7.
+inline int32x4_t VMullHighS16(const int16x8_t a, const int16x8_t b) {
+#if defined(__aarch64__)
+  return vmull_high_s16(a, b);
+#else
+  return vmull_s16(vget_high_s16(a), vget_high_s16(b));
+#endif
+}
+
+// Shim vmlal_high_u16 for armv7.
+inline uint32x4_t VMlalHighU16(const uint32x4_t a, const uint16x8_t b,
+                               const uint16x8_t c) {
+#if defined(__aarch64__)
+  return vmlal_high_u16(a, b, c);
+#else
+  return vmlal_u16(a, vget_high_u16(b), vget_high_u16(c));
+#endif
+}
+
+// Shim vmlal_high_s16 for armv7.
+inline int32x4_t VMlalHighS16(const int32x4_t a, const int16x8_t b,
+                              const int16x8_t c) {
+#if defined(__aarch64__)
+  return vmlal_high_s16(a, b, c);
+#else
+  return vmlal_s16(a, vget_high_s16(b), vget_high_s16(c));
+#endif
+}
+
+// Shim vmul_laneq_u16 for armv7.
+template <int lane>
+inline uint16x4_t VMulLaneQU16(const uint16x4_t a, const uint16x8_t b) {
+#if defined(__aarch64__)
+  return vmul_laneq_u16(a, b, lane);
+#else
+  if (lane < 4) return vmul_lane_u16(a, vget_low_u16(b), lane & 0x3);
+  return vmul_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
+#endif
+}
+
+// Shim vmulq_laneq_u16 for armv7.
+template <int lane>
+inline uint16x8_t VMulQLaneQU16(const uint16x8_t a, const uint16x8_t b) {
+#if defined(__aarch64__)
+  return vmulq_laneq_u16(a, b, lane);
+#else
+  if (lane < 4) return vmulq_lane_u16(a, vget_low_u16(b), lane & 0x3);
+  return vmulq_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
+#endif
+}
+
+// Shim vmla_laneq_u16 for armv7.
+template <int lane>
+inline uint16x4_t VMlaLaneQU16(const uint16x4_t a, const uint16x4_t b,
+                               const uint16x8_t c) {
+#if defined(__aarch64__)
+  return vmla_laneq_u16(a, b, c, lane);
+#else
+  if (lane < 4) return vmla_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
+  return vmla_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
+#endif
+}
+
+// Shim vmlaq_laneq_u16 for armv7.
+template <int lane>
+inline uint16x8_t VMlaQLaneQU16(const uint16x8_t a, const uint16x8_t b,
+                                const uint16x8_t c) {
+#if defined(__aarch64__)
+  return vmlaq_laneq_u16(a, b, c, lane);
+#else
+  if (lane < 4) return vmlaq_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
+  return vmlaq_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Bit manipulation.
+
+// vshXX_n_XX() requires an immediate.
+template <int shift>
+inline uint8x8_t LeftShiftVector(const uint8x8_t vector) {
+  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline uint8x8_t RightShiftVector(const uint8x8_t vector) {
+  return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline int8x8_t RightShiftVector(const int8x8_t vector) {
+  return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
+}
+
+// Shim vqtbl1_u8 for armv7.
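+// Both paths zero lanes whose index is out of range, so they agree for any
+// |index|.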
+inline uint8x8_t VQTbl1U8(const uint8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl1_u8(a, index);
+#else
+  const uint8x8x2_t b = {vget_low_u8(a), vget_high_u8(a)};
+  return vtbl2_u8(b, index);
+#endif
+}
+
+// Shim vqtbl2_u8 for armv7.
+inline uint8x8_t VQTbl2U8(const uint8x16x2_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl2_u8(a, index);
+#else
+  const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
+                         vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
+  return vtbl4_u8(b, index);
+#endif
+}
+
+// Shim vqtbl2q_u8 for armv7.
+inline uint8x16_t VQTbl2QU8(const uint8x16x2_t a, const uint8x16_t index) {
+#if defined(__aarch64__)
+  return vqtbl2q_u8(a, index);
+#else
+  return vcombine_u8(VQTbl2U8(a, vget_low_u8(index)),
+                     VQTbl2U8(a, vget_high_u8(index)));
+#endif
+}
+
+// Shim vqtbl3_u8 for armv7.
+inline uint8x8_t VQTbl3U8(const uint8x16x3_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl3_u8(a, index);
+#else
+  const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
+                         vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
+  const uint8x8x2_t c = {vget_low_u8(a.val[2]), vget_high_u8(a.val[2])};
+  const uint8x8_t index_ext = vsub_u8(index, vdup_n_u8(32));
+  const uint8x8_t partial_lookup = vtbl4_u8(b, index);
+  return vtbx2_u8(partial_lookup, c, index_ext);
+#endif
+}
+
+// Shim vqtbl3q_u8 for armv7.
+inline uint8x16_t VQTbl3QU8(const uint8x16x3_t a, const uint8x16_t index) {
+#if defined(__aarch64__)
+  return vqtbl3q_u8(a, index);
+#else
+  return vcombine_u8(VQTbl3U8(a, vget_low_u8(index)),
+                     VQTbl3U8(a, vget_high_u8(index)));
+#endif
+}
+
+// Shim vqtbl1_s8 for armv7.
+inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl1_s8(a, index);
+#else
+  const int8x8x2_t b = {vget_low_s8(a), vget_high_s8(a)};
+  return vtbl2_s8(b, vreinterpret_s8_u8(index));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Saturation helpers.
+
+inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low,
+                          const int16x4_t high) {
+  return vmin_s16(vmax_s16(val, low), high);
+}
+
+inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low,
+                          const int16x8_t high) {
+  return vminq_s16(vmaxq_s16(val, low), high);
+}
+
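+// Clamp |val| to the unsigned pixel range [0, (1 << bitdepth) - 1].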
+inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) {
+  const int16x8_t low = vdupq_n_s16(0);
+  const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
+
+  return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(val, low)), high);
+}
+
+//------------------------------------------------------------------------------
+// Interleave.
+
+// vzipN is exclusive to A64.
+inline uint8x8_t InterleaveLow8(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+  return vzip1_u8(a, b);
+#else
+  // Discard |.val[1]|
+  return vzip_u8(a, b).val[0];
+#endif
+}
+
+inline uint8x8_t InterleaveLow32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_u8_u32(
+      vzip1_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+  // Discard |.val[1]|
+  return vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[0]);
+#endif
+}
+
+inline int8x8_t InterleaveLow32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_s8_u32(
+      vzip1_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+  // Discard |.val[1]|
+  return vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[0]);
+#endif
+}
+
+inline uint8x8_t InterleaveHigh32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_u8_u32(
+      vzip2_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+  // Discard |.val[0]|
+  return vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[1]);
+#endif
+}
+
+inline int8x8_t InterleaveHigh32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_s8_u32(
+      vzip2_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+  // Discard |.val[0]|
+  return vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[1]);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Sum.
+
+inline uint16_t SumVector(const uint8x8_t a) {
+#if defined(__aarch64__)
+  return vaddlv_u8(a);
+#else
+  const uint16x4_t c = vpaddl_u8(a);
+  const uint32x2_t d = vpaddl_u16(c);
+  const uint64x1_t e = vpaddl_u32(d);
+  return static_cast<uint16_t>(vget_lane_u64(e, 0));
+#endif  // defined(__aarch64__)
+}
+
+inline uint32_t SumVector(const uint32x2_t a) {
+#if defined(__aarch64__)
+  return vaddv_u32(a);
+#else
+  const uint64x1_t b = vpaddl_u32(a);
+  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif  // defined(__aarch64__)
+}
+
+inline uint32_t SumVector(const uint32x4_t a) {
+#if defined(__aarch64__)
+  return vaddvq_u32(a);
+#else
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+  return static_cast<uint32_t>(vget_lane_u64(c, 0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Transpose.
+
+// Transpose 32 bit elements such that:
+// a: 00 01
+// b: 02 03
+// returns
+// val[0]: 00 02
+// val[1]: 01 03
+inline uint8x8x2_t Interleave32(const uint8x8_t a, const uint8x8_t b) {
+  const uint32x2_t a_32 = vreinterpret_u32_u8(a);
+  const uint32x2_t b_32 = vreinterpret_u32_u8(b);
+  const uint32x2x2_t c = vtrn_u32(a_32, b_32);
+  const uint8x8x2_t d = {vreinterpret_u8_u32(c.val[0]),
+                         vreinterpret_u8_u32(c.val[1])};
+  return d;
+}
+
+// Swap high and low 32 bit elements.
+inline uint8x8_t Transpose32(const uint8x8_t a) {
+  const uint32x2_t b = vrev64_u32(vreinterpret_u32_u8(a));
+  return vreinterpret_u8_u32(b);
+}
+
+// Swap high and low halves.
+inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); }
+
+// Implement vtrnq_s64().
+// Input:
+// a0: 00 01 02 03 04 05 06 07
+// a1: 16 17 18 19 20 21 22 23
+// Output:
+// b0.val[0]: 00 01 02 03 16 17 18 19
+// b0.val[1]: 04 05 06 07 20 21 22 23
+inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) {
+  int16x8x2_t b0;
+  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+                           vreinterpret_s16_s32(vget_low_s32(a1)));
+  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+                           vreinterpret_s16_s32(vget_high_s32(a1)));
+  return b0;
+}
+
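+// The unsigned version of VtrnqS64().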
+inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) {
+  uint16x8x2_t b0;
+  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+                           vreinterpret_u16_u32(vget_low_u32(a1)));
+  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+                           vreinterpret_u16_u32(vget_high_u32(a1)));
+  return b0;
+}
+
+// Input:
+// 00 01 02 03
+// 10 11 12 13
+// 20 21 22 23
+// 30 31 32 33
+// Output:
+// 00 10 20 30
+// 01 11 21 31
+// 02 12 22 32
+// 03 13 23 33
+inline void Transpose4x4(uint16x4_t a[4]) {
+  // b:
+  // 00 10 02 12
+  // 01 11 03 13
+  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+  // c:
+  // 20 30 22 32
+  // 21 31 23 33
+  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+  // d:
+  // 00 10 20 30
+  // 02 12 22 32
+  const uint32x2x2_t d =
+      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+  // e:
+  // 01 11 21 31
+  // 03 13 23 33
+  const uint32x2x2_t e =
+      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+  a[0] = vreinterpret_u16_u32(d.val[0]);
+  a[1] = vreinterpret_u16_u32(e.val[0]);
+  a[2] = vreinterpret_u16_u32(d.val[1]);
+  a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+// Input:
+// a: 00 01 02 03 10 11 12 13
+// b: 20 21 22 23 30 31 32 33
+// Output:
+// Note that columns [1] and [2] are transposed.
+// a: 00 10 20 30 02 12 22 32
+// b: 01 11 21 31 03 13 23 33
+inline void Transpose4x4(uint8x8_t* a, uint8x8_t* b) {
+  const uint16x4x2_t c =
+      vtrn_u16(vreinterpret_u16_u8(*a), vreinterpret_u16_u8(*b));
+  const uint32x2x2_t d =
+      vtrn_u32(vreinterpret_u32_u16(c.val[0]), vreinterpret_u32_u16(c.val[1]));
+  const uint8x8x2_t e =
+      vtrn_u8(vreinterpret_u8_u32(d.val[0]), vreinterpret_u8_u32(d.val[1]));
+  *a = e.val[0];
+  *b = e.val[1];
+}
+
+// 4x8 Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 00 10 20 30 04 14 24 34
+// a[1]: 01 11 21 31 05 15 25 35
+// a[2]: 02 12 22 32 06 16 26 36
+// a[3]: 03 13 23 33 07 17 27 37
+inline void Transpose4x8(uint16x8_t a[4]) {
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+                                    vreinterpretq_u32_u16(b1.val[1]));
+
+  a[0] = vreinterpretq_u16_u32(c0.val[0]);
+  a[1] = vreinterpretq_u16_u32(c1.val[0]);
+  a[2] = vreinterpretq_u16_u32(c0.val[1]);
+  a[3] = vreinterpretq_u16_u32(c1.val[1]);
+}
+
+// Special transpose for loop filter.
+// 4x8 Input:
+// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 03 13 23 33 04 14 24 34  p0q0
+// a[1]: 02 12 22 32 05 15 25 35  p1q1
+// a[2]: 01 11 21 31 06 16 26 36  p2q2
+// a[3]: 00 10 20 30 07 17 27 37  p3q3
+// Direct reapplication of the function will reset the high halves, but
+// reverse the low halves:
+// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
+// a[0]: 33 32 31 30 04 05 06 07
+// a[1]: 23 22 21 20 14 15 16 17
+// a[2]: 13 12 11 10 24 25 26 27
+// a[3]: 03 02 01 00 34 35 36 37
+// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
+// reverse the high halves.
+// The standard Transpose4x8 will produce the same reversals, but with the
+// order of the low halves also restored relative to the high halves. This is
+// preferable because it puts all values from the same source row back together,
+// but some post-processing is inevitable.
+inline void LoopFilterTranspose4x8(uint16x8_t a[4]) {
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+  // Reverse odd vectors to bring the appropriate items to the front of zips.
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // r0       : 03 13 01 11 07 17 05 15
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // r1       : 23 33 21 31 27 37 25 35
+  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
+  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));
+
+  // Zip to complete the halves.
+  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
+  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
+  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
+  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
+  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vzipq_u32(r0, r1);
+
+  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
+  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
+  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
+  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
+  const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c1.val[1]);
+  // The third row of c comes first here to swap p2 with q0.
+  const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c0.val[1]);
+
+  // 8x4 Output:
+  // a[0]: 03 13 23 33 04 14 24 34  p0q0
+  // a[1]: 02 12 22 32 05 15 25 35  p1q1
+  // a[2]: 01 11 21 31 06 16 26 36  p2q2
+  // a[3]: 00 10 20 30 07 17 27 37  p3q3
+  a[0] = d1.val[0];  // p0q0
+  a[1] = d0.val[1];  // p1q1
+  a[2] = d1.val[1];  // p2q2
+  a[3] = d0.val[0];  // p3q3
+}
+
+// Reversible if the x4 values are packed next to each other.
+// x4 input / x8 output:
+// a0: 00 01 02 03 40 41 42 43
+// a1: 10 11 12 13 50 51 52 53
+// a2: 20 21 22 23 60 61 62 63
+// a3: 30 31 32 33 70 71 72 73
+// x8 input / x4 output:
+// a0: 00 10 20 30 40 50 60 70
+// a1: 01 11 21 31 41 51 61 71
+// a2: 02 12 22 32 42 52 62 72
+// a3: 03 13 23 33 43 53 63 73
+inline void Transpose8x4(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2,
+                         uint8x8_t* a3) {
+  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+  const uint16x4x2_t c0 =
+      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+  const uint16x4x2_t c1 =
+      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+  *a0 = vreinterpret_u8_u16(c0.val[0]);
+  *a1 = vreinterpret_u8_u16(c1.val[0]);
+  *a2 = vreinterpret_u8_u16(c0.val[1]);
+  *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int8x8_t a[8]) {
+  // Swap 8 bit elements. Goes from:
+  // a[0]: 00 01 02 03 04 05 06 07
+  // a[1]: 10 11 12 13 14 15 16 17
+  // a[2]: 20 21 22 23 24 25 26 27
+  // a[3]: 30 31 32 33 34 35 36 37
+  // a[4]: 40 41 42 43 44 45 46 47
+  // a[5]: 50 51 52 53 54 55 56 57
+  // a[6]: 60 61 62 63 64 65 66 67
+  // a[7]: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
+  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
+  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
+  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
+  const int8x16x2_t b0 =
+      vtrnq_s8(vcombine_s8(a[0], a[4]), vcombine_s8(a[1], a[5]));
+  const int8x16x2_t b1 =
+      vtrnq_s8(vcombine_s8(a[2], a[6]), vcombine_s8(a[3], a[7]));
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
+  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
+  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
+  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
+  const int16x8x2_t c0 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[0]),
+                                   vreinterpretq_s16_s8(b1.val[0]));
+  const int16x8x2_t c1 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[1]),
+                                   vreinterpretq_s16_s8(b1.val[1]));
+
+  // Unzip 32 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
+  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
+  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
+  const int32x4x2_t d0 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[0]),
+                                   vreinterpretq_s32_s16(c1.val[0]));
+  const int32x4x2_t d1 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[1]),
+                                   vreinterpretq_s32_s16(c1.val[1]));
+
+  a[0] = vreinterpret_s8_s32(vget_low_s32(d0.val[0]));
+  a[1] = vreinterpret_s8_s32(vget_high_s32(d0.val[0]));
+  a[2] = vreinterpret_s8_s32(vget_low_s32(d1.val[0]));
+  a[3] = vreinterpret_s8_s32(vget_high_s32(d1.val[0]));
+  a[4] = vreinterpret_s8_s32(vget_low_s32(d0.val[1]));
+  a[5] = vreinterpret_s8_s32(vget_high_s32(d0.val[1]));
+  a[6] = vreinterpret_s8_s32(vget_low_s32(d1.val[1]));
+  a[7] = vreinterpret_s8_s32(vget_high_s32(d1.val[1]));
+}
+
+// Unsigned.
+inline void Transpose8x8(uint8x8_t a[8]) {
+  const uint8x16x2_t b0 =
+      vtrnq_u8(vcombine_u8(a[0], a[4]), vcombine_u8(a[1], a[5]));
+  const uint8x16x2_t b1 =
+      vtrnq_u8(vcombine_u8(a[2], a[6]), vcombine_u8(a[3], a[7]));
+
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+
+  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c1.val[0]));
+  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c1.val[1]));
+
+  a[0] = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+  a[1] = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+  a[2] = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+  a[3] = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+  a[4] = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+  a[5] = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+  a[6] = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+  a[7] = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+inline void Transpose8x8(uint8x8_t in[8], uint8x16_t out[4]) {
+  const uint8x16x2_t a0 =
+      vtrnq_u8(vcombine_u8(in[0], in[4]), vcombine_u8(in[1], in[5]));
+  const uint8x16x2_t a1 =
+      vtrnq_u8(vcombine_u8(in[2], in[6]), vcombine_u8(in[3], in[7]));
+
+  const uint16x8x2_t b0 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[0]),
+                                    vreinterpretq_u16_u8(a1.val[0]));
+  const uint16x8x2_t b1 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[1]),
+                                    vreinterpretq_u16_u8(a1.val[1]));
+
+  const uint32x4x2_t c0 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[1]),
+                                    vreinterpretq_u32_u16(b1.val[1]));
+
+  out[0] = vreinterpretq_u8_u32(c0.val[0]);
+  out[1] = vreinterpretq_u8_u32(c1.val[0]);
+  out[2] = vreinterpretq_u8_u32(c0.val[1]);
+  out[3] = vreinterpretq_u8_u32(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int16x8_t a[8]) {
+  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+                                   vreinterpretq_s32_s16(b3.val[0]));
+  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+                                   vreinterpretq_s32_s16(b3.val[1]));
+
+  const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
+  const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
+  const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
+  const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]);
+
+  a[0] = d0.val[0];
+  a[1] = d1.val[0];
+  a[2] = d2.val[0];
+  a[3] = d3.val[0];
+  a[4] = d0.val[1];
+  a[5] = d1.val[1];
+  a[6] = d2.val[1];
+  a[7] = d3.val[1];
+}
+
+// Unsigned.
+inline void Transpose8x8(uint16x8_t a[8]) {
+  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+  const uint16x8x2_t b2 = vtrnq_u16(a[4], a[5]);
+  const uint16x8x2_t b3 = vtrnq_u16(a[6], a[7]);
+
+  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+                                    vreinterpretq_u32_u16(b1.val[1]));
+  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+                                    vreinterpretq_u32_u16(b3.val[0]));
+  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+                                    vreinterpretq_u32_u16(b3.val[1]));
+
+  const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c2.val[0]);
+  const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c3.val[0]);
+  const uint16x8x2_t d2 = VtrnqU64(c0.val[1], c2.val[1]);
+  const uint16x8x2_t d3 = VtrnqU64(c1.val[1], c3.val[1]);
+
+  a[0] = d0.val[0];
+  a[1] = d1.val[0];
+  a[2] = d2.val[0];
+  a[3] = d3.val[0];
+  a[4] = d0.val[1];
+  a[5] = d1.val[1];
+  a[6] = d2.val[1];
+  a[7] = d3.val[1];
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07  80 81 82 83 84 85 86 87
+// a[1]: 10 11 12 13 14 15 16 17  90 91 92 93 94 95 96 97
+// a[2]: 20 21 22 23 24 25 26 27  a0 a1 a2 a3 a4 a5 a6 a7
+// a[3]: 30 31 32 33 34 35 36 37  b0 b1 b2 b3 b4 b5 b6 b7
+// a[4]: 40 41 42 43 44 45 46 47  c0 c1 c2 c3 c4 c5 c6 c7
+// a[5]: 50 51 52 53 54 55 56 57  d0 d1 d2 d3 d4 d5 d6 d7
+// a[6]: 60 61 62 63 64 65 66 67  e0 e1 e2 e3 e4 e5 e6 e7
+// a[7]: 70 71 72 73 74 75 76 77  f0 f1 f2 f3 f4 f5 f6 f7
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70  80 90 a0 b0 c0 d0 e0 f0
+// a[1]: 01 11 21 31 41 51 61 71  81 91 a1 b1 c1 d1 e1 f1
+// a[2]: 02 12 22 32 42 52 62 72  82 92 a2 b2 c2 d2 e2 f2
+// a[3]: 03 13 23 33 43 53 63 73  83 93 a3 b3 c3 d3 e3 f3
+// a[4]: 04 14 24 34 44 54 64 74  84 94 a4 b4 c4 d4 e4 f4
+// a[5]: 05 15 25 35 45 55 65 75  85 95 a5 b5 c5 d5 e5 f5
+// a[6]: 06 16 26 36 46 56 66 76  86 96 a6 b6 c6 d6 e6 f6
+// a[7]: 07 17 27 37 47 57 67 77  87 97 a7 b7 c7 d7 e7 f7
+inline void Transpose8x16(uint8x16_t a[8]) {
+  // b0.val[0]: 00 10 02 12 04 14 06 16  80 90 82 92 84 94 86 96
+  // b0.val[1]: 01 11 03 13 05 15 07 17  81 91 83 93 85 95 87 97
+  // b1.val[0]: 20 30 22 32 24 34 26 36  a0 b0 a2 b2 a4 b4 a6 b6
+  // b1.val[1]: 21 31 23 33 25 35 27 37  a1 b1 a3 b3 a5 b5 a7 b7
+  // b2.val[0]: 40 50 42 52 44 54 46 56  c0 d0 c2 d2 c4 d4 c6 d6
+  // b2.val[1]: 41 51 43 53 45 55 47 57  c1 d1 c3 d3 c5 d5 c7 d7
+  // b3.val[0]: 60 70 62 72 64 74 66 76  e0 f0 e2 f2 e4 f4 e6 f6
+  // b3.val[1]: 61 71 63 73 65 75 67 77  e1 f1 e3 f3 e5 f5 e7 f7
+  const uint8x16x2_t b0 = vtrnq_u8(a[0], a[1]);
+  const uint8x16x2_t b1 = vtrnq_u8(a[2], a[3]);
+  const uint8x16x2_t b2 = vtrnq_u8(a[4], a[5]);
+  const uint8x16x2_t b3 = vtrnq_u8(a[6], a[7]);
+
+  // c0.val[0]: 00 10 20 30 04 14 24 34  80 90 a0 b0 84 94 a4 b4
+  // c0.val[1]: 02 12 22 32 06 16 26 36  82 92 a2 b2 86 96 a6 b6
+  // c1.val[0]: 01 11 21 31 05 15 25 35  81 91 a1 b1 85 95 a5 b5
+  // c1.val[1]: 03 13 23 33 07 17 27 37  83 93 a3 b3 87 97 a7 b7
+  // c2.val[0]: 40 50 60 70 44 54 64 74  c0 d0 e0 f0 c4 d4 e4 f4
+  // c2.val[1]: 42 52 62 72 46 56 66 76  c2 d2 e2 f2 c6 d6 e6 f6
+  // c3.val[0]: 41 51 61 71 45 55 65 75  c1 d1 e1 f1 c5 d5 e5 f5
+  // c3.val[1]: 43 53 63 73 47 57 67 77  c3 d3 e3 f3 c7 d7 e7 f7
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+                                    vreinterpretq_u16_u8(b3.val[0]));
+  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+                                    vreinterpretq_u16_u8(b3.val[1]));
+
+  // d0.val[0]: 00 10 20 30 40 50 60 70  80 90 a0 b0 c0 d0 e0 f0
+  // d0.val[1]: 04 14 24 34 44 54 64 74  84 94 a4 b4 c4 d4 e4 f4
+  // d1.val[0]: 01 11 21 31 41 51 61 71  81 91 a1 b1 c1 d1 e1 f1
+  // d1.val[1]: 05 15 25 35 45 55 65 75  85 95 a5 b5 c5 d5 e5 f5
+  // d2.val[0]: 02 12 22 32 42 52 62 72  82 92 a2 b2 c2 d2 e2 f2
+  // d2.val[1]: 06 16 26 36 46 56 66 76  86 96 a6 b6 c6 d6 e6 f6
+  // d3.val[0]: 03 13 23 33 43 53 63 73  83 93 a3 b3 c3 d3 e3 f3
+  // d3.val[1]: 07 17 27 37 47 57 67 77  87 97 a7 b7 c7 d7 e7 f7
+  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c2.val[0]));
+  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+                                    vreinterpretq_u32_u16(c3.val[0]));
+  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c2.val[1]));
+  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+                                    vreinterpretq_u32_u16(c3.val[1]));
+
+  a[0] = vreinterpretq_u8_u32(d0.val[0]);
+  a[1] = vreinterpretq_u8_u32(d1.val[0]);
+  a[2] = vreinterpretq_u8_u32(d2.val[0]);
+  a[3] = vreinterpretq_u8_u32(d3.val[0]);
+  a[4] = vreinterpretq_u8_u32(d0.val[1]);
+  a[5] = vreinterpretq_u8_u32(d1.val[1]);
+  a[6] = vreinterpretq_u8_u32(d2.val[1]);
+  a[7] = vreinterpretq_u8_u32(d3.val[1]);
+}
+
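+// Zero extend 8-bit unsigned values to 16 bits, returned as signed.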
+inline int16x8_t ZeroExtend(const uint8x8_t in) {
+  return vreinterpretq_s16_u16(vmovl_u8(in));
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_ENABLE_NEON
+#endif  // LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
diff --git a/src/dsp/arm/common_neon_test.cc b/src/dsp/arm/common_neon_test.cc
new file mode 100644 (file)
index 0000000..03aed19
--- /dev/null
@@ -0,0 +1,208 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/arm/common_neon.h"
+
+#include "gtest/gtest.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <cstdint>
+
+#include "tests/block_utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockWidth = 16;
+constexpr int kMaxBlockHeight = 16;
+
+template <typename Pixel>
+class TransposeTest : public testing::Test {
+ public:
+  TransposeTest() {
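+    // Fill the source with y * 16 + x so each value encodes its (row, col)
+    // position in its two hex nibbles; the expected output encodes the
+    // transposed position.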
+    for (int y = 0; y < kMaxBlockHeight; ++y) {
+      for (int x = 0; x < kMaxBlockWidth; ++x) {
+        src_block_[y][x] = y * 16 + x;
+        expected_transpose_[y][x] = x * 16 + y;
+      }
+    }
+  }
+
+  TransposeTest(const TransposeTest&) = delete;
+  TransposeTest& operator=(const TransposeTest&) = delete;
+  ~TransposeTest() override = default;
+
+ protected:
+  Pixel src_block_[kMaxBlockHeight][kMaxBlockWidth];
+  Pixel expected_transpose_[kMaxBlockHeight][kMaxBlockWidth];
+};
+
+using TransposeTestLowBitdepth = TransposeTest<uint8_t>;
+
+TEST_F(TransposeTestLowBitdepth, Transpose4x4Test) {
+  uint8x8_t a = Load4<1>(src_block_[1], Load4(src_block_[0]));
+  uint8x8_t b = Load4<1>(src_block_[3], Load4(src_block_[2]));
+  Transpose4x4(&a, &b);
+  uint8_t output_4x4[4][4];
+  StoreLo4(output_4x4[0], a);
+  StoreLo4(output_4x4[1], b);
+  StoreHi4(output_4x4[2], a);
+  StoreHi4(output_4x4[3], b);
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_4x4[0],
+                                        4, 4, kMaxBlockWidth, 4, false));
+}
+
+TEST_F(TransposeTestLowBitdepth, Transpose8x4Test) {
+  uint8x8_t a0 = Load4<1>(src_block_[4], Load4(src_block_[0]));
+  uint8x8_t a1 = Load4<1>(src_block_[5], Load4(src_block_[1]));
+  uint8x8_t a2 = Load4<1>(src_block_[6], Load4(src_block_[2]));
+  uint8x8_t a3 = Load4<1>(src_block_[7], Load4(src_block_[3]));
+  Transpose8x4(&a0, &a1, &a2, &a3);
+  uint8_t output_8x4[4][8];
+  vst1_u8(output_8x4[0], a0);
+  vst1_u8(output_8x4[1], a1);
+  vst1_u8(output_8x4[2], a2);
+  vst1_u8(output_8x4[3], a3);
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x4[0],
+                                        8, 4, kMaxBlockWidth, 8, false));
+}
+
+TEST_F(TransposeTestLowBitdepth, Transpose8x8Test) {
+  uint8x8_t input_8x8[8];
+  for (int i = 0; i < 8; ++i) {
+    input_8x8[i] = vld1_u8(src_block_[i]);
+  }
+  Transpose8x8(input_8x8);
+  uint8_t output_8x8[8][8];
+  for (int i = 0; i < 8; ++i) {
+    vst1_u8(output_8x8[i], input_8x8[i]);
+  }
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x8[0],
+                                        8, 8, kMaxBlockWidth, 8, false));
+}
+
+TEST_F(TransposeTestLowBitdepth, Transpose8x16Test) {
+  uint8x16_t input_8x16[8];
+  for (int i = 0; i < 8; ++i) {
+    input_8x16[i] =
+        vcombine_u8(vld1_u8(src_block_[i]), vld1_u8(src_block_[i + 8]));
+  }
+  Transpose8x16(input_8x16);
+  uint8_t output_16x8[8][16];
+  for (int i = 0; i < 8; ++i) {
+    vst1q_u8(output_16x8[i], input_8x16[i]);
+  }
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_16x8[0],
+                                        16, 8, kMaxBlockWidth, 16, false));
+}
+
+using TransposeTestHighBitdepth = TransposeTest<uint16_t>;
+
+TEST_F(TransposeTestHighBitdepth, Transpose4x4Test) {
+  uint16x4_t input_4x4[4];
+  input_4x4[0] = vld1_u16(src_block_[0]);
+  input_4x4[1] = vld1_u16(src_block_[1]);
+  input_4x4[2] = vld1_u16(src_block_[2]);
+  input_4x4[3] = vld1_u16(src_block_[3]);
+  Transpose4x4(input_4x4);
+  uint16_t output_4x4[4][4];
+  for (int i = 0; i < 4; ++i) {
+    vst1_u16(output_4x4[i], input_4x4[i]);
+  }
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_4x4[0],
+                                        4, 4, kMaxBlockWidth, 4, false));
+}
+
+TEST_F(TransposeTestHighBitdepth, Transpose4x8Test) {
+  uint16x8_t input_4x8[4];
+  for (int i = 0; i < 4; ++i) {
+    input_4x8[i] = vld1q_u16(src_block_[i]);
+  }
+  Transpose4x8(input_4x8);
+  uint16_t output_4x8[4][8];
+  for (int i = 0; i < 4; ++i) {
+    vst1q_u16(output_4x8[i], input_4x8[i]);
+    memcpy(&expected_transpose_[i][4], &expected_transpose_[i + 4][0],
+           4 * sizeof(expected_transpose_[0][0]));
+  }
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_4x8[0],
+                                        8, 4, kMaxBlockWidth, 8, false));
+}
+
+TEST_F(TransposeTestHighBitdepth, LoopFilterTranspose4x8Test) {
+  uint16x8_t input_4x8[4];
+  for (int i = 0; i < 4; ++i) {
+    input_4x8[i] = vld1q_u16(src_block_[i]);
+  }
+  LoopFilterTranspose4x8(input_4x8);
+  uint16_t output_4x8[4][8];
+  for (int i = 0; i < 4; ++i) {
+    vst1q_u16(output_4x8[i], input_4x8[i]);
+  }
+  // a[0]: 03 13 23 33 04 14 24 34  p0q0
+  // a[1]: 02 12 22 32 05 15 25 35  p1q1
+  // a[2]: 01 11 21 31 06 16 26 36  p2q2
+  // a[3]: 00 10 20 30 07 17 27 37  p3q3
+  static constexpr uint16_t expected_output[4][8] = {
+      {0x03, 0x13, 0x23, 0x33, 0x04, 0x14, 0x24, 0x34},
+      {0x02, 0x12, 0x22, 0x32, 0x05, 0x15, 0x25, 0x35},
+      {0x01, 0x11, 0x21, 0x31, 0x06, 0x16, 0x26, 0x36},
+      {0x00, 0x10, 0x20, 0x30, 0x07, 0x17, 0x27, 0x37},
+  };
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_output[0], output_4x8[0], 8, 4,
+                                        8, 8, false));
+}
+
+TEST_F(TransposeTestHighBitdepth, Transpose8x8Test) {
+  uint16x8_t input_8x8[8];
+  for (int i = 0; i < 8; ++i) {
+    input_8x8[i] = vld1q_u16(src_block_[i]);
+  }
+  Transpose8x8(input_8x8);
+  uint16_t output_8x8[8][8];
+  for (int i = 0; i < 8; ++i) {
+    vst1q_u16(output_8x8[i], input_8x8[i]);
+  }
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x8[0],
+                                        8, 8, kMaxBlockWidth, 8, false));
+}
+
+TEST_F(TransposeTestHighBitdepth, Transpose8x8SignedTest) {
+  int16x8_t input_8x8[8];
+  for (int i = 0; i < 8; ++i) {
+    input_8x8[i] = vreinterpretq_s16_u16(vld1q_u16(src_block_[i]));
+  }
+  Transpose8x8(input_8x8);
+  uint16_t output_8x8[8][8];
+  for (int i = 0; i < 8; ++i) {
+    vst1q_u16(output_8x8[i], vreinterpretq_u16_s16(input_8x8[i]));
+  }
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x8[0],
+                                        8, 8, kMaxBlockWidth, 8, false));
+}
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_ENABLE_NEON
+
+TEST(CommonDspTest, NEON) {
+  GTEST_SKIP()
+      << "Build this module for Arm with NEON enabled to enable the tests.";
+}
+
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/convolve_10bit_neon.cc b/src/dsp/arm/convolve_10bit_neon.cc
new file mode 100644 (file)
index 0000000..1aa0cc7
--- /dev/null
@@ -0,0 +1,3000 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/convolve.inc"
+
+// Output of ConvolveTest.ShowRange below.
+// Bitdepth: 10 Input range:            [       0,     1023]
+//   Horizontal base upscaled range:    [  -28644,    94116]
+//   Horizontal halved upscaled range:  [  -14322,    47085]
+//   Horizontal downscaled range:       [   -7161,    23529]
+//   Vertical upscaled range:           [-1317624,  2365176]
+//   Pixel output range:                [       0,     1023]
+//   Compound output range:             [    3988,    61532]
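+// The vertical upscaled range exceeds int16_t, which is why the filter sums
+// below are accumulated in 32-bit lanes.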
+
+template <int num_taps>
+int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
+                           const int16x4_t* const taps) {
+  const auto* ssrc = reinterpret_cast<const int16x8_t*>(src);
+  int32x4x2_t sum;
+  if (num_taps == 6) {
+    // 6 taps.
+    sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]);
+
+    sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
+  } else if (num_taps == 8) {
+    // 8 taps.
+    sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[6]), taps[6]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[7]), taps[7]);
+
+    sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]);
+  } else if (num_taps == 2) {
+    // 2 taps.
+    sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+
+    sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+  } else {
+    // 4 taps.
+    sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
+
+    sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
+  }
+  return sum;
+}
+
+template <int num_taps>
+int32x4_t SumOnePassTaps(const uint16x4_t* const src,
+                         const int16x4_t* const taps) {
+  const auto* ssrc = reinterpret_cast<const int16x4_t*>(src);
+  int32x4_t sum;
+  if (num_taps == 6) {
+    // 6 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+    sum = vmlal_s16(sum, ssrc[2], taps[2]);
+    sum = vmlal_s16(sum, ssrc[3], taps[3]);
+    sum = vmlal_s16(sum, ssrc[4], taps[4]);
+    sum = vmlal_s16(sum, ssrc[5], taps[5]);
+  } else if (num_taps == 8) {
+    // 8 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+    sum = vmlal_s16(sum, ssrc[2], taps[2]);
+    sum = vmlal_s16(sum, ssrc[3], taps[3]);
+    sum = vmlal_s16(sum, ssrc[4], taps[4]);
+    sum = vmlal_s16(sum, ssrc[5], taps[5]);
+    sum = vmlal_s16(sum, ssrc[6], taps[6]);
+    sum = vmlal_s16(sum, ssrc[7], taps[7]);
+  } else if (num_taps == 2) {
+    // 2 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+  } else {
+    // 4 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+    sum = vmlal_s16(sum, ssrc[2], taps[2]);
+    sum = vmlal_s16(sum, ssrc[3], taps[3]);
+  }
+  return sum;
+}
+
+template <int num_taps, bool is_compound, bool is_2d>
+void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
+                                 const ptrdiff_t src_stride,
+                                 void* LIBGAV1_RESTRICT const dest,
+                                 const ptrdiff_t pred_stride, const int width,
+                                 const int height,
+                                 const int16x4_t* const v_tap) {
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  if (is_2d) {
+    int x = 0;
+    do {
+      const uint16_t* s = src + x;
+      int y = height;
+      do {  // Increasing loop counter x is better.
+        const uint16x8_t src_long = vld1q_u16(s);
+        const uint16x8_t src_long_hi = vld1q_u16(s + 8);
+        uint16x8_t v_src[8];
+        int32x4x2_t v_sum;
+        if (num_taps == 6) {
+          v_src[0] = src_long;
+          v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+          v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+          v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+          v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+          v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+          v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1);
+        } else if (num_taps == 8) {
+          v_src[0] = src_long;
+          v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+          v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+          v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+          v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+          v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+          v_src[6] = vextq_u16(src_long, src_long_hi, 6);
+          v_src[7] = vextq_u16(src_long, src_long_hi, 7);
+          v_sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+        } else if (num_taps == 2) {
+          v_src[0] = src_long;
+          v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+          v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+        } else {  // 4 taps
+          v_src[0] = src_long;
+          v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+          v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+          v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+          v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
+        }
+
+        const int16x4_t d0 =
+            vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
+        const int16x4_t d1 =
+            vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
+        vst1_u16(&dest16[0], vreinterpret_u16_s16(d0));
+        vst1_u16(&dest16[4], vreinterpret_u16_s16(d1));
+        s += src_stride;
+        dest16 += 8;
+      } while (--y != 0);
+      x += 8;
+    } while (x < width);
+    return;
+  }
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const uint16x8_t src_long = vld1q_u16(src + x);
+      const uint16x8_t src_long_hi = vld1q_u16(src + x + 8);
+      uint16x8_t v_src[8];
+      int32x4x2_t v_sum;
+      if (num_taps == 6) {
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+        v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+        v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+        v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+        v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1);
+      } else if (num_taps == 8) {
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+        v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+        v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+        v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+        v_src[6] = vextq_u16(src_long, src_long_hi, 6);
+        v_src[7] = vextq_u16(src_long, src_long_hi, 7);
+        v_sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+      } else if (num_taps == 2) {
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+      } else {  // 4 taps
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+        v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+        v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
+      }
+      if (is_compound) {
+        const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
+        const int16x4_t d0 =
+            vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
+        const int16x4_t d1 =
+            vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
+        vst1_u16(&dest16[x],
+                 vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset)));
+        vst1_u16(&dest16[x + 4],
+                 vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset)));
+      } else {
+        // Normally the Horizontal pass does the downshift in two passes:
+        // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+        // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+        // Combining them requires adding the rounding offset from the skipped
+        // shift.
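+        // Concretely, with r1 = kInterRoundBitsHorizontal - 1 and
+        // r2 = kFilterBits - kInterRoundBitsHorizontal, two chained rounding
+        // shifts reduce exactly to a single one:
+        //   (x + (1 << (r1 - 1)) + (1 << (r1 + r2 - 1))) >> (r1 + r2).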
+        const int32x4_t v_first_shift_rounding_bit =
+            vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+        v_sum.val[0] = vaddq_s32(v_sum.val[0], v_first_shift_rounding_bit);
+        v_sum.val[1] = vaddq_s32(v_sum.val[1], v_first_shift_rounding_bit);
+        const uint16x4_t d0 = vmin_u16(
+            vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth);
+        const uint16x4_t d1 = vmin_u16(
+            vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth);
+        vst1_u16(&dest16[x], d0);
+        vst1_u16(&dest16[x + 4], d1);
+      }
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    dest16 += pred_stride;
+  } while (--y != 0);
+}
+
+template <int num_taps, bool is_compound, bool is_2d>
+void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const int16x4_t* const v_tap) {
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  int y = height;
+  do {
+    const uint16x8_t v_zero = vdupq_n_u16(0);
+    uint16x4_t v_src[4];
+    int32x4_t v_sum;
+    const uint16x8_t src_long = vld1q_u16(src);
+    v_src[0] = vget_low_u16(src_long);
+    if (num_taps == 2) {
+      v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
+      v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+    } else {
+      v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
+      v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2));
+      v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3));
+      v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
+    }
+    if (is_compound || is_2d) {
+      const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
+      if (is_compound && !is_2d) {
+        vst1_u16(&dest16[0], vreinterpret_u16_s16(
+                                 vadd_s16(d0, vdup_n_s16(kCompoundOffset))));
+      } else {
+        vst1_u16(&dest16[0], vreinterpret_u16_s16(d0));
+      }
+    } else {
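+      // As in FilterHorizontalWidth8AndUp, the two horizontal rounding shifts
+      // are combined into one by adding the rounding offset from the skipped
+      // shift.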
+      const int32x4_t v_first_shift_rounding_bit =
+          vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+      v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit);
+      const uint16x4_t d0 =
+          vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+      vst1_u16(&dest16[0], d0);
+    }
+    src += src_stride;
+    dest16 += pred_stride;
+  } while (--y != 0);
+}
+
+template <int num_taps, bool is_2d>
+void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const int16x4_t* const v_tap) {
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  int y = height >> 1;
+  do {
+    const int16x8_t v_zero = vdupq_n_s16(0);
+    const int16x8_t input0 = vreinterpretq_s16_u16(vld1q_u16(src));
+    const int16x8_t input1 = vreinterpretq_s16_u16(vld1q_u16(src + src_stride));
+    const int16x8x2_t input = vzipq_s16(input0, input1);
+    int32x4_t v_sum;
+    if (num_taps == 2) {
+      v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]);
+      v_sum = vmlal_s16(v_sum,
+                        vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)),
+                        v_tap[4]);
+    } else {
+      v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[2]);
+      v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 2)),
+                        v_tap[3]);
+      v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 4)),
+                        v_tap[4]);
+      v_sum = vmlal_s16(v_sum,
+                        vget_low_s16(vextq_s16(input.val[0], input.val[1], 6)),
+                        v_tap[5]);
+    }
+    if (is_2d) {
+      const uint16x4_t d0 = vreinterpret_u16_s16(
+          vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1));
+      dest16[0] = vget_lane_u16(d0, 0);
+      dest16[1] = vget_lane_u16(d0, 2);
+      dest16 += pred_stride;
+      dest16[0] = vget_lane_u16(d0, 1);
+      dest16[1] = vget_lane_u16(d0, 3);
+      dest16 += pred_stride;
+    } else {
+      // Normally the Horizontal pass does the downshift in two passes:
+      // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+      // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+      // Combining them requires adding the rounding offset from the skipped
+      // shift.
+      const int32x4_t v_first_shift_rounding_bit =
+          vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+      v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit);
+      const uint16x4_t d0 =
+          vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+      dest16[0] = vget_lane_u16(d0, 0);
+      dest16[1] = vget_lane_u16(d0, 2);
+      dest16 += pred_stride;
+      dest16[0] = vget_lane_u16(d0, 1);
+      dest16[1] = vget_lane_u16(d0, 3);
+      dest16 += pred_stride;
+    }
+    src += src_stride << 1;
+  } while (--y != 0);
+
+  // The 2d filters have an odd |height| because the horizontal pass
+  // generates context for the vertical pass.
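+  // E.g. a 4 tap vertical filter over a 4 row block is passed
+  // |height| == 4 + 4 - 1 == 7; the paired loop above covers 6 rows and this
+  // tail covers the 7th.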
+  if (is_2d) {
+    assert(height % 2 == 1);
+    const int16x8_t input = vreinterpretq_s16_u16(vld1q_u16(src));
+    int32x4_t v_sum;
+    if (num_taps == 2) {
+      v_sum = vmull_s16(vget_low_s16(input), v_tap[3]);
+      v_sum =
+          vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[4]);
+    } else {
+      v_sum = vmull_s16(vget_low_s16(input), v_tap[2]);
+      v_sum =
+          vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[3]);
+      v_sum =
+          vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 2)), v_tap[4]);
+      v_sum =
+          vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 3)), v_tap[5]);
+    }
+    const uint16x4_t d0 = vreinterpret_u16_s16(
+        vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1));
+    Store2<0>(dest16, d0);
+  }
+}
+
+template <int num_taps, bool is_compound, bool is_2d>
+void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const int16x4_t* const v_tap) {
+  // The horizontal pass only needs to account for 2 and 4 taps when
+  // |width| <= 4.
+  assert(width <= 4);
+  assert(num_taps == 2 || num_taps == 4);
+  if (num_taps == 2 || num_taps == 4) {
+    if (width == 2 && !is_compound) {
+      FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest,
+                                              pred_stride, height, v_tap);
+      return;
+    }
+    assert(width == 4);
+    FilterHorizontalWidth4<num_taps, is_compound, is_2d>(
+        src, src_stride, dest, pred_stride, height, v_tap);
+  } else {
+    assert(false);
+  }
+}
+
+template <bool is_compound = false, bool is_2d = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
+  // Duplicate each signed tap across a vector. The signed multiply-
+  // accumulates in SumOnePassTaps() handle negative taps directly, so no
+  // sign correction is needed.
+  int16x4_t v_tap[kSubPixelTaps];
+  assert(filter_id != 0);
+
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    v_tap[k] = vdup_n_s16(kHalfSubPixelFilters[filter_index][filter_id][k]);
+  }
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is [0, 3].
+  // When width <= 4, the valid filter indices are 3 (2 tap), 4 and 5 (4 tap).
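+  // (GetFilterIndex() in the common convolve code maps indices 0 and 2 to 4
+  // and index 1 to 5 when the dimension is <= 4, and leaves index 3
+  // unchanged.)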
+  if (width >= 8) {
+    if (filter_index == 2) {  // 8 tap.
+      FilterHorizontalWidth8AndUp<8, is_compound, is_2d>(
+          src, src_stride, dst, dst_stride, width, height, v_tap);
+    } else if (filter_index < 2) {  // 6 tap.
+      FilterHorizontalWidth8AndUp<6, is_compound, is_2d>(
+          src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+    } else {  // 2 tap.
+      assert(filter_index == 3);
+      FilterHorizontalWidth8AndUp<2, is_compound, is_2d>(
+          src + 3, src_stride, dst, dst_stride, width, height, v_tap);
+    }
+  } else {
+    if ((filter_index & 0x4) != 0) {  // 4 tap.
+      // ((filter_index == 4) | (filter_index == 5))
+      FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst,
+                                              dst_stride, width, height, v_tap);
+    } else {  // 2 tap.
+      assert(filter_index == 3);
+      FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst,
+                                              dst_stride, width, height, v_tap);
+    }
+  }
+}
+
+void ConvolveHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* const src =
+      static_cast<const uint16_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint16_t*>(prediction);
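+  // |reference_stride| and |pred_stride| are byte strides; shift right by 1
+  // to convert them to uint16_t pixel strides.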
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const ptrdiff_t dst_stride = pred_stride >> 1;
+
+  DoHorizontalPass(src, src_stride, dest, dst_stride, width, height,
+                   horizontal_filter_id, filter_index);
+}
+
+void ConvolveCompoundHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* const src =
+      static_cast<const uint16_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+
+  DoHorizontalPass</*is_compound=*/true>(src, src_stride, dest, width, width,
+                                         height, horizontal_filter_id,
+                                         filter_index);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t src_stride,
+                    void* LIBGAV1_RESTRICT const dst,
+                    const ptrdiff_t dst_stride, const int width,
+                    const int height, const int16x4_t* const taps) {
+  const int next_row = num_taps - 1;
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 8);
+
+  int x = 0;
+  do {
+    const uint16_t* src_x = src + x;
+    uint16x8_t srcs[8];
+    srcs[0] = vld1q_u16(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = vld1q_u16(src_x);
+      src_x += src_stride;
+      srcs[2] = vld1q_u16(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = vld1q_u16(src_x);
+        src_x += src_stride;
+        srcs[4] = vld1q_u16(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = vld1q_u16(src_x);
+          src_x += src_stride;
+          srcs[6] = vld1q_u16(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    // Decreasing the y loop counter produces worse code with clang.
+    // Don't unroll this loop since it generates too much code and the decoder
+    // is even slower.
+    int y = 0;
+    do {
+      srcs[next_row] = vld1q_u16(src_x);
+      src_x += src_stride;
+
+      const int32x4x2_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
+      if (is_compound) {
+        const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
+        const int16x4_t d0 =
+            vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
+        const int16x4_t d1 =
+            vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
+        vst1_u16(dst16 + x + y * dst_stride,
+                 vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset)));
+        vst1_u16(dst16 + x + 4 + y * dst_stride,
+                 vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset)));
+      } else {
+        const uint16x4_t d0 = vmin_u16(
+            vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth);
+        const uint16x4_t d1 = vmin_u16(
+            vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth);
+        vst1_u16(dst16 + x + y * dst_stride, d0);
+        vst1_u16(dst16 + x + 4 + y * dst_stride, d1);
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int height,
+                       const int16x4_t* const taps) {
+  const int next_row = num_taps - 1;
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  uint16x4_t srcs[9];
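+  // Two output rows are computed per iteration, so |num_taps| + 1 source
+  // rows are kept in flight; hence the extra element in |srcs|.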
+  srcs[0] = vld1_u16(src);
+  src += src_stride;
+  if (num_taps >= 4) {
+    srcs[1] = vld1_u16(src);
+    src += src_stride;
+    srcs[2] = vld1_u16(src);
+    src += src_stride;
+    if (num_taps >= 6) {
+      srcs[3] = vld1_u16(src);
+      src += src_stride;
+      srcs[4] = vld1_u16(src);
+      src += src_stride;
+      if (num_taps == 8) {
+        srcs[5] = vld1_u16(src);
+        src += src_stride;
+        srcs[6] = vld1_u16(src);
+        src += src_stride;
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row] = vld1_u16(src);
+    src += src_stride;
+    srcs[num_taps] = vld1_u16(src);
+    src += src_stride;
+
+    const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
+    const int32x4_t v_sum_1 = SumOnePassTaps<num_taps>(srcs + 1, taps);
+    if (is_compound) {
+      const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
+      const int16x4_t d1 =
+          vqrshrn_n_s32(v_sum_1, kInterRoundBitsHorizontal - 1);
+      vst1_u16(dst16,
+               vreinterpret_u16_s16(vadd_s16(d0, vdup_n_s16(kCompoundOffset))));
+      dst16 += dst_stride;
+      vst1_u16(dst16,
+               vreinterpret_u16_s16(vadd_s16(d1, vdup_n_s16(kCompoundOffset))));
+      dst16 += dst_stride;
+    } else {
+      const uint16x4_t d0 =
+          vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+      const uint16x4_t d1 =
+          vmin_u16(vqrshrun_n_s32(v_sum_1, kFilterBits - 1), v_max_bitdepth);
+      vst1_u16(dst16, d0);
+      dst16 += dst_stride;
+      vst1_u16(dst16, d1);
+      dst16 += dst_stride;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int num_taps>
+void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int height,
+                       const int16x4_t* const taps) {
+  const int next_row = num_taps - 1;
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+  const uint16x4_t v_zero = vdup_n_u16(0);
+
+  uint16x4_t srcs[9];
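+  // Each uint16x4_t packs two 2 pixel rows: Load2<0>() fills lanes 0-1,
+  // Load2<1>() fills lanes 2-3, and vext_u16(a, b, 2) forms the vector that
+  // holds the intermediate pair of rows.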
+  srcs[0] = Load2<0>(src, v_zero);
+  src += src_stride;
+  if (num_taps >= 4) {
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load2<0>(src, v_zero);
+    src += src_stride;
+    srcs[1] = vext_u16(srcs[0], srcs[2], 2);
+    if (num_taps >= 6) {
+      srcs[2] = Load2<1>(src, srcs[2]);
+      src += src_stride;
+      srcs[4] = Load2<0>(src, v_zero);
+      src += src_stride;
+      srcs[3] = vext_u16(srcs[2], srcs[4], 2);
+      if (num_taps == 8) {
+        srcs[4] = Load2<1>(src, srcs[4]);
+        src += src_stride;
+        srcs[6] = Load2<0>(src, v_zero);
+        src += src_stride;
+        srcs[5] = vext_u16(srcs[4], srcs[6], 2);
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row - 1] = Load2<1>(src, srcs[next_row - 1]);
+    src += src_stride;
+    srcs[num_taps] = Load2<0>(src, v_zero);
+    src += src_stride;
+    srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2);
+
+    const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
+    const uint16x4_t d0 =
+        vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+    Store2<0>(dst16, d0);
+    dst16 += dst_stride;
+    Store2<1>(dst16, d0);
+    dst16 += dst_stride;
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int num_taps, bool is_compound>
+int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
+                                  const int16x8_t taps) {
+  const int16x4_t taps_lo = vget_low_s16(taps);
+  const int16x4_t taps_hi = vget_high_s16(taps);
+  int32x4_t sum_lo, sum_hi;
+  if (num_taps == 8) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
+  } else if (num_taps == 6) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
+  } else if (num_taps == 4) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
+  } else if (num_taps == 2) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
+  }
+
+  if (is_compound) {
+    // Output is compound, so leave signed and do not saturate. Offset will
+    // accurately bring the value back into positive range.
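+    // (Callers then add kCompoundOffset with vadd(q)_s16() and reinterpret
+    // the sum as uint16_t, recentering the signed value into unsigned
+    // range.)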
+    return vcombine_s16(
+        vrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        vrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
+  }
+
+  // Output is pixel, so saturate to clip at 0.
+  return vreinterpretq_s16_u16(
+      vcombine_u16(vqrshrun_n_s32(sum_lo, kInterRoundBitsVertical - 1),
+                   vqrshrun_n_s32(sum_hi, kInterRoundBitsVertical - 1)));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVerticalWidth8AndUp(const int16_t* LIBGAV1_RESTRICT src,
+                                 void* LIBGAV1_RESTRICT const dst,
+                                 const ptrdiff_t dst_stride, const int width,
+                                 const int height, const int16x8_t taps) {
+  assert(width >= 8);
+  constexpr int next_row = num_taps - 1;
+  const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
+
+  int x = 0;
+  do {
+    int16x8_t srcs[9];
+    srcs[0] = vld1q_s16(src);
+    src += 8;
+    if (num_taps >= 4) {
+      srcs[1] = vld1q_s16(src);
+      src += 8;
+      srcs[2] = vld1q_s16(src);
+      src += 8;
+      if (num_taps >= 6) {
+        srcs[3] = vld1q_s16(src);
+        src += 8;
+        srcs[4] = vld1q_s16(src);
+        src += 8;
+        if (num_taps == 8) {
+          srcs[5] = vld1q_s16(src);
+          src += 8;
+          srcs[6] = vld1q_s16(src);
+          src += 8;
+        }
+      }
+    }
+
+    uint16_t* d16 = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = vld1q_s16(src);
+      src += 8;
+      srcs[next_row + 1] = vld1q_s16(src);
+      src += 8;
+      const int16x8_t sum0 =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
+      const int16x8_t sum1 =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
+      if (is_compound) {
+        const int16x8_t v_compound_offset = vdupq_n_s16(kCompoundOffset);
+        vst1q_u16(d16,
+                  vreinterpretq_u16_s16(vaddq_s16(sum0, v_compound_offset)));
+        d16 += dst_stride;
+        vst1q_u16(d16,
+                  vreinterpretq_u16_s16(vaddq_s16(sum1, v_compound_offset)));
+        d16 += dst_stride;
+      } else {
+        vst1q_u16(d16, vminq_u16(vreinterpretq_u16_s16(sum0), v_max_bitdepth));
+        d16 += dst_stride;
+        vst1q_u16(d16, vminq_u16(vreinterpretq_u16_s16(sum1), v_max_bitdepth));
+        d16 += dst_stride;
+      }
+      srcs[0] = srcs[2];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[3];
+        srcs[2] = srcs[4];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[5];
+          srcs[4] = srcs[6];
+          if (num_taps == 8) {
+            srcs[5] = srcs[7];
+            srcs[6] = srcs[8];
+          }
+        }
+      }
+      y -= 2;
+    } while (y != 0);
+    x += 8;
+  } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
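+// With a contiguous intermediate buffer, each 8 lane vld1q_s16() fetches two
+// 4 pixel rows at once.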
+template <int num_taps, bool is_compound = false>
+void Filter2DVerticalWidth4(const int16_t* LIBGAV1_RESTRICT src,
+                            void* LIBGAV1_RESTRICT const dst,
+                            const ptrdiff_t dst_stride, const int height,
+                            const int16x8_t taps) {
+  const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int16x8_t srcs[9];
+  srcs[0] = vld1q_s16(src);
+  src += 8;
+  if (num_taps >= 4) {
+    srcs[2] = vld1q_s16(src);
+    src += 8;
+    srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
+    if (num_taps >= 6) {
+      srcs[4] = vld1q_s16(src);
+      src += 8;
+      srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
+      if (num_taps == 8) {
+        srcs[6] = vld1q_s16(src);
+        src += 8;
+        srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[num_taps] = vld1q_s16(src);
+    src += 8;
+    srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
+                                      vget_low_s16(srcs[num_taps]));
+
+    const int16x8_t sum =
+        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+    if (is_compound) {
+      const int16x8_t v_compound_offset = vdupq_n_s16(kCompoundOffset);
+      vst1q_u16(dst16,
+                vreinterpretq_u16_s16(vaddq_s16(sum, v_compound_offset)));
+      dst16 += 4 << 1;
+    } else {
+      const uint16x8_t d0 =
+          vminq_u16(vreinterpretq_u16_s16(sum), v_max_bitdepth);
+      vst1_u16(dst16, vget_low_u16(d0));
+      dst16 += dst_stride;
+      vst1_u16(dst16, vget_high_u16(d0));
+      dst16 += dst_stride;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
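+// Each 8 lane load holds four 2 pixel rows; the vextq_s16()/vcombine_s16()
+// shuffles below realign the lanes so that srcs[k] holds rows k through
+// k + 3.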
+template <int num_taps>
+void Filter2DVerticalWidth2(const int16_t* LIBGAV1_RESTRICT src,
+                            void* LIBGAV1_RESTRICT const dst,
+                            const ptrdiff_t dst_stride, const int height,
+                            const int16x8_t taps) {
+  constexpr int next_row = (num_taps < 6) ? 4 : 8;
+  const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int16x8_t srcs[9];
+  srcs[0] = vld1q_s16(src);
+  src += 8;
+  if (num_taps >= 6) {
+    srcs[4] = vld1q_s16(src);
+    src += 8;
+    srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+    if (num_taps == 8) {
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row] = vld1q_s16(src);
+    src += 8;
+    if (num_taps == 2) {
+      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+    } else if (num_taps == 4) {
+      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+    } else if (num_taps == 6) {
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+    } else if (num_taps == 8) {
+      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+      srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
+      srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
+    }
+    const int16x8_t sum =
+        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+    const uint16x8_t d0 = vminq_u16(vreinterpretq_u16_s16(sum), v_max_bitdepth);
+    Store2<0>(dst16, d0);
+    dst16 += dst_stride;
+    Store2<1>(dst16, d0);
+    // |height| == 2 only occurs with the 2 and 4 tap variants, so this early
+    // return does not need to be checked when |num_taps| > 4.
+    if (num_taps <= 4 && height == 2) return;
+    dst16 += dst_stride;
+    Store2<2>(dst16, d0);
+    dst16 += dst_stride;
+    Store2<3>(dst16, d0);
+    dst16 += dst_stride;
+
+    srcs[0] = srcs[4];
+    if (num_taps == 6) {
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+    } else if (num_taps == 8) {
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+    }
+
+    y -= 4;
+  } while (y != 0);
+}
+
+template <int vertical_taps>
+void Filter2DVertical(const int16_t* LIBGAV1_RESTRICT const intermediate_result,
+                      const int width, const int height, const int16x8_t taps,
+                      void* LIBGAV1_RESTRICT const prediction,
+                      const ptrdiff_t pred_stride) {
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  if (width >= 8) {
+    Filter2DVerticalWidth8AndUp<vertical_taps>(
+        intermediate_result, dest, pred_stride, width, height, taps);
+  } else if (width == 4) {
+    Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest,
+                                          pred_stride, height, taps);
+  } else {
+    assert(width == 2);
+    Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest,
+                                          pred_stride, height, taps);
+  }
+}
+
+void Convolve2D_NEON(const void* LIBGAV1_RESTRICT const reference,
+                     const ptrdiff_t reference_stride,
+                     const int horizontal_filter_index,
+                     const int vertical_filter_index,
+                     const int horizontal_filter_id,
+                     const int vertical_filter_id, const int width,
+                     const int height, void* LIBGAV1_RESTRICT const prediction,
+                     const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+  memset(intermediate_result, 0x43, sizeof(intermediate_result));
+#endif
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const auto* const src = static_cast<const uint16_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+  const ptrdiff_t dest_stride = pred_stride >> 1;
+
+  DoHorizontalPass</*is_compound=*/false, /*is_2d=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  assert(vertical_filter_id != 0);
+  const int16x8_t taps = vmovl_s8(
+      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+  if (vertical_taps == 8) {
+    Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+                        dest_stride);
+  } else if (vertical_taps == 6) {
+    Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+                        dest_stride);
+  } else if (vertical_taps == 4) {
+    Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+                        dest_stride);
+  } else {  // |vertical_taps| == 2
+    Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+                        dest_stride);
+  }
+}
+
+template <int vertical_taps>
+void Compound2DVertical(
+    const int16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
+    const int height, const int16x8_t taps,
+    void* LIBGAV1_RESTRICT const prediction) {
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  if (width == 4) {
+    Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, height, taps);
+  } else {
+    Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, width, height, taps);
+  }
+}
+
+void ConvolveCompound2D_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t
+      intermediate_result[(kMaxSuperBlockSizeInPixels *
+                           (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1))];
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is [0, 3].
+  // When width <= 4, the valid filter indices are 3 (2 tap), 4 and 5 (4 tap).
+  // Similarly for height.
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const auto* const src = static_cast<const uint16_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+
+  DoHorizontalPass</*is_compound=*/true, /*is_2d=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  assert(vertical_filter_id != 0);
+  const int16x8_t taps = vmovl_s8(
+      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+  if (vertical_taps == 8) {
+    Compound2DVertical<8>(intermediate_result, width, height, taps, prediction);
+  } else if (vertical_taps == 6) {
+    Compound2DVertical<6>(intermediate_result, width, height, taps, prediction);
+  } else if (vertical_taps == 4) {
+    Compound2DVertical<4>(intermediate_result, width, height, taps, prediction);
+  } else {  // |vertical_taps| == 2
+    Compound2DVertical<2>(intermediate_result, width, height, taps, prediction);
+  }
+}
+
+void ConvolveVertical_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const auto* src = static_cast<const uint16_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride >> 1;
+  assert(vertical_filter_id != 0);
+
+  int16x4_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] =
+        vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+  }
+
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 2) {
+      FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else {
+      FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 1);
+    }
+  } else if ((static_cast<int>(filter_index == 1) &
+              (static_cast<int>(vertical_filter_id == 1) |
+               static_cast<int>(vertical_filter_id == 7) |
+               static_cast<int>(vertical_filter_id == 8) |
+               static_cast<int>(vertical_filter_id == 9) |
+               static_cast<int>(vertical_filter_id == 15))) != 0) {  // 6 tap.
+    if (width == 2) {
+      FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else {
+      FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 1);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    if (width == 2) {
+      FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    if (width == 2) {
+      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height,
+                           taps + 3);
+    } else if (width == 4) {
+      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height,
+                           taps + 3);
+    } else {
+      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 3);
+    }
+  } else {
+    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+    // below map to 4 tap filters.
+    assert(filter_index == 5 || filter_index == 4 ||
+           (filter_index == 1 &&
+            (vertical_filter_id == 0 || vertical_filter_id == 2 ||
+             vertical_filter_id == 3 || vertical_filter_id == 4 ||
+             vertical_filter_id == 5 || vertical_filter_id == 6 ||
+             vertical_filter_id == 10 || vertical_filter_id == 11 ||
+             vertical_filter_id == 12 || vertical_filter_id == 13 ||
+             vertical_filter_id == 14)));
+    // According to GetNumTapsInFilter() this has 6 taps but here we are
+    // treating it as though it has 4.
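+    // For these |vertical_filter_id| values the outer taps of the nominal
+    // 6 tap kernel are zero, so advancing |src| by one row below and using
+    // |taps| + 2 is equivalent to applying the full kernel.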
+    if (filter_index == 1) src += src_stride;
+    if (width == 2) {
+      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else if (width == 4) {
+      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else {
+      FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 2);
+    }
+  }
+}
+
+void ConvolveCompoundVertical_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const auto* src = static_cast<const uint16_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  int16x4_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] =
+        vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+  }
+
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 4) {
+      FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if ((static_cast<int>(filter_index == 1) &
+              (static_cast<int>(vertical_filter_id == 1) |
+               static_cast<int>(vertical_filter_id == 7) |
+               static_cast<int>(vertical_filter_id == 8) |
+               static_cast<int>(vertical_filter_id == 9) |
+               static_cast<int>(vertical_filter_id == 15))) != 0) {  // 6 tap.
+    if (width == 4) {
+      FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    if (width == 4) {
+      FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    if (width == 4) {
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 3);
+    } else {
+      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 3);
+    }
+  } else {
+    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+    // below map to 4 tap filters.
+    assert(filter_index == 5 || filter_index == 4 ||
+           (filter_index == 1 &&
+            (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+             vertical_filter_id == 4 || vertical_filter_id == 5 ||
+             vertical_filter_id == 6 || vertical_filter_id == 10 ||
+             vertical_filter_id == 11 || vertical_filter_id == 12 ||
+             vertical_filter_id == 13 || vertical_filter_id == 14)));
+    // According to GetNumTapsInFilter() this has 6 taps but here we are
+    // treating it as though it has 4.
+    if (filter_index == 1) src += src_stride;
+    if (width == 4) {
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 2);
+    } else {
+      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 2);
+    }
+  }
+}
+
+void ConvolveCompoundCopy_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t /*pred_stride*/) {
+  const auto* src = static_cast<const uint16_t*>(reference);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  constexpr int final_shift =
+      kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+  const uint16x8_t offset =
+      vdupq_n_u16((1 << kBitdepth10) + (1 << (kBitdepth10 - 1)));
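+  // Note: (x + offset) << final_shift ==
+  // (x << final_shift) + (offset << final_shift), so adding |offset| before
+  // the shift applies the compound offset in its pre-shift form.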
+
+  if (width >= 16) {
+    int y = height;
+    do {
+      int x = 0;
+      int w = width;
+      do {
+        const uint16x8_t v_src_lo = vld1q_u16(&src[x]);
+        const uint16x8_t v_src_hi = vld1q_u16(&src[x + 8]);
+        const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset);
+        const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset);
+        const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift);
+        const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift);
+        vst1q_u16(&dest[x], v_dest_lo);
+        vst1q_u16(&dest[x + 8], v_dest_hi);
+        x += 16;
+        w -= 16;
+      } while (w != 0);
+      src += src_stride;
+      dest += width;
+    } while (--y != 0);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const uint16x8_t v_src_lo = vld1q_u16(&src[0]);
+      const uint16x8_t v_src_hi = vld1q_u16(&src[src_stride]);
+      const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset);
+      const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset);
+      const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift);
+      const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift);
+      vst1q_u16(&dest[0], v_dest_lo);
+      vst1q_u16(&dest[8], v_dest_hi);
+      src += src_stride << 1;
+      dest += 16;
+      y -= 2;
+    } while (y != 0);
+  } else {  // width == 4
+    int y = height;
+    do {
+      const uint16x4_t v_src_lo = vld1_u16(&src[0]);
+      const uint16x4_t v_src_hi = vld1_u16(&src[src_stride]);
+      const uint16x4_t v_sum_lo = vadd_u16(v_src_lo, vget_low_u16(offset));
+      const uint16x4_t v_sum_hi = vadd_u16(v_src_hi, vget_low_u16(offset));
+      const uint16x4_t v_dest_lo = vshl_n_u16(v_sum_lo, final_shift);
+      const uint16x4_t v_dest_hi = vshl_n_u16(v_sum_hi, final_shift);
+      vst1_u16(&dest[0], v_dest_lo);
+      vst1_u16(&dest[4], v_dest_hi);
+      src += src_stride << 1;
+      dest += 8;
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+inline void HalfAddHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
+                              uint16_t* LIBGAV1_RESTRICT const dst) {
+  const uint16x8_t left = vld1q_u16(src);
+  const uint16x8_t right = vld1q_u16(src + 1);
+  vst1q_u16(dst, vrhaddq_u16(left, right));
+}
+
+inline void HalfAddHorizontal16(const uint16_t* LIBGAV1_RESTRICT const src,
+                                uint16_t* LIBGAV1_RESTRICT const dst) {
+  HalfAddHorizontal(src, dst);
+  HalfAddHorizontal(src + 8, dst + 8);
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint16_t* LIBGAV1_RESTRICT src,
+                                     const ptrdiff_t src_stride,
+                                     const int height,
+                                     uint16_t* LIBGAV1_RESTRICT dst,
+                                     const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
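+  // Within a row the loop advances |src|/|dst| by |width| - 16 in 16 pixel
+  // steps, so the remainder strides step from the end of one row to the
+  // start of the next.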
+
+  int y = height;
+  do {
+    HalfAddHorizontal16(src, dst);
+    if (width >= 32) {
+      src += 16;
+      dst += 16;
+      HalfAddHorizontal16(src, dst);
+      if (width >= 64) {
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal16(src, dst);
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal16(src, dst);
+        if (width == 128) {
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal16(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal16(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal16(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal16(src, dst);
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+  const auto* src = static_cast<const uint16_t*>(reference);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const ptrdiff_t dst_stride = pred_stride >> 1;
+
+  if (width == 128) {
+    IntraBlockCopyHorizontal<128>(src, src_stride, height, dest, dst_stride);
+  } else if (width == 64) {
+    IntraBlockCopyHorizontal<64>(src, src_stride, height, dest, dst_stride);
+  } else if (width == 32) {
+    IntraBlockCopyHorizontal<32>(src, src_stride, height, dest, dst_stride);
+  } else if (width == 16) {
+    IntraBlockCopyHorizontal<16>(src, src_stride, height, dest, dst_stride);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      HalfAddHorizontal(src, dest);
+      src += src_stride;
+      dest += dst_stride;
+    } while (--y != 0);
+  } else {  // width == 4
+    int y = height;
+    do {
+      uint16x4x2_t left;
+      uint16x4x2_t right;
+      left.val[0] = vld1_u16(src);
+      right.val[0] = vld1_u16(src + 1);
+      src += src_stride;
+      left.val[1] = vld1_u16(src);
+      right.val[1] = vld1_u16(src + 1);
+      src += src_stride;
+
+      vst1_u16(dest, vrhadd_u16(left.val[0], right.val[0]));
+      dest += dst_stride;
+      vst1_u16(dest, vrhadd_u16(left.val[1], right.val[1]));
+      dest += dst_stride;
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint16_t* LIBGAV1_RESTRICT src,
+                                   const ptrdiff_t src_stride, const int height,
+                                   uint16_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+  uint16x8_t row[8], below[8];
+
+  row[0] = vld1q_u16(src);
+  if (width >= 16) {
+    src += 8;
+    row[1] = vld1q_u16(src);
+    if (width >= 32) {
+      src += 8;
+      row[2] = vld1q_u16(src);
+      src += 8;
+      row[3] = vld1q_u16(src);
+      if (width == 64) {
+        src += 8;
+        row[4] = vld1q_u16(src);
+        src += 8;
+        row[5] = vld1q_u16(src);
+        src += 8;
+        row[6] = vld1q_u16(src);
+        src += 8;
+        row[7] = vld1q_u16(src);
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = height;
+  do {
+    below[0] = vld1q_u16(src);
+    if (width >= 16) {
+      src += 8;
+      below[1] = vld1q_u16(src);
+      if (width >= 32) {
+        src += 8;
+        below[2] = vld1q_u16(src);
+        src += 8;
+        below[3] = vld1q_u16(src);
+        if (width == 64) {
+          src += 8;
+          below[4] = vld1q_u16(src);
+          src += 8;
+          below[5] = vld1q_u16(src);
+          src += 8;
+          below[6] = vld1q_u16(src);
+          src += 8;
+          below[7] = vld1q_u16(src);
+        }
+      }
+    }
+    src += src_remainder_stride;
+
+    vst1q_u16(dst, vrhaddq_u16(row[0], below[0]));
+    row[0] = below[0];
+    if (width >= 16) {
+      dst += 8;
+      vst1q_u16(dst, vrhaddq_u16(row[1], below[1]));
+      row[1] = below[1];
+      if (width >= 32) {
+        dst += 8;
+        vst1q_u16(dst, vrhaddq_u16(row[2], below[2]));
+        row[2] = below[2];
+        dst += 8;
+        vst1q_u16(dst, vrhaddq_u16(row[3], below[3]));
+        row[3] = below[3];
+        if (width >= 64) {
+          dst += 8;
+          vst1q_u16(dst, vrhaddq_u16(row[4], below[4]));
+          row[4] = below[4];
+          dst += 8;
+          vst1q_u16(dst, vrhaddq_u16(row[5], below[5]));
+          row[5] = below[5];
+          dst += 8;
+          vst1q_u16(dst, vrhaddq_u16(row[6], below[6]));
+          row[6] = below[6];
+          dst += 8;
+          vst1q_u16(dst, vrhaddq_u16(row[7], below[7]));
+          row[7] = below[7];
+        }
+      }
+    }
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+  const auto* src = static_cast<const uint16_t*>(reference);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const ptrdiff_t dst_stride = pred_stride >> 1;
+
+  if (width == 128) {
+    // Due to register pressure, process two 64xH.
+    for (int i = 0; i < 2; ++i) {
+      IntraBlockCopyVertical<64>(src, src_stride, height, dest, dst_stride);
+      src += 64;
+      dest += 64;
+    }
+  } else if (width == 64) {
+    IntraBlockCopyVertical<64>(src, src_stride, height, dest, dst_stride);
+  } else if (width == 32) {
+    IntraBlockCopyVertical<32>(src, src_stride, height, dest, dst_stride);
+  } else if (width == 16) {
+    IntraBlockCopyVertical<16>(src, src_stride, height, dest, dst_stride);
+  } else if (width == 8) {
+    IntraBlockCopyVertical<8>(src, src_stride, height, dest, dst_stride);
+  } else {  // width == 4
+    uint16x4_t row = vld1_u16(src);
+    src += src_stride;
+    int y = height;
+    do {
+      const uint16x4_t below = vld1_u16(src);
+      src += src_stride;
+      vst1_u16(dest, vrhadd_u16(row, below));
+      dest += dst_stride;
+      row = below;
+    } while (--y != 0);
+  }
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint16_t* LIBGAV1_RESTRICT src,
+                             const ptrdiff_t src_stride, const int height,
+                             uint16_t* LIBGAV1_RESTRICT dst,
+                             const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+  uint16x8_t row[16];
+  row[0] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+  if (width >= 16) {
+    src += 8;
+    row[1] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+    if (width >= 32) {
+      src += 8;
+      row[2] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+      src += 8;
+      row[3] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+      if (width >= 64) {
+        src += 8;
+        row[4] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+        src += 8;
+        row[5] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+        src += 8;
+        row[6] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+        src += 8;
+        row[7] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+        if (width == 128) {
+          src += 8;
+          row[8] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+          src += 8;
+          row[9] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+          src += 8;
+          row[10] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+          src += 8;
+          row[11] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+          src += 8;
+          row[12] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+          src += 8;
+          row[13] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+          src += 8;
+          row[14] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+          src += 8;
+          row[15] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+        }
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = height;
+  do {
+    const uint16x8_t below_0 = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+    vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[0], below_0), 2));
+    row[0] = below_0;
+    if (width >= 16) {
+      src += 8;
+      dst += 8;
+
+      const uint16x8_t below_1 = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+      vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[1], below_1), 2));
+      row[1] = below_1;
+      if (width >= 32) {
+        src += 8;
+        dst += 8;
+
+        const uint16x8_t below_2 =
+            vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+        vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[2], below_2), 2));
+        row[2] = below_2;
+        src += 8;
+        dst += 8;
+
+        const uint16x8_t below_3 =
+            vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+        vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[3], below_3), 2));
+        row[3] = below_3;
+        if (width >= 64) {
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_4 =
+              vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+          vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[4], below_4), 2));
+          row[4] = below_4;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_5 =
+              vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+          vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[5], below_5), 2));
+          row[5] = below_5;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_6 =
+              vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+          vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[6], below_6), 2));
+          row[6] = below_6;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_7 =
+              vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+          vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[7], below_7), 2));
+          row[7] = below_7;
+          if (width == 128) {
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_8 =
+                vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+            vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[8], below_8), 2));
+            row[8] = below_8;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_9 =
+                vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+            vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[9], below_9), 2));
+            row[9] = below_9;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_10 =
+                vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+            vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[10], below_10), 2));
+            row[10] = below_10;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_11 =
+                vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+            vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[11], below_11), 2));
+            row[11] = below_11;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_12 =
+                vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+            vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[12], below_12), 2));
+            row[12] = below_12;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_13 =
+                vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+            vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[13], below_13), 2));
+            row[13] = below_13;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_14 =
+                vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+            vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[14], below_14), 2));
+            row[14] = below_14;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_15 =
+                vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+            vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[15], below_15), 2));
+            row[15] = below_15;
+          }
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopy2D_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+  const auto* src = static_cast<const uint16_t*>(reference);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const ptrdiff_t dst_stride = pred_stride >> 1;
+
+  // Note: vertical access of height + 1 rows is allowed. Because this
+  // function is only used for the u/v planes of intra block copy, such access
+  // is guaranteed to stay within the prediction block.
+
+  if (width == 128) {
+    IntraBlockCopy2D<128>(src, src_stride, height, dest, dst_stride);
+  } else if (width == 64) {
+    IntraBlockCopy2D<64>(src, src_stride, height, dest, dst_stride);
+  } else if (width == 32) {
+    IntraBlockCopy2D<32>(src, src_stride, height, dest, dst_stride);
+  } else if (width == 16) {
+    IntraBlockCopy2D<16>(src, src_stride, height, dest, dst_stride);
+  } else if (width == 8) {
+    IntraBlockCopy2D<8>(src, src_stride, height, dest, dst_stride);
+  } else {  // width == 4
+    uint16x4_t row0 = vadd_u16(vld1_u16(src), vld1_u16(src + 1));
+    src += src_stride;
+
+    int y = height;
+    do {
+      const uint16x4_t row1 = vadd_u16(vld1_u16(src), vld1_u16(src + 1));
+      src += src_stride;
+      const uint16x4_t row2 = vadd_u16(vld1_u16(src), vld1_u16(src + 1));
+      src += src_stride;
+      const uint16x4_t result_01 = vrshr_n_u16(vadd_u16(row0, row1), 2);
+      const uint16x4_t result_12 = vrshr_n_u16(vadd_u16(row1, row2), 2);
+      vst1_u16(dest, result_01);
+      dest += dst_stride;
+      vst1_u16(dest, result_12);
+      dest += dst_stride;
+      row0 = row2;
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Scaled Convolve
+
+// There are many opportunities for overreading in scaled convolve, because the
+// range of starting points for filter windows is anywhere from 0 to 16 for 8
+// destination pixels, and the window sizes range from 2 to 8. To accommodate
+// this range concisely, we use |grade_x| to mean the most steps in src that can
+// be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
+// we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
+// increments. The first load covers the initial elements of src_x, while the
+// final load covers the taps.
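+// For example, with the maximum step_x of 2048 (two whole source steps per
+// destination pixel), 8 destination pixels span 16 source positions; an 8-tap
+// window starting at position 15 reaches position 22, which is why
+// LoadSrcVals() below may issue a third vector load.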
+template <int grade_x>
+inline uint8x16x3_t LoadSrcVals(const uint16_t* const src_x) {
+  uint8x16x3_t ret;
+  // When fractional step size is less than or equal to 1, the rightmost
+  // starting value for a filter may be at position 7. For an 8-tap filter, the
+  // rightmost value for the final tap may be at position 14. Therefore we load
+  // 2 vectors of eight 16-bit values.
+  ret.val[0] = vreinterpretq_u8_u16(vld1q_u16(src_x));
+  ret.val[1] = vreinterpretq_u8_u16(vld1q_u16(src_x + 8));
+#if LIBGAV1_MSAN
+  // Initialize to quiet msan warnings when grade_x <= 1.
+  ret.val[2] = vdupq_n_u8(0);
+#endif
+  if (grade_x > 1) {
+    // When fractional step size is greater than 1 (up to 2), the rightmost
+    // starting value for a filter may be at position 15. For an 8-tap filter,
+    // the rightmost value for the final tap may be at position 22. Therefore we
+    // load 3 vectors of eight 16-bit values.
+    ret.val[2] = vreinterpretq_u8_u16(vld1q_u16(src_x + 16));
+  }
+  return ret;
+}
+
+// Assemble 4 values corresponding to one tap position across multiple filters.
+// This is a simple case because maximum offset is 8 and only smaller filters
+// work on 4xH.
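+// For example, byte indices {0, 1, 4, 5, 8, 9, 12, 13} gather the 16-bit
+// values at positions {0, 2, 4, 6}, since value N occupies bytes 2*N and
+// 2*N+1.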
+inline uint16x4_t PermuteSrcVals(const uint8x16x3_t src_bytes,
+                                 const uint8x8_t indices) {
+  const uint8x16x2_t src_bytes2 = {src_bytes.val[0], src_bytes.val[1]};
+  return vreinterpret_u16_u8(VQTbl2U8(src_bytes2, indices));
+}
+
+// Assemble 8 values corresponding to one tap position across multiple filters.
+// This requires a number of workarounds on A32 architectures, so it may be
+// worth using a different algorithm for that architecture altogether.
+template <int grade_x>
+inline uint16x8_t PermuteSrcVals(const uint8x16x3_t src_bytes,
+                                 const uint8x16_t indices) {
+  if (grade_x == 1) {
+    const uint8x16x2_t src_bytes2 = {src_bytes.val[0], src_bytes.val[1]};
+    return vreinterpretq_u16_u8(VQTbl2QU8(src_bytes2, indices));
+  }
+  return vreinterpretq_u16_u8(VQTbl3QU8(src_bytes, indices));
+}
+
+// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3]
+// Although the taps need to be converted to 16-bit values, they must be
+// arranged by table lookup, which is more expensive for larger types than
+// lengthening in-loop. |tap_index| refers to the index within a kernel applied
+// to a single value.
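+// For example, filter_id 4 selects column 4 of each row below, i.e. taps
+// {48, 16}; every column sums to 64, the unity gain of the half filters.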
+inline int8x16_t GetPositive2TapFilter(const int tap_index) {
+  assert(tap_index < 2);
+  alignas(
+      16) static constexpr int8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
+      {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+      {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+
+  return vld1q_s8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
+}
+
+template <int grade_x>
+inline void ConvolveKernelHorizontal2Tap(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) {
+  // Account for the 0-taps that precede the 2 nonzero taps in the spec.
+  const int kernel_offset = 3;
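+  // (In the 8-tap layout, the 2 nonzero taps occupy positions 3 and 4.)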
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  const int8x16_t filter_taps0 = GetPositive2TapFilter(0);
+  const int8x16_t filter_taps1 = GetPositive2TapFilter(1);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+
+  int p = subpixel_x;
+  if (width <= 4) {
+    const uint16_t* src_y = src;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+    // Each lane of taps[k] corresponds to one output value along the
+    // row, containing kSubPixelFilters[filter_index][filter_id][k], where
+    // filter_id depends on x.
+    const int16x4_t taps[2] = {
+        vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))),
+        vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices)))};
+    // Lower byte of Nth value is at position 2*N.
+    // Narrowing shift is not available here because the maximum shift
+    // parameter is 8.
+    const uint8x8_t src_indices0 = vshl_n_u8(
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+    // Upper byte of Nth value is at position 2*N+1.
+    const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+    // Only 4 values needed.
+    const uint8x8_t src_indices = InterleaveLow8(src_indices0, src_indices1);
+    const uint8x8_t src_lookup[2] = {src_indices,
+                                     vadd_u8(src_indices, vdup_n_u8(2))};
+
+    int y = intermediate_height;
+    do {
+      const uint16_t* src_x =
+          src_y + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_x);
+      // Each lane corresponds to a different filter kernel.
+      const uint16x4_t src[2] = {PermuteSrcVals(src_bytes, src_lookup[0]),
+                                 PermuteSrcVals(src_bytes, src_lookup[1])};
+
+      vst1_s16(intermediate,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src, taps),
+                            kInterRoundBitsHorizontal - 1));
+      src_y = AddByteStride(src_y, src_stride);
+      intermediate += kIntermediateStride;
+    } while (--y != 0);
+    return;
+  }
+
+  // |width| >= 8
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  do {
+    const uint16_t* src_x =
+        src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // Each lane of taps[k] corresponds to one output value along the
+    // row, containing kSubPixelFilters[filter_index][filter_id][k], where
+    // filter_id depends on x.
+    const int16x8_t taps[2] = {
+        vmovl_s8(VQTbl1S8(filter_taps0, filter_indices)),
+        vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))};
+    const int16x4_t taps_low[2] = {vget_low_s16(taps[0]),
+                                   vget_low_s16(taps[1])};
+    const int16x4_t taps_high[2] = {vget_high_s16(taps[0]),
+                                    vget_high_s16(taps[1])};
+    // Lower byte of Nth value is at position 2*N.
+    const uint8x8_t src_indices0 = vshl_n_u8(
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+    // Upper byte of Nth value is at position 2*N+1.
+    const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+    const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1);
+    const uint8x16_t src_indices =
+        vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]);
+    const uint8x16_t src_lookup[2] = {src_indices,
+                                      vaddq_u8(src_indices, vdupq_n_u8(2))};
+
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x);
+      // Each lane corresponds to a different filter kernel.
+      const uint16x8_t src[2] = {
+          PermuteSrcVals<grade_x>(src_bytes, src_lookup[0]),
+          PermuteSrcVals<grade_x>(src_bytes, src_lookup[1])};
+      const uint16x4_t src_low[2] = {vget_low_u16(src[0]),
+                                     vget_low_u16(src[1])};
+      const uint16x4_t src_high[2] = {vget_high_u16(src[0]),
+                                      vget_high_u16(src[1])};
+
+      vst1_s16(intermediate_x,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_low, taps_low),
+                            kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x + 4,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_high, taps_high),
+                            kInterRoundBitsHorizontal - 1));
+      // Avoid right shifting the stride.
+      src_x = AddByteStride(src_x, src_stride);
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
+inline int8x16_t GetPositive4TapFilter(const int tap_index) {
+  assert(tap_index < 4);
+  alignas(
+      16) static constexpr int8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+      {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+      {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+      {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+      {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+
+  return vld1q_s8(kSubPixel4TapPositiveFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+inline void ConvolveKernelHorizontalPositive4Tap(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT intermediate) {
+  // Account for the 0-taps that precede the 4 nonzero taps in the spec.
+  const int kernel_offset = 2;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int8x16_t filter_taps0 = GetPositive4TapFilter(0);
+  const int8x16_t filter_taps1 = GetPositive4TapFilter(1);
+  const int8x16_t filter_taps2 = GetPositive4TapFilter(2);
+  const int8x16_t filter_taps3 = GetPositive4TapFilter(3);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+
+  int p = subpixel_x;
+  // Only add steps to the 10-bit truncated p to avoid overflow.
+  const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+  const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+  const uint8x8_t filter_indices =
+      vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+  // Each lane of taps[k] corresponds to one output value along the row,
+  // containing kSubPixelFilters[filter_index][filter_id][k], where filter_id
+  // depends on x.
+  const int16x4_t taps[4] = {
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))),
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))),
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps2, filter_indices))),
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps3, filter_indices)))};
+  // Lower byte of Nth value is at position 2*N.
+  // Narrowing shift is not available here because the maximum shift
+  // parameter is 8.
+  const uint8x8_t src_indices0 = vshl_n_u8(
+      vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+  // Upper byte of Nth value is at position 2*N+1.
+  const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+  // Only 4 values needed.
+  const uint8x8_t src_indices_base = InterleaveLow8(src_indices0, src_indices1);
+
+  uint8x8_t src_lookup[4];
+  const uint8x8_t two = vdup_n_u8(2);
+  src_lookup[0] = src_indices_base;
+  for (int i = 1; i < 4; ++i) {
+    src_lookup[i] = vadd_u8(src_lookup[i - 1], two);
+  }
+
+  const uint16_t* src_y =
+      src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+  int y = intermediate_height;
+  do {
+    // Load a pool of samples to select from using stepped indices.
+    const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_y);
+    // Each lane corresponds to a different filter kernel.
+    const uint16x4_t src[4] = {PermuteSrcVals(src_bytes, src_lookup[0]),
+                               PermuteSrcVals(src_bytes, src_lookup[1]),
+                               PermuteSrcVals(src_bytes, src_lookup[2]),
+                               PermuteSrcVals(src_bytes, src_lookup[3])};
+
+    vst1_s16(intermediate,
+             vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps),
+                          kInterRoundBitsHorizontal - 1));
+    src_y = AddByteStride(src_y, src_stride);
+    intermediate += kIntermediateStride;
+  } while (--y != 0);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
+inline int8x16_t GetSigned4TapFilter(const int tap_index) {
+  assert(tap_index < 4);
+  alignas(16) static constexpr int8_t
+      kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
+          {-0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {-0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
+
+  return vld1q_s8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+inline void ConvolveKernelHorizontalSigned4Tap(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT intermediate) {
+  const int kernel_offset = 2;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int8x16_t filter_taps0 = GetSigned4TapFilter(0);
+  const int8x16_t filter_taps1 = GetSigned4TapFilter(1);
+  const int8x16_t filter_taps2 = GetSigned4TapFilter(2);
+  const int8x16_t filter_taps3 = GetSigned4TapFilter(3);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  const int p = subpixel_x;
+  // Only add steps to the 10-bit truncated p to avoid overflow.
+  const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+  const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+  const uint8x8_t filter_indices =
+      vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+  // Each lane of taps[k] corresponds to one output value along the row,
+  // containing kSubPixelFilters[filter_index][filter_id][k], where filter_id
+  // depends on x.
+  const int16x4_t taps[4] = {
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))),
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))),
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps2, filter_indices))),
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps3, filter_indices)))};
+  // Lower byte of Nth value is at position 2*N.
+  // Narrowing shift is not available here because the maximum shift
+  // parameter is 8.
+  const uint8x8_t src_indices0 = vshl_n_u8(
+      vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+  // Upper byte of Nth value is at position 2*N+1.
+  const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+  // Only 4 values needed.
+  const uint8x8_t src_indices_base = InterleaveLow8(src_indices0, src_indices1);
+
+  uint8x8_t src_lookup[4];
+  const uint8x8_t two = vdup_n_u8(2);
+  src_lookup[0] = src_indices_base;
+  for (int i = 1; i < 4; ++i) {
+    src_lookup[i] = vadd_u8(src_lookup[i - 1], two);
+  }
+
+  const uint16_t* src_y =
+      src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+  int y = intermediate_height;
+  do {
+    // Load a pool of samples to select from using stepped indices.
+    const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_y);
+    // Each lane corresponds to a different filter kernel.
+    const uint16x4_t src[4] = {PermuteSrcVals(src_bytes, src_lookup[0]),
+                               PermuteSrcVals(src_bytes, src_lookup[1]),
+                               PermuteSrcVals(src_bytes, src_lookup[2]),
+                               PermuteSrcVals(src_bytes, src_lookup[3])};
+
+    vst1_s16(intermediate,
+             vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps),
+                          kInterRoundBitsHorizontal - 1));
+    src_y = AddByteStride(src_y, src_stride);
+    intermediate += kIntermediateStride;
+  } while (--y != 0);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
+inline int8x16_t GetSigned6TapFilter(const int tap_index) {
+  assert(tap_index < 6);
+  alignas(16) static constexpr int8_t
+      kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
+          {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+          {-0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {-0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+          {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+  return vld1q_s8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned6Tap(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  int8x16_t filter_taps[6];
+  for (int i = 0; i < 6; ++i) {
+    filter_taps[i] = GetSigned6TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint16_t* src_x =
+        src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+
+    // Each lane of taps_(low|high)[k] corresponds to one output value
+    // along the row, containing kSubPixelFilters[filter_index][filter_id][k],
+    // where filter_id depends on x.
+    int16x4_t taps_low[6];
+    int16x4_t taps_high[6];
+    for (int i = 0; i < 6; ++i) {
+      const int16x8_t taps_i =
+          vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices));
+      taps_low[i] = vget_low_s16(taps_i);
+      taps_high[i] = vget_high_s16(taps_i);
+    }
+
+    // Lower byte of Nth value is at position 2*N.
+    const uint8x8_t src_indices0 = vshl_n_u8(
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+    // Upper byte of Nth value is at position 2*N+1.
+    const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+    const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1);
+    const uint8x16_t src_indices_base =
+        vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]);
+
+    uint8x16_t src_lookup[6];
+    const uint8x16_t two = vdupq_n_u8(2);
+    src_lookup[0] = src_indices_base;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vaddq_u8(src_lookup[i - 1], two);
+    }
+
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x);
+
+      uint16x4_t src_low[6];
+      uint16x4_t src_high[6];
+      for (int i = 0; i < 6; ++i) {
+        const uint16x8_t src_i =
+            PermuteSrcVals<grade_x>(src_bytes, src_lookup[i]);
+        src_low[i] = vget_low_u16(src_i);
+        src_high[i] = vget_high_u16(src_i);
+      }
+
+      vst1_s16(intermediate_x,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low),
+                            kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x + 4,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high),
+                            kInterRoundBitsHorizontal - 1));
+      // Avoid right shifting the stride.
+      src_x = AddByteStride(src_x, src_stride);
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
+// has mixed positive and negative outer taps depending on the filter id.
+inline int8x16_t GetMixed6TapFilter(const int tap_index) {
+  assert(tap_index < 6);
+  alignas(16) static constexpr int8_t
+      kAbsHalfSubPixel6TapMixedFilterColumns[6][16] = {
+          {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+          {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+          {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+          {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+          {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+          {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+
+  return vld1q_s8(kAbsHalfSubPixel6TapMixedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalMixed6Tap(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  int8x16_t filter_taps[6];
+  for (int i = 0; i < 6; ++i) {
+    filter_taps[i] = GetMixed6TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint16_t* src_x =
+        src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // Each lane of taps_(low|high)[k] corresponds to one output value
+    // along the row, containing kSubPixelFilters[filter_index][filter_id][k],
+    // where filter_id depends on x.
+    int16x4_t taps_low[6];
+    int16x4_t taps_high[6];
+    for (int i = 0; i < 6; ++i) {
+      const int16x8_t taps = vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices));
+      taps_low[i] = vget_low_s16(taps);
+      taps_high[i] = vget_high_s16(taps);
+    }
+
+    // Lower byte of Nth value is at position 2*N.
+    const uint8x8_t src_indices0 = vshl_n_u8(
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+    // Upper byte of Nth value is at position 2*N+1.
+    const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+    const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1);
+    const uint8x16_t src_indices_base =
+        vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]);
+
+    uint8x16_t src_lookup[6];
+    const uint8x16_t two = vdupq_n_u8(2);
+    src_lookup[0] = src_indices_base;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vaddq_u8(src_lookup[i - 1], two);
+    }
+
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x);
+
+      uint16x4_t src_low[6];
+      uint16x4_t src_high[6];
+      for (int i = 0; i < 6; ++i) {
+        const uint16x8_t src_i =
+            PermuteSrcVals<grade_x>(src_bytes, src_lookup[i]);
+        src_low[i] = vget_low_u16(src_i);
+        src_high[i] = vget_high_u16(src_i);
+      }
+
+      vst1_s16(intermediate_x,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low),
+                            kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x + 4,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high),
+                            kInterRoundBitsHorizontal - 1));
+      // Avoid right shifting the stride.
+      src_x = AddByteStride(src_x, src_stride);
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
+inline int8x16_t GetSigned8TapFilter(const int tap_index) {
+  assert(tap_index < 8);
+  alignas(16) static constexpr int8_t
+      kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
+          {-0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -0},
+          {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+          {-0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3,
+           -1},
+          {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+          {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+          {-0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6,
+           -3},
+          {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+          {-0, -0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+
+  return vld1q_s8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned8Tap(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  int8x16_t filter_taps[8];
+  for (int i = 0; i < 8; ++i) {
+    filter_taps[i] = GetSigned8TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint16_t* src_x = src + (p >> kScaleSubPixelBits) - ref_x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+
+    // Lower byte of Nth value is at position 2*N.
+    const uint8x8_t src_indices0 = vshl_n_u8(
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+    // Upper byte of Nth value is at position 2*N+1.
+    const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+    const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1);
+    const uint8x16_t src_indices_base =
+        vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]);
+
+    uint8x16_t src_lookup[8];
+    const uint8x16_t two = vdupq_n_u8(2);
+    src_lookup[0] = src_indices_base;
+    for (int i = 1; i < 8; ++i) {
+      src_lookup[i] = vaddq_u8(src_lookup[i - 1], two);
+    }
+    // Each lane of taps_(low|high)[k] corresponds to one output value
+    // along the row, containing kSubPixelFilters[filter_index][filter_id][k],
+    // where filter_id depends on x.
+    int16x4_t taps_low[8];
+    int16x4_t taps_high[8];
+    for (int i = 0; i < 8; ++i) {
+      const int16x8_t taps = vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices));
+      taps_low[i] = vget_low_s16(taps);
+      taps_high[i] = vget_high_s16(taps);
+    }
+
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x);
+
+      uint16x4_t src_low[8];
+      uint16x4_t src_high[8];
+      for (int i = 0; i < 8; ++i) {
+        const uint16x8_t src_i =
+            PermuteSrcVals<grade_x>(src_bytes, src_lookup[i]);
+        src_low[i] = vget_low_u16(src_i);
+        src_high[i] = vget_high_u16(src_i);
+      }
+
+      vst1_s16(intermediate_x,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_low, taps_low),
+                            kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x + 4,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_high, taps_high),
+                            kInterRoundBitsHorizontal - 1));
+      // Avoid right shifting the stride.
+      src_x = AddByteStride(src_x, src_stride);
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Process 16 bit inputs and output 32 bits.
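+// Filters shorter than 8 taps are stored centered within the 8 lanes of
+// |taps|: a 6-tap kernel reads lanes 1..6, a 4-tap kernel lanes 2..5, and a
+// 2-tap kernel lanes 3..4, matching the lane offsets below.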
+template <int num_taps, bool is_compound>
+inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
+                                    const int16x8_t taps) {
+  const int16x4_t taps_lo = vget_low_s16(taps);
+  const int16x4_t taps_hi = vget_high_s16(taps);
+  int32x4_t sum;
+  if (num_taps == 8) {
+    sum = vmull_lane_s16(src[0], taps_lo, 0);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
+    sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
+  } else if (num_taps == 6) {
+    sum = vmull_lane_s16(src[0], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
+  } else if (num_taps == 4) {
+    sum = vmull_lane_s16(src[0], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
+  } else if (num_taps == 2) {
+    sum = vmull_lane_s16(src[0], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+  }
+
+  if (is_compound) {
+    return vrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
+  }
+
+  return vreinterpret_s16_u16(vqrshrun_n_s32(sum, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, int grade_y, int width, bool is_compound>
+void ConvolveVerticalScale2Or4xH(const int16_t* LIBGAV1_RESTRICT const src,
+                                 const int subpixel_y, const int filter_index,
+                                 const int step_y, const int height,
+                                 void* LIBGAV1_RESTRICT const dest,
+                                 const ptrdiff_t dest_stride) {
+  static_assert(width == 2 || width == 4, "");
+  // We increment stride with the 8-bit pointer and then reinterpret to avoid
+  // shifting |dest_stride|.
+  auto* dest_y = static_cast<uint16_t*>(dest);
+  // In compound mode, |dest_stride| is based on the size of uint16_t, rather
+  // than bytes.
+  auto* compound_dest_y = static_cast<uint16_t*>(dest);
+  // This stride always corresponds to int16_t.
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  const int16_t* src_y = src;
+  int16x4_t s[num_taps + grade_y];
+
+  int p = subpixel_y & 1023;
+  int prev_p = p;
+  int y = height;
+  do {
+    for (int i = 0; i < num_taps; ++i) {
+      s[i] = vld1_s16(src_y + i * src_stride);
+    }
+    int filter_id = (p >> 6) & kSubPixelMask;
+    int16x8_t filter =
+        vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
+    if (is_compound) {
+      assert(width != 2);
+      // This offset potentially overflows into the sign bit, but should yield
+      // the correct unsigned value.
+      const uint16x4_t result =
+          vreinterpret_u16_s16(vadd_s16(sums, vdup_n_s16(kCompoundOffset)));
+      vst1_u16(compound_dest_y, result);
+      compound_dest_y += dest_stride;
+    } else {
+      const uint16x4_t result = vmin_u16(vreinterpret_u16_s16(sums),
+                                         vdup_n_u16((1 << kBitdepth10) - 1));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        vst1_u16(dest_y, result);
+      }
+      dest_y = AddByteStride(dest_y, dest_stride);
+    }
+    p += step_y;
+    const int p_diff =
+        (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+    prev_p = p;
+    // Here we load extra source in case it is needed. If |p_diff| == 0, these
+    // values will be unused, but it's faster to load than to branch.
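+    // |p_diff| never exceeds |grade_y|, so loading |grade_y| extra rows
+    // always suffices.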
+    s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
+    if (grade_y > 1) {
+      s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
+    }
+
+    filter_id = (p >> 6) & kSubPixelMask;
+    filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
+    if (is_compound) {
+      assert(width != 2);
+      const uint16x4_t result =
+          vreinterpret_u16_s16(vadd_s16(sums, vdup_n_s16(kCompoundOffset)));
+      vst1_u16(compound_dest_y, result);
+      compound_dest_y += dest_stride;
+    } else {
+      const uint16x4_t result = vmin_u16(vreinterpret_u16_s16(sums),
+                                         vdup_n_u16((1 << kBitdepth10) - 1));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        vst1_u16(dest_y, result);
+      }
+      dest_y = AddByteStride(dest_y, dest_stride);
+    }
+    p += step_y;
+    src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+    prev_p = p;
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int num_taps, int grade_y, bool is_compound>
+void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT const source,
+                           const int intermediate_height, const int width,
+                           const int subpixel_y, const int filter_index,
+                           const int step_y, const int height,
+                           void* LIBGAV1_RESTRICT const dest,
+                           const ptrdiff_t dest_stride) {
+  // This stride always corresponds to int16_t.
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+
+  int16x8_t s[num_taps + 2];
+
+  const int16_t* src = source;
+  int x = 0;
+  do {
+    const int16_t* src_y = src;
+    int p = subpixel_y & 1023;
+    int prev_p = p;
+    // We increment stride with the 8-bit pointer and then reinterpret to avoid
+    // shifting |dest_stride|.
+    auto* dest_y = static_cast<uint16_t*>(dest) + x;
+    // In compound mode, |dest_stride| is based on the size of uint16_t, rather
+    // than bytes.
+    auto* compound_dest_y = static_cast<uint16_t*>(dest) + x;
+    int y = height;
+    do {
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = vld1q_s16(src_y + i * src_stride);
+      }
+      int filter_id = (p >> 6) & kSubPixelMask;
+      int16x8_t filter =
+          vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      int16x8_t sums =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
+      if (is_compound) {
+        // This offset potentially overflows int16_t, but should yield the
+        // correct unsigned value.
+        const uint16x8_t result = vreinterpretq_u16_s16(
+            vaddq_s16(sums, vdupq_n_s16(kCompoundOffset)));
+        vst1q_u16(compound_dest_y, result);
+        compound_dest_y += dest_stride;
+      } else {
+        const uint16x8_t result = vminq_u16(
+            vreinterpretq_u16_s16(sums), vdupq_n_u16((1 << kBitdepth10) - 1));
+        vst1q_u16(dest_y, result);
+        dest_y = AddByteStride(dest_y, dest_stride);
+      }
+      p += step_y;
+      const int p_diff =
+          (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+      prev_p = p;
+      // Here we load extra source in case it is needed. If |p_diff| == 0, these
+      // values will be unused, but it's faster to load than to branch.
+      s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
+      if (grade_y > 1) {
+        s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
+      }
+
+      filter_id = (p >> 6) & kSubPixelMask;
+      filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      sums = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
+      if (is_compound) {
+        assert(width != 2);
+        const uint16x8_t result = vreinterpretq_u16_s16(
+            vaddq_s16(sums, vdupq_n_s16(kCompoundOffset)));
+        vst1q_u16(compound_dest_y, result);
+        compound_dest_y += dest_stride;
+      } else {
+        const uint16x8_t result = vminq_u16(
+            vreinterpretq_u16_s16(sums), vdupq_n_u16((1 << kBitdepth10) - 1));
+        vst1q_u16(dest_y, result);
+        dest_y = AddByteStride(dest_y, dest_stride);
+      }
+      p += step_y;
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+      prev_p = p;
+
+      y -= 2;
+    } while (y != 0);
+    src += kIntermediateStride * intermediate_height;
+    x += 8;
+  } while (x < width);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_NEON(const void* LIBGAV1_RESTRICT const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int vertical_filter_index, const int subpixel_x,
+                          const int subpixel_y, const int step_x,
+                          const int step_y, const int width, const int height,
+                          void* LIBGAV1_RESTRICT const prediction,
+                          const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  assert(step_x <= 2048);
+  assert(step_y <= 2048);
+  const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      num_vert_taps;
+  int16_t intermediate_result[kIntermediateAllocWidth *
+                              (2 * kIntermediateAllocWidth + 8)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+  memset(intermediate_result, 0x54, sizeof(intermediate_result));
+#endif
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+  // The same applies to height and vertical filter index.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint16_t*>(reference);
+  const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+  src = AddByteStride(src, vert_kernel_offset * src_stride);
+
+  // Derive the maximum value of |step_x| at which all source values fit in one
+  // 16-byte (8-value) load. The final index is src_x + |num_taps| - 1 < 16.
+  // step_x*7 is the final base subpel index for the shuffle mask for filter
+  // inputs in each iteration on large blocks. When step_x is large, we need a
+  // larger structure and use a larger table lookup in order to gather all
+  // filter inputs.
+  const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+  // |num_taps| - 1 is the shuffle index of the final filter input.
+  const int kernel_start_ceiling = 16 - num_horiz_taps;
+  // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+  // (step_x * 7) >> kScaleSubPixelBits < single load limit
+  const int grade_x_threshold =
+      (kernel_start_ceiling << kScaleSubPixelBits) / 7;
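+  // For example, with 8 horizontal taps, kernel_start_ceiling is 8 and any
+  // step_x up to (8 << 10) / 7 = 1170 can use the single-load grade_x = 1
+  // path.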
+
+  switch (filter_index) {
+    case 0:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalSigned6Tap<2>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      } else {
+        ConvolveKernelHorizontalSigned6Tap<1>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      }
+      break;
+    case 1:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
+                                             step_x, intermediate_height,
+                                             intermediate);
+
+      } else {
+        ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
+                                             step_x, intermediate_height,
+                                             intermediate);
+      }
+      break;
+    case 2:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalSigned8Tap<2>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      } else {
+        ConvolveKernelHorizontalSigned8Tap<1>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      }
+      break;
+    case 3:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
+                                        step_x, intermediate_height,
+                                        intermediate);
+      } else {
+        ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
+                                        step_x, intermediate_height,
+                                        intermediate);
+      }
+      break;
+    case 4:
+      assert(width <= 4);
+      ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
+                                         intermediate_height, intermediate);
+      break;
+    default:
+      assert(filter_index == 5);
+      ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
+                                           intermediate_height, intermediate);
+  }
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
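+  // The template arguments are <num_taps, grade_y, [width,] is_compound>;
+  // grade_y is 2 when a single step_y increment can advance more than one
+  // intermediate row (step_y > 1024).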
+  switch (filter_index) {
+    case 0:
+    case 1:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<6, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<6, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<6, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<6, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<6, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<6, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    case 2:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<8, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<8, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<8, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<8, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<8, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<8, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    case 3:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<2, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<2, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<2, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<2, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<2, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<2, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    default:
+      assert(filter_index == 4 || filter_index == 5);
+      assert(height <= 4);
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<4, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<4, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<4, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<4, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<4, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<4, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
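+  // The convolve table is indexed as
+  // [is_intra_block_copy][is_compound][has_vertical_filter]
+  // [has_horizontal_filter], as the assignments below illustrate.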
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
+  dsp->convolve[0][0][1][1] = Convolve2D_NEON;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
+
+  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
+  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
+  dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
+}
+
+}  // namespace
+
+void ConvolveInit10bpp_NEON() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit10bpp_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
new file mode 100644 (file)
index 0000000..97b3f26
--- /dev/null
@@ -0,0 +1,3097 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from outranging int16_t.
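+// For example, an all-positive half filter sums to 64, so the unsigned
+// accumulation peaks at 255 * 64 = 16320, well within int16_t range; mixed
+// filters rely on the unsigned wraparound noted below.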
+template <int filter_index, bool negative_outside_taps = false>
+int16x8_t SumOnePassTaps(const uint8x8_t* const src,
+                         const uint8x8_t* const taps) {
+  uint16x8_t sum;
+  if (filter_index == 0) {
+    // 6 taps. + - + + - +
+    sum = vmull_u8(src[0], taps[0]);
+    // Unsigned overflow will result in a valid int16_t value.
+    sum = vmlsl_u8(sum, src[1], taps[1]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlsl_u8(sum, src[4], taps[4]);
+    sum = vmlal_u8(sum, src[5], taps[5]);
+  } else if (filter_index == 1 && negative_outside_taps) {
+    // 6 taps. - + + + + -
+    // Set a base we can subtract from.
+    sum = vmull_u8(src[1], taps[1]);
+    sum = vmlsl_u8(sum, src[0], taps[0]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlal_u8(sum, src[4], taps[4]);
+    sum = vmlsl_u8(sum, src[5], taps[5]);
+  } else if (filter_index == 1) {
+    // 6 taps. All are positive.
+    sum = vmull_u8(src[0], taps[0]);
+    sum = vmlal_u8(sum, src[1], taps[1]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlal_u8(sum, src[4], taps[4]);
+    sum = vmlal_u8(sum, src[5], taps[5]);
+  } else if (filter_index == 2) {
+    // 8 taps. - + - + + - + -
+    sum = vmull_u8(src[1], taps[1]);
+    sum = vmlsl_u8(sum, src[0], taps[0]);
+    sum = vmlsl_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlal_u8(sum, src[4], taps[4]);
+    sum = vmlsl_u8(sum, src[5], taps[5]);
+    sum = vmlal_u8(sum, src[6], taps[6]);
+    sum = vmlsl_u8(sum, src[7], taps[7]);
+  } else if (filter_index == 3) {
+    // 2 taps. All are positive.
+    sum = vmull_u8(src[0], taps[0]);
+    sum = vmlal_u8(sum, src[1], taps[1]);
+  } else if (filter_index == 4) {
+    // 4 taps. - + + -
+    sum = vmull_u8(src[1], taps[1]);
+    sum = vmlsl_u8(sum, src[0], taps[0]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlsl_u8(sum, src[3], taps[3]);
+  } else if (filter_index == 5) {
+    // 4 taps. All are positive.
+    sum = vmull_u8(src[0], taps[0]);
+    sum = vmlal_u8(sum, src[1], taps[1]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+  }
+  return vreinterpretq_s16_u16(sum);
+}
+
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+          bool is_compound>
+void FilterHorizontalWidth8AndUp(const uint8_t* LIBGAV1_RESTRICT src,
+                                 const ptrdiff_t src_stride,
+                                 void* LIBGAV1_RESTRICT const dest,
+                                 const ptrdiff_t pred_stride, const int width,
+                                 const int height,
+                                 const uint8x8_t* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  if (!is_2d) {
+    int y = height;
+    do {
+      int x = 0;
+      do {  // Increasing loop counter x is better.
+        const uint8x16_t src_long = vld1q_u8(src + x);
+        uint8x8_t v_src[8];
+        int16x8_t sum;
+        if (filter_index < 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+                                                                    v_tap + 1);
+        } else if (filter_index == 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+          v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+        } else if (filter_index == 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+        } else if (filter_index > 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+        }
+        if (is_compound) {
+          const uint16x8_t v_sum = vreinterpretq_u16_s16(
+              vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+          vst1q_u16(&dest16[x], v_sum);
+        } else {
+          // Normally the Horizontal pass does the downshift in two passes:
+          // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+          // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+          // Combining them requires adding the rounding offset from the skipped
+          // shift.
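+          // With the 8bpp constants kInterRoundBitsHorizontal == 3 and
+          // kFilterBits == 7, the two-pass form is (x + 2) >> 2 followed by
+          // (y + 8) >> 4, which equals the fused (x + 2 + 32) >> 6 computed
+          // below: add 1 << 1, then apply a single rounding shift by 6.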
+          constexpr int first_shift_rounding_bit =
+              1 << (kInterRoundBitsHorizontal - 2);
+          sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+          const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+          vst1_u8(&dest8[x], result);
+        }
+        x += 8;
+      } while (x < width);
+      src += src_stride;
+      dest8 += pred_stride;
+      dest16 += pred_stride;
+    } while (--y != 0);
+  } else {
+    int x = 0;
+    do {
+      const uint8_t* s = src + x;
+      int y = height;
+      do {  // Iterate over y; the outer loop increments x by 8.
+        const uint8x16_t src_long = vld1q_u8(s);
+        uint8x8_t v_src[8];
+        int16x8_t sum;
+        if (filter_index < 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+                                                                    v_tap + 1);
+        } else if (filter_index == 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+          v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+        } else if (filter_index == 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+        } else if (filter_index > 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+        }
+        const uint16x8_t v_sum = vreinterpretq_u16_s16(
+            vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+        vst1q_u16(dest16, v_sum);
+        s += src_stride;
+        dest16 += 8;
+      } while (--y != 0);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+template <int filter_index, bool is_2d, bool is_compound>
+void FilterHorizontalWidth4(const uint8_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const uint8x8_t* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  int y = height;
+  do {
+    uint8x8_t v_src[4];
+    int16x8_t sum;
+    v_src[0] = vld1_u8(src);
+    if (filter_index == 3) {
+      v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+      sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+    } else {
+      v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+      v_src[2] = RightShiftVector<2 * 8>(v_src[0]);
+      v_src[3] = RightShiftVector<3 * 8>(v_src[0]);
+      sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+    }
+    if (is_2d || is_compound) {
+      const uint16x4_t v_sum = vreinterpret_u16_s16(
+          vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1));
+      vst1_u16(dest16, v_sum);
+    } else {
+      constexpr int first_shift_rounding_bit =
+          1 << (kInterRoundBitsHorizontal - 2);
+      sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+      const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+      StoreLo4(&dest8[0], result);
+    }
+    src += src_stride;
+    dest8 += pred_stride;
+    dest16 += pred_stride;
+  } while (--y != 0);
+}
+
+template <int filter_index, bool is_2d>
+void FilterHorizontalWidth2(const uint8_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const uint8x8_t* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  int y = height >> 1;
+  do {
+    const uint8x8_t input0 = vld1_u8(src);
+    const uint8x8_t input1 = vld1_u8(src + src_stride);
+    const uint8x8x2_t input = vzip_u8(input0, input1);
+    uint16x8_t sum;
+    if (filter_index == 3) {
+      // tap signs : + +
+      sum = vmull_u8(input.val[0], v_tap[3]);
+      sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]);
+    } else if (filter_index == 4) {
+      // tap signs : - + + -
+      sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+      sum = vmlsl_u8(sum, input.val[0], v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+      sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+    } else {
+      // tap signs : + + + +
+      sum = vmull_u8(input.val[0], v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+      sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+    }
+    int16x8_t s = vreinterpretq_s16_u16(sum);
+    if (is_2d) {
+      const uint16x8_t v_sum =
+          vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1));
+      dest16[0] = vgetq_lane_u16(v_sum, 0);
+      dest16[1] = vgetq_lane_u16(v_sum, 2);
+      dest16 += pred_stride;
+      dest16[0] = vgetq_lane_u16(v_sum, 1);
+      dest16[1] = vgetq_lane_u16(v_sum, 3);
+      dest16 += pred_stride;
+    } else {
+      // Normally the Horizontal pass does the downshift in two passes:
+      // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+      // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+      // Combining them requires adding the rounding offset from the skipped
+      // shift.
+      constexpr int first_shift_rounding_bit =
+          1 << (kInterRoundBitsHorizontal - 2);
+      s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit));
+      const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1);
+      dest8[0] = vget_lane_u8(result, 0);
+      dest8[1] = vget_lane_u8(result, 2);
+      dest8 += pred_stride;
+      dest8[0] = vget_lane_u8(result, 1);
+      dest8[1] = vget_lane_u8(result, 3);
+      dest8 += pred_stride;
+    }
+    src += src_stride << 1;
+  } while (--y != 0);
+
+  // The 2d filters have an odd |height| because the horizontal pass
+  // generates context for the vertical pass.
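+  // For example, |height| == 8 with a 4 tap vertical filter requires
+  // 8 + 4 - 1 = 11 rows of horizontal output.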
+  if (is_2d) {
+    assert(height % 2 == 1);
+    const uint8x8_t input = vld1_u8(src);
+    uint16x8_t sum;
+    if (filter_index == 3) {
+      sum = vmull_u8(input, v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]);
+    } else if (filter_index == 4) {
+      sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]);
+      sum = vmlsl_u8(sum, input, v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+      sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+    } else {
+      assert(filter_index == 5);
+      sum = vmull_u8(input, v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+      sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+    }
+    // |sum| contains an int16_t value.
+    sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+                                             kInterRoundBitsHorizontal - 1));
+    Store2<0>(dest16, sum);
+  }
+}
+
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+          bool is_compound>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT const src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const uint8x8_t* const v_tap) {
+  assert(width < 8 || filter_index <= 3);
+  // Don't simplify the redundant if conditions using the template parameters;
+  // the redundancy helps the compiler generate compact code.
+  if (width >= 8 && filter_index <= 3) {
+    FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d,
+                                is_compound>(src, src_stride, dest, pred_stride,
+                                             width, height, v_tap);
+    return;
+  }
+
+  // The horizontal pass only needs to account for 2 and 4 tap filters when
+  // |width| <= 4.
+  assert(width <= 4);
+  assert(filter_index >= 3 && filter_index <= 5);
+  if (filter_index >= 3 && filter_index <= 5) {
+    if (width == 2 && !is_compound) {
+      FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
+                                                  pred_stride, height, v_tap);
+      return;
+    }
+    assert(width == 4);
+    FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
+        src, src_stride, dest, pred_stride, height, v_tap);
+  }
+}
+
+// Accumulate 16-bit inputs into 32 bits, then narrow back to 16 bits with a
+// rounding shift.
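+// Filters with fewer than 8 taps are stored centered in the 8-entry |taps|,
+// so the first lane used is (8 - num_taps) / 2.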
+template <int num_taps, bool is_compound>
+inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
+                                    const int16x8_t taps) {
+  const int16x4_t taps_lo = vget_low_s16(taps);
+  const int16x4_t taps_hi = vget_high_s16(taps);
+  int32x4_t sum;
+  if (num_taps == 8) {
+    sum = vmull_lane_s16(src[0], taps_lo, 0);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
+    sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
+  } else if (num_taps == 6) {
+    sum = vmull_lane_s16(src[0], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
+  } else if (num_taps == 4) {
+    sum = vmull_lane_s16(src[0], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
+  } else if (num_taps == 2) {
+    sum = vmull_lane_s16(src[0], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+  }
+
+  if (is_compound) {
+    return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
+  }
+
+  return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
+}
+
+template <int num_taps, bool is_compound>
+int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
+                                  const int16x8_t taps) {
+  const int16x4_t taps_lo = vget_low_s16(taps);
+  const int16x4_t taps_hi = vget_high_s16(taps);
+  int32x4_t sum_lo, sum_hi;
+  if (num_taps == 8) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
+  } else if (num_taps == 6) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
+  } else if (num_taps == 4) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
+  } else if (num_taps == 2) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
+  }
+
+  if (is_compound) {
+    return vcombine_s16(
+        vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
+  }
+
+  return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
+                      vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVerticalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
+                                 void* LIBGAV1_RESTRICT const dst,
+                                 const ptrdiff_t dst_stride, const int width,
+                                 const int height, const int16x8_t taps) {
+  assert(width >= 8);
+  constexpr int next_row = num_taps - 1;
+  auto* const dst8 = static_cast<uint8_t*>(dst);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
+
+  int x = 0;
+  do {
+    int16x8_t srcs[9];
+    srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    if (num_taps >= 4) {
+      srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      if (num_taps >= 6) {
+        srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
+        srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
+        if (num_taps == 8) {
+          srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src));
+          src += 8;
+          srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+          src += 8;
+        }
+      }
+    }
+
+    uint8_t* d8 = dst8 + x;
+    uint16_t* d16 = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      const int16x8_t sum0 =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
+      const int16x8_t sum1 =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
+      if (is_compound) {
+        vst1q_u16(d16, vreinterpretq_u16_s16(sum0));
+        d16 += dst_stride;
+        vst1q_u16(d16, vreinterpretq_u16_s16(sum1));
+        d16 += dst_stride;
+      } else {
+        vst1_u8(d8, vqmovun_s16(sum0));
+        d8 += dst_stride;
+        vst1_u8(d8, vqmovun_s16(sum1));
+        d8 += dst_stride;
+      }
+      srcs[0] = srcs[2];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[3];
+        srcs[2] = srcs[4];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[5];
+          srcs[4] = srcs[6];
+          if (num_taps == 8) {
+            srcs[5] = srcs[7];
+            srcs[6] = srcs[8];
+          }
+        }
+      }
+      y -= 2;
+    } while (y != 0);
+    x += 8;
+  } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
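+// Two 4-wide rows fill one 8-lane register; the odd-numbered |srcs| are built
+// by pairing the high half of one load with the low half of the next.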
+template <int num_taps, bool is_compound = false>
+void Filter2DVerticalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
+                            void* LIBGAV1_RESTRICT const dst,
+                            const ptrdiff_t dst_stride, const int height,
+                            const int16x8_t taps) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int16x8_t srcs[9];
+  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+  src += 8;
+  if (num_taps >= 4) {
+    srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
+    if (num_taps >= 6) {
+      srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
+      if (num_taps == 8) {
+        srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
+        srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
+                                      vget_low_s16(srcs[num_taps]));
+
+    const int16x8_t sum =
+        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+    if (is_compound) {
+      const uint16x8_t results = vreinterpretq_u16_s16(sum);
+      vst1q_u16(dst16, results);
+      dst16 += 4 << 1;
+    } else {
+      const uint8x8_t results = vqmovun_s16(sum);
+
+      StoreLo4(dst8, results);
+      dst8 += dst_stride;
+      StoreHi4(dst8, results);
+      dst8 += dst_stride;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
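+// Four 2-wide rows fill one 8-lane register; the intermediate |srcs| are
+// built with vextq_s16 and vcombine_s16 at two-lane offsets.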
+template <int num_taps>
+void Filter2DVerticalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
+                            void* LIBGAV1_RESTRICT const dst,
+                            const ptrdiff_t dst_stride, const int height,
+                            const int16x8_t taps) {
+  constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  int16x8_t srcs[9];
+  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+  src += 8;
+  if (num_taps >= 6) {
+    srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+    if (num_taps == 8) {
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+    }
+  }
+
+  int y = 0;
+  do {
+    srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    if (num_taps == 2) {
+      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+    } else if (num_taps == 4) {
+      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+    } else if (num_taps == 6) {
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+    } else if (num_taps == 8) {
+      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+      srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
+      srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
+    }
+
+    const int16x8_t sum =
+        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+    const uint8x8_t results = vqmovun_s16(sum);
+
+    Store2<0>(dst8, results);
+    dst8 += dst_stride;
+    Store2<1>(dst8, results);
+    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+    // Therefore we don't need to check this condition when |height| > 4.
+    if (num_taps <= 4 && height == 2) return;
+    dst8 += dst_stride;
+    Store2<2>(dst8, results);
+    dst8 += dst_stride;
+    Store2<3>(dst8, results);
+    dst8 += dst_stride;
+
+    srcs[0] = srcs[4];
+    if (num_taps == 6) {
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+    } else if (num_taps == 8) {
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+    }
+
+    y += 4;
+  } while (y < height);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
+  // Duplicate the absolute value for each tap.  Negative taps are corrected
+  // by using the vmlsl_u8 instruction.  Positive taps use vmlal_u8.
+  uint8x8_t v_tap[kSubPixelTaps];
+  assert(filter_id != 0);
+
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+  }
+
+  if (filter_index == 2) {  // 8 tap.
+    FilterHorizontal<2, true, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    // Check if outside taps are positive.
+    if ((filter_id == 1) | (filter_id == 15)) {
+      FilterHorizontal<1, false, is_2d, is_compound>(
+          src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+    } else {
+      FilterHorizontal<1, true, is_2d, is_compound>(
+          src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+    }
+  } else if (filter_index == 0) {  // 6 tap.
+    FilterHorizontal<0, true, is_2d, is_compound>(
+        src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 4) {  // 4 tap.
+    FilterHorizontal<4, true, is_2d, is_compound>(
+        src + 2, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    FilterHorizontal<5, true, is_2d, is_compound>(
+        src + 2, src_stride, dst, dst_stride, width, height, v_tap);
+  } else {  // 2 tap.
+    FilterHorizontal<3, true, is_2d, is_compound>(
+        src + 3, src_stride, dst, dst_stride, width, height, v_tap);
+  }
+}
+
+template <int vertical_taps>
+void Filter2DVertical(
+    const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
+    const int height, const int16x8_t taps,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  auto* const dest = static_cast<uint8_t*>(prediction);
+  if (width >= 8) {
+    Filter2DVerticalWidth8AndUp<vertical_taps>(
+        intermediate_result, dest, pred_stride, width, height, taps);
+  } else if (width == 4) {
+    Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest,
+                                          pred_stride, height, taps);
+  } else {
+    assert(width == 2);
+    Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest,
+                                          pred_stride, height, taps);
+  }
+}
+
+void Convolve2D_NEON(const void* LIBGAV1_RESTRICT const reference,
+                     const ptrdiff_t reference_stride,
+                     const int horizontal_filter_index,
+                     const int vertical_filter_index,
+                     const int horizontal_filter_id,
+                     const int vertical_filter_id, const int width,
+                     const int height, void* LIBGAV1_RESTRICT const prediction,
+                     const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+  memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride;
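+  // Back up the source pointer by (vertical_taps / 2 - 1) rows and
+  // kHorizontalOffset columns so the filter windows are centered;
+  // DoHorizontalPass() steps back in for filters shorter than 8 taps.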
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+                                   width, intermediate_height,
+                                   horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  assert(vertical_filter_id != 0);
+  const int16x8_t taps = vmovl_s8(
+      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+  if (vertical_taps == 8) {
+    Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
+  } else if (vertical_taps == 6) {
+    Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
+  } else if (vertical_taps == 4) {
+    Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
+  } else {  // |vertical_taps| == 2
+    Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
+  }
+}
+
+// There are many opportunities for overreading in scaled convolve, because the
+// range of starting points for filter windows is anywhere from 0 to 16 for 8
+// destination pixels, and the window sizes range from 2 to 8. To accommodate
+// this range concisely, we use |grade_x| to mean the maximum number of whole
+// source pixels traversed by a single |step_x| increment, i.e. 1 or 2. When
+// grade_x is 2, we are guaranteed to exceed 8 whole steps in src for every 8
+// |step_x| increments. The first load covers the initial elements of src_x,
+// while the
+// final load covers the taps.
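+// For example, a 2x downscale uses |step_x| == 2048 (1024 units per source
+// pixel), so 8 outputs span up to 16 source pixels plus taps of context,
+// matching the 16 + 8 bytes loaded when grade_x is 2.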
+template <int grade_x>
+inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) {
+  uint8x8x3_t ret;
+  const uint8x16_t src_val = vld1q_u8(src_x);
+  ret.val[0] = vget_low_u8(src_val);
+  ret.val[1] = vget_high_u8(src_val);
+#if LIBGAV1_MSAN
+  // Initialize to quiet msan warnings when grade_x <= 1.
+  ret.val[2] = vdup_n_u8(0);
+#endif
+  if (grade_x > 1) {
+    ret.val[2] = vld1_u8(src_x + 16);
+  }
+  return ret;
+}
+
+// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3]
+inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
+  assert(tap_index < 2);
+  alignas(
+      16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
+      {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+      {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
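+  // The two columns hold the halved bilinear taps 64 - 4 * id and 4 * id,
+  // evaluated for filter_id 0..15.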
+
+  return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
+}
+
+template <int grade_x>
+inline void ConvolveKernelHorizontal2Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) {
+  // Account for the 0-taps that precede the 2 nonzero taps.
+  const int kernel_offset = 3;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
+  const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+
+  int p = subpixel_x;
+  if (width <= 4) {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // This is a special case. The 2-tap filter has no negative taps, so we
+    // can use unsigned values.
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+                               VQTbl1U8(filter_taps1, filter_indices)};
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16_t src_vals = vld1q_u8(src_x);
+      const uint8x8_t src_indices =
+          vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+      // For each x, a lane of src[k] contains src_x[k].
+      const uint8x8_t src[2] = {
+          VQTbl1U8(src_vals, src_indices),
+          VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+      vst1q_s16(intermediate,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate += kIntermediateStride;
+    } while (--y != 0);
+    return;
+  }
+
+  // |width| >= 8
+  int x = 0;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // This is a special case. The 2-tap filter has no negative taps, so we
+    // can use unsigned values.
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+                               VQTbl1U8(filter_taps1, filter_indices)};
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+      const uint8x8_t src_indices =
+          vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+      // For each x, a lane of src[k] contains src_x[k].
+      const uint8x8_t src[2] = {
+          vtbl3_u8(src_vals, src_indices),
+          vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+      vst1q_s16(intermediate,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
+inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
+  assert(tap_index < 4);
+  alignas(
+      16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+      {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+      {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+      {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+      {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+
+  return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+void ConvolveKernelHorizontalPositive4Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT intermediate) {
+  const int kernel_offset = 2;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
+  const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
+  const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
+  const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  const int p = subpixel_x;
+  // The first filter (filter_id == 0) is the identity: a single 128 tap on
+  // the center, stored as 64 since these filters are pre-shifted by 1.
+  const uint8_t* src_x =
+      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+  // Only add steps to the 10-bit truncated p to avoid overflow.
+  const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+  const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+  const uint8x8_t filter_indices = vand_u8(
+      vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
+  // Note that filter_id depends on x.
+  // For each x, taps[k] has kSubPixelFilters[filter_index][filter_id][k].
+  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+                             VQTbl1U8(filter_taps1, filter_indices),
+                             VQTbl1U8(filter_taps2, filter_indices),
+                             VQTbl1U8(filter_taps3, filter_indices)};
+
+  const uint8x8_t src_indices =
+      vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+  int y = intermediate_height;
+  do {
+    // Load a pool of samples to select from using stepped index vectors.
+    const uint8x16_t src_vals = vld1q_u8(src_x);
+
+    // For each x, src[k] contains src_x[k].
+    // Whereas taps come from different arrays, src pixels are drawn from the
+    // same contiguous line.
+    const uint8x8_t src[4] = {
+        VQTbl1U8(src_vals, src_indices),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};
+
+    vst1q_s16(intermediate,
+              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
+                           kInterRoundBitsHorizontal - 1));
+
+    src_x += src_stride;
+    intermediate += kIntermediateStride;
+  } while (--y != 0);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
+inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
+  assert(tap_index < 4);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
+          {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};
+
+  return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+inline void ConvolveKernelHorizontalSigned4Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT intermediate) {
+  const int kernel_offset = 2;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
+  const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
+  const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
+  const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
+  const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
+                                            static_cast<uint16_t>(step_x));
+
+  const int p = subpixel_x;
+  const uint8_t* src_x =
+      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+  // Only add steps to the 10-bit truncated p to avoid overflow.
+  const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
+  const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
+  const uint8x8_t filter_index_offsets = vshrn_n_u16(
+      vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
+  const uint8x8_t filter_indices =
+      vand_u8(filter_index_offsets, filter_index_mask);
+  // Note that filter_id depends on x.
+  // For each x, taps[k] has kSubPixelFilters[filter_index][filter_id][k].
+  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+                             VQTbl1U8(filter_taps1, filter_indices),
+                             VQTbl1U8(filter_taps2, filter_indices),
+                             VQTbl1U8(filter_taps3, filter_indices)};
+
+  const uint8x8_t src_indices_base =
+      vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);
+
+  const uint8x8_t src_indices[4] = {src_indices_base,
+                                    vadd_u8(src_indices_base, vdup_n_u8(1)),
+                                    vadd_u8(src_indices_base, vdup_n_u8(2)),
+                                    vadd_u8(src_indices_base, vdup_n_u8(3))};
+
+  int y = intermediate_height;
+  do {
+    // Load a pool of samples to select from using stepped indices.
+    const uint8x16_t src_vals = vld1q_u8(src_x);
+
+    // For each x, src[k] contains src_x[k].
+    // Whereas taps come from different arrays, src pixels are drawn from the
+    // same contiguous line.
+    const uint8x8_t src[4] = {
+        VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
+        VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};
+
+    vst1q_s16(intermediate,
+              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
+                           kInterRoundBitsHorizontal - 1));
+    src_x += src_stride;
+    intermediate += kIntermediateStride;
+  } while (--y != 0);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
+inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
+  assert(tap_index < 6);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
+          {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+          {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
+          {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+  return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned6Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x16_t filter_taps[6];
+  for (int i = 0; i < 6; ++i) {
+    filter_taps[i] = GetSigned6TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    // Avoid overreading outside the reference boundaries; with grade_x == 2
+    // LoadSrcVals may read up to 24 bytes starting at |src_x|.
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[6];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    uint8x8_t taps[6];
+    for (int i = 0; i < 6; ++i) {
+      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+    }
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+      const uint8x8_t src[6] = {
+          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};
+
+      vst1q_s16(intermediate_x,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
+// has mixed positive and negative outer taps which are handled in
+// GetMixed6TapFilter().
+inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
+  assert(tap_index < 4);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
+          {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+          {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+          {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+          {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
+
+  return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
+}
+
+inline int8x16_t GetMixed6TapFilter(const int tap_index) {
+  assert(tap_index < 2);
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
+      {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+      {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+
+  return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalMixed6Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x8_t taps[4];
+  int16x8_t mixed_taps[2];
+  uint8x16_t positive_filter_taps[4];
+  for (int i = 0; i < 4; ++i) {
+    positive_filter_taps[i] = GetPositive6TapFilter(i);
+  }
+  int8x16_t mixed_filter_taps[2];
+  mixed_filter_taps[0] = GetMixed6TapFilter(0);
+  mixed_filter_taps[1] = GetMixed6TapFilter(1);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[6];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    for (int i = 0; i < 4; ++i) {
+      taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
+    }
+    mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
+    mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
+
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
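+      // Apply the signed outer taps in 16-bit arithmetic first, then
+      // accumulate the positive inner taps with vmlal_u8; two's complement
+      // wraparound keeps the reinterpreted int16_t result correct.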
+      int16x8_t sum_mixed = vmulq_s16(
+          mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
+      sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
+                            ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
+      uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
+      sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
+      sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
+      sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
+      sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
+
+      vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+                                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
+inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
+  assert(tap_index < 8);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
+          {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
+          {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+          {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
+          {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+          {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+          {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
+          {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+          {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};
+
+  return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned8Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x8_t taps[8];
+  uint8x16_t filter_taps[8];
+  for (int i = 0; i < 8; ++i) {
+    filter_taps[i] = GetSigned8TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[8];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 8; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    for (int i = 0; i < 8; ++i) {
+      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+    }
+
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+      const uint8x8_t src[8] = {
+          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
+          vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};
+
+      vst1q_s16(intermediate_x,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// This function handles blocks of width 2 or 4.
+template <int num_taps, int grade_y, int width, bool is_compound>
+void ConvolveVerticalScale4xH(const int16_t* LIBGAV1_RESTRICT const src,
+                              const int subpixel_y, const int filter_index,
+                              const int step_y, const int height,
+                              void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t dest_stride) {
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  const int16_t* src_y = src;
+  // |dest| is 16-bit in compound mode, Pixel otherwise.
+  auto* dest16_y = static_cast<uint16_t*>(dest);
+  auto* dest_y = static_cast<uint8_t*>(dest);
+  int16x4_t s[num_taps + grade_y];
+
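+  // |p| is the subpel position in 1/1024 pixel units: bits 6..9 select one of
+  // the 16 filter phases and p >> kScaleSubPixelBits selects the source row.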
+  int p = subpixel_y & 1023;
+  int prev_p = p;
+  int y = height;
+  do {
+    for (int i = 0; i < num_taps; ++i) {
+      s[i] = vld1_s16(src_y + i * src_stride);
+    }
+    int filter_id = (p >> 6) & kSubPixelMask;
+    int16x8_t filter =
+        vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
+    if (is_compound) {
+      assert(width != 2);
+      const uint16x4_t result = vreinterpret_u16_s16(sums);
+      vst1_u16(dest16_y, result);
+    } else {
+      const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        StoreLo4(dest_y, result);
+      }
+    }
+    p += step_y;
+    const int p_diff =
+        (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+    prev_p = p;
+    // Here we load extra source in case it is needed. If |p_diff| == 0, these
+    // values will be unused, but it's faster to load than to branch.
+    s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
+    if (grade_y > 1) {
+      s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
+    }
+    dest16_y += dest_stride;
+    dest_y += dest_stride;
+
+    filter_id = (p >> 6) & kSubPixelMask;
+    filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
+    if (is_compound) {
+      assert(width != 2);
+      const uint16x4_t result = vreinterpret_u16_s16(sums);
+      vst1_u16(dest16_y, result);
+    } else {
+      const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        StoreLo4(dest_y, result);
+      }
+    }
+    p += step_y;
+    src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+    prev_p = p;
+    dest16_y += dest_stride;
+    dest_y += dest_stride;
+    y -= 2;
+  } while (y != 0);
+}
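+
+// Illustrative sketch (editorial, not libgav1 API) of the addressing
+// performed above, with the two-row unrolling and the |p_diff| window reuse
+// removed. Each output row selects its filter phase from bits [6..9] of |p|
+// and its source row from bits [10..]. The names and the [16][8] filter-table
+// shape are assumptions based on the code above.
+inline void ScalarVerticalScaleColumn(const int16_t* const src,
+                                      const ptrdiff_t src_stride,
+                                      const int8_t filters[16][8],
+                                      const int subpixel_y, const int step_y,
+                                      const int height, int* const out) {
+  int p = subpixel_y & 1023;
+  for (int y = 0; y < height; ++y) {
+    // The 8-entry filter rows are assumed zero-padded for shorter filters.
+    const int8_t* const taps = filters[(p >> 6) & 15];
+    const int16_t* const row = src + (p >> 10) * src_stride;
+    int sum = 0;
+    for (int k = 0; k < 8; ++k) sum += taps[k] * row[k * src_stride];
+    out[y] = sum;  // The NEON path also applies a rounding shift.
+    p += step_y;
+  }
+}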
+
+template <int num_taps, int grade_y, bool is_compound>
+inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT const source,
+                                  const int intermediate_height,
+                                  const int width, const int subpixel_y,
+                                  const int filter_index, const int step_y,
+                                  const int height,
+                                  void* LIBGAV1_RESTRICT const dest,
+                                  const ptrdiff_t dest_stride) {
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  // A possible improvement is to use arithmetic to decide how many times to
+  // apply the filters to the same source rows before checking whether new
+  // rows must be loaded. However, this only helps with very small step sizes.
+  int16x8_t s[num_taps + grade_y];
+  // |dest| is 16-bit in compound mode, Pixel otherwise.
+  uint16_t* dest16_y;
+  uint8_t* dest_y;
+  const int16_t* src = source;
+
+  int x = 0;
+  do {
+    const int16_t* src_y = src;
+    dest16_y = static_cast<uint16_t*>(dest) + x;
+    dest_y = static_cast<uint8_t*>(dest) + x;
+    int p = subpixel_y & 1023;
+    int prev_p = p;
+    int y = height;
+    do {
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = vld1q_s16(src_y + i * src_stride);
+      }
+      int filter_id = (p >> 6) & kSubPixelMask;
+      int16x8_t filter =
+          vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
+      if (is_compound) {
+        vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+      } else {
+        vst1_u8(dest_y, vqmovun_s16(sum));
+      }
+      p += step_y;
+      const int p_diff =
+          (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+      // |grade_y| > 1 always means |p_diff| > 0, so load both vectors that
+      // may be needed. Otherwise only one extra vector needs to be loaded,
+      // because |p_diff| cannot exceed 1.
+      s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
+      if (grade_y > 1) {
+        s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
+      }
+      dest16_y += dest_stride;
+      dest_y += dest_stride;
+
+      filter_id = (p >> 6) & kSubPixelMask;
+      filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
+      if (is_compound) {
+        vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+      } else {
+        vst1_u8(dest_y, vqmovun_s16(sum));
+      }
+      p += step_y;
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+      prev_p = p;
+      dest16_y += dest_stride;
+      dest_y += dest_stride;
+      y -= 2;
+    } while (y != 0);
+    src += kIntermediateStride * intermediate_height;
+    x += 8;
+  } while (x < width);
+}
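+
+// Illustrative note: |step_y| is in 1/1024-pel units, so step_y <= 1024
+// advances the source by at most one row per output row (grade_y == 1),
+// while steps up to the asserted maximum of 2048 can advance two rows
+// (grade_y == 2). This is what the step_y <= 1024 branches in
+// ConvolveScale2D_NEON below select.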
+
+template <bool is_compound>
+void ConvolveScale2D_NEON(const void* LIBGAV1_RESTRICT const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int vertical_filter_index, const int subpixel_x,
+                          const int subpixel_y, const int step_x,
+                          const int step_y, const int width, const int height,
+                          void* LIBGAV1_RESTRICT const prediction,
+                          const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  assert(step_x <= 2048);
+  assert(step_y <= 2048);
+  const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      num_vert_taps;
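+  // For example (illustrative): with height == 8, step_y == 1280 and an
+  // 8-tap vertical filter, ((8 - 1) * 1280 + 1023) >> 10 == 9, so
+  // intermediate_height == 9 + 8 == 17 rows of horizontal output are needed.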
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kIntermediateAllocWidth *
+                              (2 * kIntermediateAllocWidth + 8)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+  memset(intermediate_result, 0x44, sizeof(intermediate_result));
+#endif
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+  // Similarly for height.
+  int filter_index = horiz_filter_index;
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+  src += vert_kernel_offset * src_stride;
+
+  // Derive the maximum value of |step_x| at which all source values fit in
+  // one 16-byte load, i.e. the final index satisfies
+  // src_x + |num_taps| - 1 < 16. step_x * 7 is the final base subpel index
+  // for the shuffle mask for filter inputs in each iteration on large blocks;
+  // when step_x is large, a larger structure and a larger table lookup are
+  // needed to gather all filter inputs.
+  // |num_taps| - 1 is the shuffle index of the final filter input.
+  const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+  const int kernel_start_ceiling = 16 - num_horiz_taps;
+  // The truncated quotient |grade_x_threshold| selects |step_x| such that
+  // (step_x * 7) >> kScaleSubPixelBits < the single-load limit:
+  const int grade_x_threshold =
+      (kernel_start_ceiling << kScaleSubPixelBits) / 7;
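+  // For example (illustrative): with an 8-tap horizontal filter,
+  // kernel_start_ceiling == 8 and grade_x_threshold == (8 << 10) / 7 == 1170,
+  // so any |step_x| above 1170 selects the grade_x == 2 paths below, which
+  // gather from a wider pool of source bytes.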
+  switch (filter_index) {
+    case 0:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalSigned6Tap<2>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      } else {
+        ConvolveKernelHorizontalSigned6Tap<1>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      }
+      break;
+    case 1:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
+                                             step_x, intermediate_height,
+                                             intermediate);
+
+      } else {
+        ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
+                                             step_x, intermediate_height,
+                                             intermediate);
+      }
+      break;
+    case 2:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalSigned8Tap<2>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      } else {
+        ConvolveKernelHorizontalSigned8Tap<1>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      }
+      break;
+    case 3:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
+                                        step_x, intermediate_height,
+                                        intermediate);
+      } else {
+        ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
+                                        step_x, intermediate_height,
+                                        intermediate);
+      }
+      break;
+    case 4:
+      assert(width <= 4);
+      ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
+                                         intermediate_height, intermediate);
+      break;
+    default:
+      assert(filter_index == 5);
+      ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
+                                           intermediate_height, intermediate);
+  }
+  // Vertical filter.
+  filter_index = vert_filter_index;
+  intermediate = intermediate_result;
+
+  switch (filter_index) {
+    case 0:
+    case 1:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<6, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<6, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    case 2:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<8, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<8, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    case 3:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<2, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<2, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    case 4:
+    default:
+      assert(filter_index == 4 || filter_index == 5);
+      assert(height <= 4);
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<4, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<4, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+  }
+}
+
+void ConvolveHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* const src =
+      static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint8_t*>(prediction);
+
+  DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                   horizontal_filter_id, filter_index);
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// vertical calculations.
+uint16x8_t Compound1DShift(const int16x8_t sum) {
+  return vreinterpretq_u16_s16(
+      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
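+
+// For example (illustrative, assuming kInterRoundBitsHorizontal == 3 as in
+// the 8-bit path): the shift above is by 2, i.e. (sum + 2) >> 2 per lane;
+// the "- 1" compensates for the halved coefficients in kHalfSubPixelFilters.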
+
+template <int filter_index, bool is_compound = false,
+          bool negative_outside_taps = false>
+void FilterVertical(const uint8_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t src_stride,
+                    void* LIBGAV1_RESTRICT const dst,
+                    const ptrdiff_t dst_stride, const int width,
+                    const int height, const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  auto* const dst8 = static_cast<uint8_t*>(dst);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 8);
+
+  int x = 0;
+  do {
+    const uint8_t* src_x = src + x;
+    uint8x8_t srcs[8];
+    srcs[0] = vld1_u8(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = vld1_u8(src_x);
+      src_x += src_stride;
+      srcs[2] = vld1_u8(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = vld1_u8(src_x);
+        src_x += src_stride;
+        srcs[4] = vld1_u8(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = vld1_u8(src_x);
+          src_x += src_stride;
+          srcs[6] = vld1_u8(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    // Decreasing the y loop counter produces worse code with clang.
+    // Don't unroll this loop: it generates too much code, and the decoder
+    // ends up slower overall.
+    int y = 0;
+    do {
+      srcs[next_row] = vld1_u8(src_x);
+      src_x += src_stride;
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+        vst1q_u16(dst16 + x + y * dst_stride, results);
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+        vst1_u8(dst8 + x + y * dst_stride, results);
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
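+
+// Illustrative scalar model (an editorial sketch, not a libgav1 helper) of
+// the sliding-window pattern used above: one new row is loaded per output row
+// and the window of |num_taps| rows shifts down by one, so every source row
+// is read exactly once.
+inline void ScalarFilterVerticalColumn(const uint8_t* src,
+                                       const ptrdiff_t src_stride,
+                                       const int8_t* const taps,
+                                       const int num_taps, const int height,
+                                       int* const out) {
+  int window[8];  // num_taps is at most 8.
+  for (int k = 0; k < num_taps - 1; ++k) {
+    window[k] = src[0];
+    src += src_stride;
+  }
+  for (int y = 0; y < height; ++y) {
+    window[num_taps - 1] = src[0];
+    src += src_stride;
+    int sum = 0;
+    for (int k = 0; k < num_taps; ++k) sum += taps[k] * window[k];
+    out[y] = sum;  // The NEON path also rounds, shifts and clamps.
+    for (int k = 0; k < num_taps - 1; ++k) window[k] = window[k + 1];
+  }
+}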
+
+template <int filter_index, bool is_compound = false,
+          bool negative_outside_taps = false>
+void FilterVertical4xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int height,
+                       const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  uint8x8_t srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+
+    int y = height;
+    do {
+      srcs[0] = Load4<1>(src, srcs[0]);
+      src += src_stride;
+      srcs[2] = Load4<0>(src, srcs[2]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 4) {
+    srcs[4] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+    int y = height;
+    do {
+      srcs[2] = Load4<1>(src, srcs[2]);
+      src += src_stride;
+      srcs[4] = Load4<0>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 6) {
+    srcs[6] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+    srcs[2] = Load4<1>(src, srcs[2]);
+    src += src_stride;
+    srcs[4] = Load4(src);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+    int y = height;
+    do {
+      srcs[4] = Load4<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[6] = Load4<0>(src, srcs[6]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 8) {
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+    srcs[2] = Load4<1>(src, srcs[2]);
+    src += src_stride;
+    srcs[4] = Load4(src);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+    srcs[4] = Load4<1>(src, srcs[4]);
+    src += src_stride;
+    srcs[6] = Load4(src);
+    src += src_stride;
+    srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+    int y = height;
+    do {
+      srcs[6] = Load4<1>(src, srcs[6]);
+      src += src_stride;
+      srcs[8] = Load4<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[7] = vext_u8(srcs[6], srcs[8], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      srcs[5] = srcs[7];
+      srcs[6] = srcs[8];
+      y -= 2;
+    } while (y != 0);
+  }
+}
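+
+// Illustrative note on the 4xH layout above: Load4<1>() places a second
+// 4-pixel row in the high half of a uint8x8_t, and vext_u8(a, b, 4) forms the
+// intermediate row pair. For rows r0..r3:
+//   srcs[0] = {r0, r1}, srcs[2] = {r2, r3}
+//   srcs[1] = vext_u8(srcs[0], srcs[2], 4) = {r1, r2}
+// so each SumOnePassTaps() call filters two output rows at once.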
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int height,
+                       const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  uint8x8_t srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      srcs[0] = Load2<1>(src, srcs[0]);
+      src += src_stride;
+      srcs[0] = Load2<2>(src, srcs[0]);
+      src += src_stride;
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      srcs[2] = Load2<0>(src, srcs[2]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[2], 2);
+
+      // This uses srcs[0]..srcs[1].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[2];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 4) {
+    srcs[4] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      srcs[4] = Load2<0>(src, srcs[4]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+      // This uses srcs[0]..srcs[3].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 6) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    srcs[4] = Load2(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+
+    int y = 0;
+    do {
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+
+      // This uses srcs[0]..srcs[5].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 8) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    srcs[4] = Load2(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+    srcs[4] = Load2<1>(src, srcs[4]);
+    src += src_stride;
+    srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+    srcs[4] = Load2<2>(src, srcs[4]);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+    int y = 0;
+    do {
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+      srcs[8] = Load2<1>(src, srcs[8]);
+      src += src_stride;
+      srcs[6] = vext_u8(srcs[4], srcs[8], 4);
+      srcs[8] = Load2<2>(src, srcs[8]);
+      src += src_stride;
+      srcs[7] = vext_u8(srcs[4], srcs[8], 6);
+
+      // This uses srcs[0]..srcs[7].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+      y += 4;
+    } while (y < height);
+  }
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called in single-prediction mode, where only vertical filtering is
+// required. The output is the single prediction of the block, clipped to the
+// valid pixel range.
+void ConvolveVertical_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* const dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  uint8x8_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] =
+        vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+  }
+
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 2) {
+      FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else {
+      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 1);
+    }
+  } else if ((static_cast<int>(filter_index == 1) &
+              (static_cast<int>(vertical_filter_id == 1) |
+               static_cast<int>(vertical_filter_id == 15))) != 0) {  // 5 tap.
+    if (width == 2) {
+      FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else {
+      FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 1);
+    }
+  } else if ((static_cast<int>(filter_index == 1) &
+              (static_cast<int>(vertical_filter_id == 7) |
+               static_cast<int>(vertical_filter_id == 8) |
+               static_cast<int>(vertical_filter_id == 9))) !=
+             0) {  // 6 tap with weird negative taps.
+    if (width == 2) {
+      FilterVertical2xH<1,
+                        /*negative_outside_taps=*/true>(
+          src, src_stride, dest, dest_stride, height, taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/false,
+                        /*negative_outside_taps=*/true>(
+          src, src_stride, dest, dest_stride, height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
+          src, src_stride, dest, dest_stride, width, height, taps + 1);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    if (width == 2) {
+      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    if (width == 2) {
+      FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
+                           taps + 3);
+    } else if (width == 4) {
+      FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
+                           taps + 3);
+    } else {
+      FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 3);
+    }
+  } else if (filter_index == 4) {  // 4 tap.
+    // Outside taps are negative.
+    if (width == 2) {
+      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else if (width == 4) {
+      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else {
+      FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 2);
+    }
+  } else {
+    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+    // below map to 4 tap filters.
+    assert(filter_index == 5 ||
+           (filter_index == 1 &&
+            (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+             vertical_filter_id == 4 || vertical_filter_id == 5 ||
+             vertical_filter_id == 6 || vertical_filter_id == 10 ||
+             vertical_filter_id == 11 || vertical_filter_id == 12 ||
+             vertical_filter_id == 13 || vertical_filter_id == 14)));
+    // According to GetNumTapsInFilter() this has 6 taps but here we are
+    // treating it as though it has 4.
+    if (filter_index == 1) src += src_stride;
+    if (width == 2) {
+      FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else if (width == 4) {
+      FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else {
+      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 2);
+    }
+  }
+}
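+
+// Illustrative note: the |taps| offsets above center shorter filters within
+// the 8-entry kAbsHalfSubPixelFilters rows: a 6-tap filter occupies entries
+// [1, 6] (hence taps + 1), a 4-tap filter entries [2, 5] (taps + 2) and the
+// 2-tap bilinear filter entries [3, 4] (taps + 3); the outer entries are
+// zero.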
+
+void ConvolveCompoundCopy_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const ptrdiff_t src_stride = reference_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  constexpr int final_shift =
+      kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+
+  if (width >= 16) {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        const uint8x16_t v_src = vld1q_u8(&src[x]);
+        const uint16x8_t v_dest_lo =
+            vshll_n_u8(vget_low_u8(v_src), final_shift);
+        const uint16x8_t v_dest_hi =
+            vshll_n_u8(vget_high_u8(v_src), final_shift);
+        vst1q_u16(&dest[x], v_dest_lo);
+        x += 8;
+        vst1q_u16(&dest[x], v_dest_hi);
+        x += 8;
+      } while (x < width);
+      src += src_stride;
+      dest += width;
+    } while (--y != 0);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const uint8x8_t v_src = vld1_u8(&src[0]);
+      const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+      vst1q_u16(&dest[0], v_dest);
+      src += src_stride;
+      dest += width;
+    } while (--y != 0);
+  } else {  // width == 4
+    uint8x8_t v_src = vdup_n_u8(0);
+
+    int y = height;
+    do {
+      v_src = Load4<0>(&src[0], v_src);
+      src += src_stride;
+      v_src = Load4<1>(&src[0], v_src);
+      src += src_stride;
+      const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+      vst1q_u16(&dest[0], v_dest);
+      dest += 4 << 1;
+      y -= 2;
+    } while (y != 0);
+  }
+}
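+
+// For example (illustrative, assuming kInterRoundBitsVertical == 11 and
+// kInterRoundBitsCompoundVertical == 7): final_shift == 4, so the copy stores
+// pixel << 4. This keeps unfiltered compound references at the same
+// intermediate precision as filtered ones before averaging.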
+
+void ConvolveCompoundVertical_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  uint8x8_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] =
+        vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+  }
+
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 4) {
+      FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if ((static_cast<int>(filter_index == 1) &
+              (static_cast<int>(vertical_filter_id == 1) |
+               static_cast<int>(vertical_filter_id == 15))) != 0) {  // 5 tap.
+    if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if ((static_cast<int>(filter_index == 1) &
+              (static_cast<int>(vertical_filter_id == 7) |
+               static_cast<int>(vertical_filter_id == 8) |
+               static_cast<int>(vertical_filter_id == 9))) !=
+             0) {  // 6 tap with weird negative taps.
+    if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/true,
+                        /*negative_outside_taps=*/true>(src, src_stride, dest,
+                                                        4, height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
+          src, src_stride, dest, width, width, height, taps + 1);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    if (width == 4) {
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    if (width == 4) {
+      FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 3);
+    } else {
+      FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 3);
+    }
+  } else if (filter_index == 4) {  // 4 tap.
+    if (width == 4) {
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 2);
+    } else {
+      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 2);
+    }
+  } else {
+    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+    // below map to 4 tap filters.
+    assert(filter_index == 5 ||
+           (filter_index == 1 &&
+            (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+             vertical_filter_id == 4 || vertical_filter_id == 5 ||
+             vertical_filter_id == 6 || vertical_filter_id == 10 ||
+             vertical_filter_id == 11 || vertical_filter_id == 12 ||
+             vertical_filter_id == 13 || vertical_filter_id == 14)));
+    // According to GetNumTapsInFilter() this has 6 taps but here we are
+    // treating it as though it has 4.
+    if (filter_index == 1) src += src_stride;
+    if (width == 4) {
+      FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 2);
+    } else {
+      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 2);
+    }
+  }
+}
+
+void ConvolveCompoundHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* const src =
+      static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+
+  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+      src, reference_stride, dest, width, width, height, horizontal_filter_id,
+      filter_index);
+}
+
+template <int vertical_taps>
+void Compound2DVertical(
+    const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
+    const int height, const int16x8_t taps,
+    void* LIBGAV1_RESTRICT const prediction) {
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  if (width == 4) {
+    Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, height, taps);
+  } else {
+    Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, width, height, taps);
+  }
+}
+
+void ConvolveCompound2D_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+  // Similarly for height.
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  assert(vertical_filter_id != 0);
+  const int16x8_t taps = vmovl_s8(
+      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+  if (vertical_taps == 8) {
+    Compound2DVertical<8>(intermediate_result, width, height, taps, prediction);
+  } else if (vertical_taps == 6) {
+    Compound2DVertical<6>(intermediate_result, width, height, taps, prediction);
+  } else if (vertical_taps == 4) {
+    Compound2DVertical<4>(intermediate_result, width, height, taps, prediction);
+  } else {  // |vertical_taps| == 2
+    Compound2DVertical<2>(intermediate_result, width, height, taps, prediction);
+  }
+}
+
+inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT const src,
+                              uint8_t* LIBGAV1_RESTRICT const dst) {
+  const uint8x16_t left = vld1q_u8(src);
+  const uint8x16_t right = vld1q_u8(src + 1);
+  vst1q_u8(dst, vrhaddq_u8(left, right));
+}
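+
+// Illustrative note: vrhaddq_u8 is a rounding halving add, so for each of the
+// 16 bytes HalfAddHorizontal() computes the scalar
+//   dst[i] = (src[i] + src[i + 1] + 1) >> 1;
+// i.e. the half-pel horizontal interpolation used by intra block copy.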
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                                     const ptrdiff_t src_stride,
+                                     const int height,
+                                     uint8_t* LIBGAV1_RESTRICT dst,
+                                     const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+  int y = height;
+  do {
+    HalfAddHorizontal(src, dst);
+    if (width >= 32) {
+      src += 16;
+      dst += 16;
+      HalfAddHorizontal(src, dst);
+      if (width >= 64) {
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        if (width == 128) {
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+                                  pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const uint8x8_t left = vld1_u8(src);
+      const uint8x8_t right = vld1_u8(src + 1);
+      vst1_u8(dest, vrhadd_u8(left, right));
+
+      src += reference_stride;
+      dest += pred_stride;
+    } while (--y != 0);
+  } else {  // width == 4
+    uint8x8_t left = vdup_n_u8(0);
+    uint8x8_t right = vdup_n_u8(0);
+    int y = height;
+    do {
+      left = Load4<0>(src, left);
+      right = Load4<0>(src + 1, right);
+      src += reference_stride;
+      left = Load4<1>(src, left);
+      right = Load4<1>(src + 1, right);
+      src += reference_stride;
+
+      const uint8x8_t result = vrhadd_u8(left, right);
+
+      StoreLo4(dest, result);
+      dest += pred_stride;
+      StoreHi4(dest, result);
+      dest += pred_stride;
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src,
+                                   const ptrdiff_t src_stride, const int height,
+                                   uint8_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+  uint8x16_t row[8], below[8];
+
+  row[0] = vld1q_u8(src);
+  if (width >= 32) {
+    src += 16;
+    row[1] = vld1q_u8(src);
+    if (width >= 64) {
+      src += 16;
+      row[2] = vld1q_u8(src);
+      src += 16;
+      row[3] = vld1q_u8(src);
+      if (width == 128) {
+        src += 16;
+        row[4] = vld1q_u8(src);
+        src += 16;
+        row[5] = vld1q_u8(src);
+        src += 16;
+        row[6] = vld1q_u8(src);
+        src += 16;
+        row[7] = vld1q_u8(src);
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = height;
+  do {
+    below[0] = vld1q_u8(src);
+    if (width >= 32) {
+      src += 16;
+      below[1] = vld1q_u8(src);
+      if (width >= 64) {
+        src += 16;
+        below[2] = vld1q_u8(src);
+        src += 16;
+        below[3] = vld1q_u8(src);
+        if (width == 128) {
+          src += 16;
+          below[4] = vld1q_u8(src);
+          src += 16;
+          below[5] = vld1q_u8(src);
+          src += 16;
+          below[6] = vld1q_u8(src);
+          src += 16;
+          below[7] = vld1q_u8(src);
+        }
+      }
+    }
+    src += src_remainder_stride;
+
+    vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
+    row[0] = below[0];
+    if (width >= 32) {
+      dst += 16;
+      vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
+      row[1] = below[1];
+      if (width >= 64) {
+        dst += 16;
+        vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
+        row[2] = below[2];
+        dst += 16;
+        vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
+        row[3] = below[3];
+        if (width == 128) {
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
+          row[4] = below[4];
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
+          row[5] = below[5];
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
+          row[6] = below[6];
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
+          row[7] = below[7];
+        }
+      }
+    }
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+                                pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 8) {
+    uint8x8_t row, below;
+    row = vld1_u8(src);
+    src += reference_stride;
+
+    int y = height;
+    do {
+      below = vld1_u8(src);
+      src += reference_stride;
+
+      vst1_u8(dest, vrhadd_u8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (--y != 0);
+  } else {  // width == 4
+    uint8x8_t row = Load4(src);
+    uint8x8_t below = vdup_n_u8(0);
+    src += reference_stride;
+
+    int y = height;
+    do {
+      below = Load4<0>(src, below);
+      src += reference_stride;
+
+      StoreLo4(dest, vrhadd_u8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (--y != 0);
+  }
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src,
+                             const ptrdiff_t src_stride, const int height,
+                             uint8_t* LIBGAV1_RESTRICT dst,
+                             const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+  uint16x8_t row[16];
+  row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+  if (width >= 16) {
+    src += 8;
+    row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+    if (width >= 32) {
+      src += 8;
+      row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+      src += 8;
+      row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+      if (width >= 64) {
+        src += 8;
+        row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        src += 8;
+        row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        src += 8;
+        row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        src += 8;
+        row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        if (width == 128) {
+          src += 8;
+          row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        }
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = height;
+  do {
+    const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+    vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
+    row[0] = below_0;
+    if (width >= 16) {
+      src += 8;
+      dst += 8;
+
+      const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+      vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2));
+      row[1] = below_1;
+      if (width >= 32) {
+        src += 8;
+        dst += 8;
+
+        const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2));
+        row[2] = below_2;
+        src += 8;
+        dst += 8;
+
+        const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2));
+        row[3] = below_3;
+        if (width >= 64) {
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2));
+          row[4] = below_4;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2));
+          row[5] = below_5;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2));
+          row[6] = below_6;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2));
+          row[7] = below_7;
+          if (width == 128) {
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2));
+            row[8] = below_8;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2));
+            row[9] = below_9;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_10 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
+            row[10] = below_10;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_11 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
+            row[11] = below_11;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_12 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
+            row[12] = below_12;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_13 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
+            row[13] = below_13;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_14 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
+            row[14] = below_14;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_15 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
+            row[15] = below_15;
+          }
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
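+
+// In the 2D path above, each output pixel is the rounded average of a 2x2
+// neighborhood: vaddl_u8 sums horizontal pairs, adjacent row sums are added,
+// and vrshrn_n_u16(sum, 2) computes (sum + 2) >> 2. For example, the inputs
+// 10, 20 (top) and 30, 40 (bottom) yield (10 + 20 + 30 + 40 + 2) >> 2 = 25.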
+
+void ConvolveIntraBlockCopy2D_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: vertical access of up to height + 1 rows is allowed. Because this
+  // function is only used for the u/v planes of intra block copy, such access
+  // is guaranteed to stay within the prediction block.
+
+  if (width == 128) {
+    IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 8) {
+    IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+  } else {  // width == 4
+    uint8x8_t left = Load4(src);
+    uint8x8_t right = Load4(src + 1);
+    src += reference_stride;
+
+    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
+
+    int y = height;
+    do {
+      left = Load4<0>(src, left);
+      right = Load4<0>(src + 1, right);
+      src += reference_stride;
+      left = Load4<1>(src, left);
+      right = Load4<1>(src + 1, right);
+      src += reference_stride;
+
+      const uint16x8_t below = vaddl_u8(left, right);
+
+      const uint8x8_t result = vrshrn_n_u16(
+          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
+      StoreLo4(dest, result);
+      dest += pred_stride;
+      StoreHi4(dest, result);
+      dest += pred_stride;
+
+      row = vget_high_u16(below);
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
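+  // The table index order below is
+  // [is_intra_block_copy][is_compound][has_vertical_filter]
+  // [has_horizontal_filter], matching the assignments that follow.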
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
+  dsp->convolve[0][0][1][1] = Convolve2D_NEON;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
+
+  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
+  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
+  dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/convolve_neon.h b/src/dsp/arm/convolve_neon.h
new file mode 100644 (file)
index 0000000..9c67bc9
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve. These functions are not thread-safe.
+void ConvolveInit_NEON();
+void ConvolveInit10bpp_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ConvolveHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Convolve2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveCompound2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopyHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopyVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopy2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ConvolveScale2D LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc
new file mode 100644 (file)
index 0000000..6087276
--- /dev/null
@@ -0,0 +1,334 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+
+constexpr int kInterPostRoundBit = 4;
+
+namespace low_bitdepth {
+namespace {
+
+inline uint8x8_t ComputeWeightedAverage8(const int16x8_t pred0,
+                                         const int16x8_t pred1,
+                                         const int16x8_t weight) {
+  // Given: p0, p1 in the range [-5132, 9212], with weights satisfying
+  // w0 + w1 = 16.
+  // Output: (p0 * w0 + p1 * w1 + 128 (= rounding bit)) >>
+  //    8 (= kInterPostRoundBit + 4)
+  // The formula is manipulated to avoid lengthening to 32 bits.
+  // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1
+  // = (p0 - p1) * w0 + 16 * p1
+  // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808.
+  const int16x8_t diff = vsubq_s16(pred0, pred1);
+  // (((p0 - p1) * (w0 << 11) << 1) >> 16) + ((16 * p1) >> 4)
+  const int16x8_t weighted_diff = vqdmulhq_s16(diff, weight);
+  // ((p0 - p1) * w0 >> 4) + p1
+  const int16x8_t upscaled_average = vaddq_s16(weighted_diff, pred1);
+  // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4
+  return vqrshrun_n_s16(upscaled_average, kInterPostRoundBit);
+}
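+
+// Worked example of the fixed-point math above, with p0 = 100, p1 = 50 and
+// w0 = 9 (so w1 = 7): diff = 50, and vqdmulhq_s16(50, 9 << 11) =
+// (2 * 50 * 18432) >> 16 = 28, i.e. (p0 - p1) * w0 >> 4. Adding p1 gives 78,
+// and vqrshrun_n_s16(78, 4) = (78 + 8) >> 4 = 5, matching the direct
+// computation (100 * 9 + 50 * 7 + 128) >> 8 = 1378 >> 8 = 5.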
+
+template <int width>
+inline void DistanceWeightedBlendSmall_NEON(
+    const int16_t* LIBGAV1_RESTRICT prediction_0,
+    const int16_t* LIBGAV1_RESTRICT prediction_1, const int height,
+    const int16x8_t weight, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  constexpr int step = 16 / width;
+
+  int y = height;
+  do {
+    const int16x8_t src_00 = vld1q_s16(prediction_0);
+    const int16x8_t src_10 = vld1q_s16(prediction_1);
+    prediction_0 += 8;
+    prediction_1 += 8;
+    const uint8x8_t result0 = ComputeWeightedAverage8(src_00, src_10, weight);
+
+    const int16x8_t src_01 = vld1q_s16(prediction_0);
+    const int16x8_t src_11 = vld1q_s16(prediction_1);
+    prediction_0 += 8;
+    prediction_1 += 8;
+    const uint8x8_t result1 = ComputeWeightedAverage8(src_01, src_11, weight);
+
+    if (width == 4) {
+      StoreLo4(dst, result0);
+      dst += dest_stride;
+      StoreHi4(dst, result0);
+      dst += dest_stride;
+      StoreLo4(dst, result1);
+      dst += dest_stride;
+      StoreHi4(dst, result1);
+      dst += dest_stride;
+    } else {
+      assert(width == 8);
+      vst1_u8(dst, result0);
+      dst += dest_stride;
+      vst1_u8(dst, result1);
+      dst += dest_stride;
+    }
+    y -= step;
+  } while (y != 0);
+}
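+
+// With |step| = 16 / width, each pass above consumes two 8-lane vectors, i.e.
+// 16 pixels: 4 rows when width == 4 and 2 rows when width == 8, which is why
+// |height| is decremented by |step|.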
+
+inline void DistanceWeightedBlendLarge_NEON(
+    const int16_t* LIBGAV1_RESTRICT prediction_0,
+    const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x8_t weight,
+    const int width, const int height, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const int16x8_t src0_lo = vld1q_s16(prediction_0 + x);
+      const int16x8_t src1_lo = vld1q_s16(prediction_1 + x);
+      const uint8x8_t res_lo =
+          ComputeWeightedAverage8(src0_lo, src1_lo, weight);
+
+      const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8);
+      const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8);
+      const uint8x8_t res_hi =
+          ComputeWeightedAverage8(src0_hi, src1_hi, weight);
+
+      const uint8x16_t result = vcombine_u8(res_lo, res_hi);
+      vst1q_u8(dst + x, result);
+      x += 16;
+    } while (x < width);
+    dst += dest_stride;
+    prediction_0 += width;
+    prediction_1 += width;
+  } while (--y != 0);
+}
+
+inline void DistanceWeightedBlend_NEON(
+    const void* LIBGAV1_RESTRICT prediction_0,
+    const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0,
+    const uint8_t /*weight_1*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  // Upscale the weight for vqdmulh.
+  const int16x8_t weight = vdupq_n_s16(weight_0 << 11);
+  if (width == 4) {
+    DistanceWeightedBlendSmall_NEON<4>(pred_0, pred_1, height, weight, dest,
+                                       dest_stride);
+    return;
+  }
+
+  if (width == 8) {
+    DistanceWeightedBlendSmall_NEON<8>(pred_0, pred_1, height, weight, dest,
+                                       dest_stride);
+    return;
+  }
+
+  DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weight, width, height, dest,
+                                  dest_stride);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x4x2_t ComputeWeightedAverage8(const uint16x4x2_t pred0,
+                                            const uint16x4x2_t pred1,
+                                            const uint16x4_t weights[2]) {
+  const uint32x4_t wpred0_lo = vmull_u16(weights[0], pred0.val[0]);
+  const uint32x4_t wpred0_hi = vmull_u16(weights[0], pred0.val[1]);
+  const uint32x4_t blended_lo = vmlal_u16(wpred0_lo, weights[1], pred1.val[0]);
+  const uint32x4_t blended_hi = vmlal_u16(wpred0_hi, weights[1], pred1.val[1]);
+  const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+  const int32x4_t res_lo = vsubq_s32(vreinterpretq_s32_u32(blended_lo), offset);
+  const int32x4_t res_hi = vsubq_s32(vreinterpretq_s32_u32(blended_hi), offset);
+  const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+  // Clip the result at (1 << bd) - 1.
+  uint16x4x2_t result;
+  result.val[0] =
+      vmin_u16(vqrshrun_n_s32(res_lo, kInterPostRoundBit + 4), bd_max);
+  result.val[1] =
+      vmin_u16(vqrshrun_n_s32(res_hi, kInterPostRoundBit + 4), bd_max);
+  return result;
+}
+
+inline uint16x4x4_t ComputeWeightedAverage8(const uint16x4x4_t pred0,
+                                            const uint16x4x4_t pred1,
+                                            const uint16x4_t weights[2]) {
+  const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+  const uint32x4_t wpred0 = vmull_u16(weights[0], pred0.val[0]);
+  const uint32x4_t wpred1 = vmull_u16(weights[0], pred0.val[1]);
+  const uint32x4_t blended0 = vmlal_u16(wpred0, weights[1], pred1.val[0]);
+  const uint32x4_t blended1 = vmlal_u16(wpred1, weights[1], pred1.val[1]);
+  const int32x4_t res0 = vsubq_s32(vreinterpretq_s32_u32(blended0), offset);
+  const int32x4_t res1 = vsubq_s32(vreinterpretq_s32_u32(blended1), offset);
+  const uint32x4_t wpred2 = vmull_u16(weights[0], pred0.val[2]);
+  const uint32x4_t wpred3 = vmull_u16(weights[0], pred0.val[3]);
+  const uint32x4_t blended2 = vmlal_u16(wpred2, weights[1], pred1.val[2]);
+  const uint32x4_t blended3 = vmlal_u16(wpred3, weights[1], pred1.val[3]);
+  const int32x4_t res2 = vsubq_s32(vreinterpretq_s32_u32(blended2), offset);
+  const int32x4_t res3 = vsubq_s32(vreinterpretq_s32_u32(blended3), offset);
+  const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+  // Clip the result at (1 << bd) - 1.
+  uint16x4x4_t result;
+  result.val[0] =
+      vmin_u16(vqrshrun_n_s32(res0, kInterPostRoundBit + 4), bd_max);
+  result.val[1] =
+      vmin_u16(vqrshrun_n_s32(res1, kInterPostRoundBit + 4), bd_max);
+  result.val[2] =
+      vmin_u16(vqrshrun_n_s32(res2, kInterPostRoundBit + 4), bd_max);
+  result.val[3] =
+      vmin_u16(vqrshrun_n_s32(res3, kInterPostRoundBit + 4), bd_max);
+
+  return result;
+}
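+
+// In both overloads above, the blend is computed in 32 bits as
+// w0 * p0 + w1 * p1. Since w0 + w1 == 16, the compound prediction offset is
+// removed with a single subtraction of kCompoundOffset * 16, and the result
+// is narrowed with rounding by kInterPostRoundBit + 4 and clamped to the
+// 10-bit maximum of 1023.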
+
+// We could use vld1_u16_x2, but for compatibility reasons we use this function
+// instead. The compiler optimizes it to the correct instruction.
+inline uint16x4x2_t LoadU16x4_x2(uint16_t const* ptr) {
+  uint16x4x2_t x;
+  // gcc/clang (64 bit) optimizes the following to ldp.
+  x.val[0] = vld1_u16(ptr);
+  x.val[1] = vld1_u16(ptr + 4);
+  return x;
+}
+
+// We could use vld1_u16_x4, but for compatibility reasons we use this function
+// instead. The compiler optimizes it to a pair of vld1_u16_x2, which showed
+// better performance in the speed tests.
+inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) {
+  uint16x4x4_t x;
+  x.val[0] = vld1_u16(ptr);
+  x.val[1] = vld1_u16(ptr + 4);
+  x.val[2] = vld1_u16(ptr + 8);
+  x.val[3] = vld1_u16(ptr + 12);
+  return x;
+}
+
+void DistanceWeightedBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                                const void* LIBGAV1_RESTRICT prediction_1,
+                                const uint8_t weight_0, const uint8_t weight_1,
+                                const int width, const int height,
+                                void* LIBGAV1_RESTRICT const dest,
+                                const ptrdiff_t dest_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+  const uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)};
+
+  if (width == 4) {
+    int y = height;
+    do {
+      const uint16x4x2_t src0 = LoadU16x4_x2(pred_0);
+      const uint16x4x2_t src1 = LoadU16x4_x2(pred_1);
+      const uint16x4x2_t res = ComputeWeightedAverage8(src0, src1, weights);
+      vst1_u16(dst, res.val[0]);
+      vst1_u16(dst + dst_stride, res.val[1]);
+      dst += dst_stride << 1;
+      pred_0 += 8;
+      pred_1 += 8;
+      y -= 2;
+    } while (y != 0);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const uint16x4x4_t src0 = LoadU16x4_x4(pred_0);
+      const uint16x4x4_t src1 = LoadU16x4_x4(pred_1);
+      const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+      vst1_u16(dst, res.val[0]);
+      vst1_u16(dst + 4, res.val[1]);
+      vst1_u16(dst + dst_stride, res.val[2]);
+      vst1_u16(dst + dst_stride + 4, res.val[3]);
+      dst += dst_stride << 1;
+      pred_0 += 16;
+      pred_1 += 16;
+      y -= 2;
+    } while (y != 0);
+  } else {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        const uint16x4x4_t src0 = LoadU16x4_x4(pred_0 + x);
+        const uint16x4x4_t src1 = LoadU16x4_x4(pred_1 + x);
+        const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+        vst1_u16(dst + x, res.val[0]);
+        vst1_u16(dst + x + 4, res.val[1]);
+        vst1_u16(dst + x + 8, res.val[2]);
+        vst1_u16(dst + x + 12, res.val[3]);
+        x += 16;
+      } while (x < width);
+      dst += dst_stride;
+      pred_0 += width;
+      pred_1 += width;
+    } while (--y != 0);
+  }
+}
+
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void DistanceWeightedBlendInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/distance_weighted_blend_neon.h b/src/dsp/arm/distance_weighted_blend_neon.h
new file mode 100644 (file)
index 0000000..94a799c
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If NEON is enabled, signal that the NEON implementation should be used
+// instead of the normal C one.
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
new file mode 100644 (file)
index 0000000..cde887c
--- /dev/null
@@ -0,0 +1,1501 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// These functions are overloaded for both possible source sizes so that a
+// template function can load from, and store to, the intermediate value types
+// without caring about the underlying element width.
+inline int16x8_t GetSignedSource8(const int8_t* src) {
+  return vmovl_s8(vld1_s8(src));
+}
+
+inline int16x8_t GetSignedSource8(const uint8_t* src) {
+  return ZeroExtend(vld1_u8(src));
+}
+
+inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int valid_range) {
+  return ZeroExtend(Load1MsanU8(src, 8 - valid_range));
+}
+
+inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
+  vst1_u8(dest, vmovn_u16(data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline int16x8_t GetSignedSource8(const int16_t* src) { return vld1q_s16(src); }
+
+inline int16x8_t GetSignedSource8(const uint16_t* src) {
+  return vreinterpretq_s16_u16(vld1q_u16(src));
+}
+
+inline int16x8_t GetSignedSource8Msan(const uint16_t* src, int valid_range) {
+  return vreinterpretq_s16_u16(Load1QMsanU16(src, 16 - valid_range));
+}
+
+inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
+  vst1q_u16(dest, data);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Each element in |sum| is the running autoregression total for one
+// destination value. The fixed source values in |grain_lo| and |grain_hi|
+// allow for a sliding window in successive calls to this function.
+template <int position_offset>
+inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
+                                           const int16x8_t grain_hi,
+                                           int16_t coeff, int32x4x2_t sum) {
+  const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset);
+  sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff);
+  sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff);
+  return sum;
+}
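+
+// For example, with position_offset == 2, vextq_s16(grain_lo, grain_hi, 2)
+// yields the 8 samples starting 2 lanes to the right of |grain_lo|, so
+// successive offsets slide the filter window across the row without reloading
+// from memory.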
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int8_t* LIBGAV1_RESTRICT grain_cursor,
+                                     int32x4x2_t sum,
+                                     const int8_t* LIBGAV1_RESTRICT coeffs,
+                                     int pos, int shift) {
+  int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+  for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+    result += grain_cursor[lane + delta_col] * coeffs[pos];
+    ++pos;
+  }
+  grain_cursor[lane] =
+      Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+            GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int16_t* LIBGAV1_RESTRICT grain_cursor,
+                                     int32x4x2_t sum,
+                                     const int8_t* LIBGAV1_RESTRICT coeffs,
+                                     int pos, int shift) {
+  int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+  for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+    result += grain_cursor[lane + delta_col] * coeffs[pos];
+    ++pos;
+  }
+  grain_cursor[lane] =
+      Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+            GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(
+    int8_t* LIBGAV1_RESTRICT u_grain_cursor,
+    int8_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
+    int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
+    const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      u_grain_cursor, sum_u, coeffs_u, pos, shift);
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(
+    int16_t* LIBGAV1_RESTRICT u_grain_cursor,
+    int16_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
+    int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
+    const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      u_grain_cursor, sum_u, coeffs_u, pos, shift);
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline void SetZero(int32x4x2_t* v) {
+  v->val[0] = vdupq_n_s32(0);
+  v->val[1] = vdupq_n_s32(0);
+}
+
+// Computes subsampled luma for use with chroma by averaging in the x and/or y
+// directions, as applicable.
+int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x,
+                            int subsampling_y, ptrdiff_t stride) {
+  if (subsampling_y != 0) {
+    assert(subsampling_x != 0);
+    const int8x16_t src0 = vld1q_s8(luma);
+    const int8x16_t src1 = vld1q_s8(luma + stride);
+    const int16x8_t ret0 = vcombine_s16(vpaddl_s8(vget_low_s8(src0)),
+                                        vpaddl_s8(vget_high_s8(src0)));
+    const int16x8_t ret1 = vcombine_s16(vpaddl_s8(vget_low_s8(src1)),
+                                        vpaddl_s8(vget_high_s8(src1)));
+    return vrshrq_n_s16(vaddq_s16(ret0, ret1), 2);
+  }
+  if (subsampling_x != 0) {
+    const int8x16_t src = vld1q_s8(luma);
+    return vrshrq_n_s16(
+        vcombine_s16(vpaddl_s8(vget_low_s8(src)), vpaddl_s8(vget_high_s8(src))),
+        1);
+  }
+  return vmovl_s8(vld1_s8(luma));
+}
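+
+// For 4:2:0 (both subsampling flags set), the function above averages each
+// 2x2 luma block with rounding: vpaddl_s8 sums horizontal pairs, the two row
+// sums are added, and vrshrq_n_s16(sum, 2) computes (a + b + c + d + 2) >> 2.
+// For 4:2:2, only horizontal pairs are averaged: (a + b + 1) >> 1.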
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+  if (subsampling_x != 0) {
+    const uint8x16_t src = vld1q_u8(luma);
+    return vrshrq_n_u16(vpaddlq_u8(src), 1);
+  }
+  return vmovl_u8(vld1_u8(luma));
+}
+
+inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma,
+                                     int subsampling_x, int valid_range) {
+  if (subsampling_x != 0) {
+    const uint8x16_t src = MaskOverreadsQ(vld1q_u8(luma), 16 - valid_range);
+    // MemorySanitizer registers vpaddlq_u8 as a use of the memory.
+    return vrshrq_n_u16(vpaddlq_u8(src), 1);
+  }
+  return MaskOverreadsQ(vmovl_u8(vld1_u8(luma)), 16 - valid_range);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Computes subsampled luma for use with chroma by averaging in the x and/or y
+// directions, as applicable.
+int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x,
+                            int subsampling_y, ptrdiff_t stride) {
+  if (subsampling_y != 0) {
+    assert(subsampling_x != 0);
+    int16x8_t src0_lo = vld1q_s16(luma);
+    int16x8_t src0_hi = vld1q_s16(luma + 8);
+    const int16x8_t src1_lo = vld1q_s16(luma + stride);
+    const int16x8_t src1_hi = vld1q_s16(luma + stride + 8);
+    const int16x8_t src0 =
+        vcombine_s16(vpadd_s16(vget_low_s16(src0_lo), vget_high_s16(src0_lo)),
+                     vpadd_s16(vget_low_s16(src0_hi), vget_high_s16(src0_hi)));
+    const int16x8_t src1 =
+        vcombine_s16(vpadd_s16(vget_low_s16(src1_lo), vget_high_s16(src1_lo)),
+                     vpadd_s16(vget_low_s16(src1_hi), vget_high_s16(src1_hi)));
+    return vrshrq_n_s16(vaddq_s16(src0, src1), 2);
+  }
+  if (subsampling_x != 0) {
+    const int16x8_t src_lo = vld1q_s16(luma);
+    const int16x8_t src_hi = vld1q_s16(luma + 8);
+    const int16x8_t ret =
+        vcombine_s16(vpadd_s16(vget_low_s16(src_lo), vget_high_s16(src_lo)),
+                     vpadd_s16(vget_low_s16(src_hi), vget_high_s16(src_hi)));
+    return vrshrq_n_s16(ret, 1);
+  }
+  return vld1q_s16(luma);
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
+                                 int subsampling_x) {
+  if (subsampling_x != 0) {
+    const uint16x8x2_t src = vld2q_u16(luma);
+    return vrhaddq_u16(src.val[0], src.val[1]);
+  }
+  return vld1q_u16(luma);
+}
+
+inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma,
+                                     int subsampling_x, int valid_range) {
+  if (subsampling_x != 0) {
+    const uint16x8x2_t src = vld2q_u16(luma);
+    const uint16x8_t result = vrhaddq_u16(src.val[0], src.val[1]);
+    return MaskOverreadsQ(result, 16 - valid_range);
+  }
+  return Load1QMsanU16(luma, 16 - valid_range);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+          bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_NEON(
+    const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT luma_grain_buffer, int subsampling_x,
+    int subsampling_y, void* LIBGAV1_RESTRICT u_grain_buffer,
+    void* LIBGAV1_RESTRICT v_grain_buffer) {
+  static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag.");
+  const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+  auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+  auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+  const int auto_regression_shift = params.auto_regression_shift;
+  const int chroma_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  const int chroma_height =
+      (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+  // When |chroma_width| == 44, we write 8 at a time from x in [3, 34],
+  // leaving [35, 40] to write at the end.
+  const int chroma_width_remainder =
+      (chroma_width - 2 * kAutoRegressionBorder) & 7;
+
+  int y = kAutoRegressionBorder;
+  luma_grain += kLumaWidth * y;
+  u_grain += chroma_width * y;
+  v_grain += chroma_width * y;
+  do {
+    // Each row is computed 8 values at a time in the following loop. At the
+    // end of the loop, the remaining values (4, or 6 when |chroma_width| ==
+    // 44) are written in a special reduced iteration.
+    int x = kAutoRegressionBorder;
+    int luma_x = kAutoRegressionBorder;
+    do {
+      int pos = 0;
+      int32x4x2_t sum_u;
+      int32x4x2_t sum_v;
+      SetZero(&sum_u);
+      SetZero(&sum_v);
+
+      if (auto_regression_coeff_lag > 0) {
+        for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+             ++delta_row) {
+          // These loads may overflow to the next row, but they are never called
+          // on the final row of a grain block. Therefore, they will never
+          // exceed the block boundaries.
+          // Note: this could be slightly optimized to a single load in 8bpp,
+          // but requires making a special first iteration and accumulate
+          // function that takes an int8x16_t.
+          const int16x8_t u_grain_lo =
+              GetSignedSource8(u_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag);
+          const int16x8_t u_grain_hi =
+              GetSignedSource8(u_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag + 8);
+          const int16x8_t v_grain_lo =
+              GetSignedSource8(v_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag);
+          const int16x8_t v_grain_hi =
+              GetSignedSource8(v_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag + 8);
+#define ACCUMULATE_WEIGHTED_GRAIN(offset)                                  \
+  sum_u = AccumulateWeightedGrain<offset>(                                 \
+      u_grain_lo, u_grain_hi, params.auto_regression_coeff_u[pos], sum_u); \
+  sum_v = AccumulateWeightedGrain<offset>(                                 \
+      v_grain_lo, v_grain_hi, params.auto_regression_coeff_v[pos++], sum_v)
+
+          ACCUMULATE_WEIGHTED_GRAIN(0);
+          ACCUMULATE_WEIGHTED_GRAIN(1);
+          ACCUMULATE_WEIGHTED_GRAIN(2);
+          // The horizontal |auto_regression_coeff_lag| loop is replaced with
+          // if-statements to give vextq_s16 an immediate param.
+          if (auto_regression_coeff_lag > 1) {
+            ACCUMULATE_WEIGHTED_GRAIN(3);
+            ACCUMULATE_WEIGHTED_GRAIN(4);
+          }
+          if (auto_regression_coeff_lag > 2) {
+            assert(auto_regression_coeff_lag == 3);
+            ACCUMULATE_WEIGHTED_GRAIN(5);
+            ACCUMULATE_WEIGHTED_GRAIN(6);
+          }
+        }
+      }
+
+      if (use_luma) {
+        const int16x8_t luma = GetSubsampledLuma(
+            luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+        // Luma samples get the final coefficient in the formula, but are best
+        // computed all at once before the final row.
+        const int coeff_u =
+            params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+        const int coeff_v =
+            params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+        sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+        sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+        sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+        sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+      }
+      // At this point in the filter, the source addresses and destination
+      // addresses overlap. Because this is an auto-regressive filter, the
+      // higher lanes cannot be computed without the results of the lower lanes.
+      // Each call to WriteFinalAutoRegression incorporates preceding values
+      // on the final row, and writes a single sample. This allows the next
+      // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane)                                    \
+  WriteFinalAutoRegressionChroma<bitdepth, auto_regression_coeff_lag, lane>(  \
+      u_grain + x, v_grain + x, sum_u, sum_v, params.auto_regression_coeff_u, \
+      params.auto_regression_coeff_v, pos, auto_regression_shift)
+
+      WRITE_AUTO_REGRESSION_RESULT(0);
+      WRITE_AUTO_REGRESSION_RESULT(1);
+      WRITE_AUTO_REGRESSION_RESULT(2);
+      WRITE_AUTO_REGRESSION_RESULT(3);
+      WRITE_AUTO_REGRESSION_RESULT(4);
+      WRITE_AUTO_REGRESSION_RESULT(5);
+      WRITE_AUTO_REGRESSION_RESULT(6);
+      WRITE_AUTO_REGRESSION_RESULT(7);
+
+      x += 8;
+      luma_x += 8 << subsampling_x;
+    } while (x < chroma_width - kAutoRegressionBorder - chroma_width_remainder);
+
+    // This is the "final iteration" of the above loop over width. We fill in
+    // the remainder of the width, which is less than 8.
+    int pos = 0;
+    int32x4x2_t sum_u;
+    int32x4x2_t sum_v;
+    SetZero(&sum_u);
+    SetZero(&sum_v);
+
+    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+         ++delta_row) {
+      // These loads may overflow to the next row, but they are never called on
+      // the final row of a grain block. Therefore, they will never exceed the
+      // block boundaries.
+      const int16x8_t u_grain_lo = GetSignedSource8(
+          u_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+      const int16x8_t u_grain_hi =
+          GetSignedSource8(u_grain + x + delta_row * chroma_width -
+                           auto_regression_coeff_lag + 8);
+      const int16x8_t v_grain_lo = GetSignedSource8(
+          v_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+      const int16x8_t v_grain_hi =
+          GetSignedSource8(v_grain + x + delta_row * chroma_width -
+                           auto_regression_coeff_lag + 8);
+
+      ACCUMULATE_WEIGHTED_GRAIN(0);
+      ACCUMULATE_WEIGHTED_GRAIN(1);
+      ACCUMULATE_WEIGHTED_GRAIN(2);
+      // The horizontal |auto_regression_coeff_lag| loop is replaced with
+      // if-statements to give vextq_s16 an immediate param.
+      if (auto_regression_coeff_lag > 1) {
+        ACCUMULATE_WEIGHTED_GRAIN(3);
+        ACCUMULATE_WEIGHTED_GRAIN(4);
+      }
+      if (auto_regression_coeff_lag > 2) {
+        assert(auto_regression_coeff_lag == 3);
+        ACCUMULATE_WEIGHTED_GRAIN(5);
+        ACCUMULATE_WEIGHTED_GRAIN(6);
+      }
+    }
+
+    if (use_luma) {
+      const int16x8_t luma = GetSubsampledLuma(
+          luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+      // Luma samples get the final coefficient in the formula, but are best
+      // computed all at once before the final row.
+      const int coeff_u =
+          params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+      const int coeff_v =
+          params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+      sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+      sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+      sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+      sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+    }
+
+    WRITE_AUTO_REGRESSION_RESULT(0);
+    WRITE_AUTO_REGRESSION_RESULT(1);
+    WRITE_AUTO_REGRESSION_RESULT(2);
+    WRITE_AUTO_REGRESSION_RESULT(3);
+    if (chroma_width_remainder == 6) {
+      WRITE_AUTO_REGRESSION_RESULT(4);
+      WRITE_AUTO_REGRESSION_RESULT(5);
+    }
+
+    luma_grain += kLumaWidth << subsampling_y;
+    u_grain += chroma_width;
+    v_grain += chroma_width;
+  } while (++y < chroma_height);
+#undef ACCUMULATE_WEIGHTED_GRAIN
+#undef WRITE_AUTO_REGRESSION_RESULT
+}
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag>
+void ApplyAutoRegressiveFilterToLumaGrain_NEON(const FilmGrainParams& params,
+                                               void* luma_grain_buffer) {
+  static_assert(auto_regression_coeff_lag > 0, "");
+  const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
+  const uint8_t auto_regression_shift = params.auto_regression_shift;
+
+  int y = kAutoRegressionBorder;
+  auto* luma_grain =
+      static_cast<GrainType*>(luma_grain_buffer) + kLumaWidth * y;
+  do {
+    // Each row is computed 8 values at a time in the following loop. At the
+    // end of the loop, 4 values remain to write. They are given a special
+    // reduced iteration at the end.
+    int x = kAutoRegressionBorder;
+    do {
+      int pos = 0;
+      int32x4x2_t sum;
+      SetZero(&sum);
+      for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+           ++delta_row) {
+        // These loads may overflow to the next row, but they are never called
+        // on the final row of a grain block. Therefore, they will never exceed
+        // the block boundaries.
+        const int16x8_t src_grain_lo =
+            GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+                             auto_regression_coeff_lag);
+        const int16x8_t src_grain_hi =
+            GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+                             auto_regression_coeff_lag + 8);
+
+        // A pictorial representation of the auto-regressive filter for
+        // various values of params.auto_regression_coeff_lag. The letter 'O'
+        // represents the current sample. (The filter always operates on the
+        // current sample with filter coefficient 1.) The letters 'X'
+        // represent the neighboring samples that the filter operates on, below
+        // their corresponding "offset" number.
+        //
+        // params.auto_regression_coeff_lag == 3:
+        //   0 1 2 3 4 5 6
+        //   X X X X X X X
+        //   X X X X X X X
+        //   X X X X X X X
+        //   X X X O
+        // params.auto_regression_coeff_lag == 2:
+        //     0 1 2 3 4
+        //     X X X X X
+        //     X X X X X
+        //     X X O
+        // params.auto_regression_coeff_lag == 1:
+        //       0 1 2
+        //       X X X
+        //       X O
+        // params.auto_regression_coeff_lag == 0:
+        //         O
+        // The function relies on the caller to skip the call in the 0 lag
+        // case.
+
+#define ACCUMULATE_WEIGHTED_GRAIN(offset)                           \
+  sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
+                                        auto_regression_coeff_y[pos++], sum)
+        ACCUMULATE_WEIGHTED_GRAIN(0);
+        ACCUMULATE_WEIGHTED_GRAIN(1);
+        ACCUMULATE_WEIGHTED_GRAIN(2);
+        // The horizontal |auto_regression_coeff_lag| loop is replaced with
+        // if-statements to give vextq_s16 an immediate param.
+        if (auto_regression_coeff_lag > 1) {
+          ACCUMULATE_WEIGHTED_GRAIN(3);
+          ACCUMULATE_WEIGHTED_GRAIN(4);
+        }
+        if (auto_regression_coeff_lag > 2) {
+          assert(auto_regression_coeff_lag == 3);
+          ACCUMULATE_WEIGHTED_GRAIN(5);
+          ACCUMULATE_WEIGHTED_GRAIN(6);
+        }
+      }
+      // At this point in the filter, the source addresses and destination
+      // addresses overlap. Because this is an auto-regressive filter, the
+      // higher lanes cannot be computed without the results of the lower lanes.
+      // Each call to WriteFinalAutoRegression incorporates preceding values
+      // on the final row, and writes a single sample. This allows the next
+      // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane)                             \
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
+      luma_grain + x, sum, auto_regression_coeff_y, pos,               \
+      auto_regression_shift)
+
+      WRITE_AUTO_REGRESSION_RESULT(0);
+      WRITE_AUTO_REGRESSION_RESULT(1);
+      WRITE_AUTO_REGRESSION_RESULT(2);
+      WRITE_AUTO_REGRESSION_RESULT(3);
+      WRITE_AUTO_REGRESSION_RESULT(4);
+      WRITE_AUTO_REGRESSION_RESULT(5);
+      WRITE_AUTO_REGRESSION_RESULT(6);
+      WRITE_AUTO_REGRESSION_RESULT(7);
+      x += 8;
+      // Leave the final four pixels for the special iteration below.
+    } while (x < kLumaWidth - kAutoRegressionBorder - 4);
+
+    // Final 4 pixels in the row.
+    int pos = 0;
+    int32x4x2_t sum;
+    SetZero(&sum);
+    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+         ++delta_row) {
+      const int16x8_t src_grain_lo = GetSignedSource8(
+          luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
+      const int16x8_t src_grain_hi =
+          GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+                           auto_regression_coeff_lag + 8);
+
+      ACCUMULATE_WEIGHTED_GRAIN(0);
+      ACCUMULATE_WEIGHTED_GRAIN(1);
+      ACCUMULATE_WEIGHTED_GRAIN(2);
+      // The horizontal |auto_regression_coeff_lag| loop is replaced with
+      // if-statements to give vextq_s16 an immediate param.
+      if (auto_regression_coeff_lag > 1) {
+        ACCUMULATE_WEIGHTED_GRAIN(3);
+        ACCUMULATE_WEIGHTED_GRAIN(4);
+      }
+      if (auto_regression_coeff_lag > 2) {
+        assert(auto_regression_coeff_lag == 3);
+        ACCUMULATE_WEIGHTED_GRAIN(5);
+        ACCUMULATE_WEIGHTED_GRAIN(6);
+      }
+    }
+    // delta_row == 0
+    WRITE_AUTO_REGRESSION_RESULT(0);
+    WRITE_AUTO_REGRESSION_RESULT(1);
+    WRITE_AUTO_REGRESSION_RESULT(2);
+    WRITE_AUTO_REGRESSION_RESULT(3);
+    luma_grain += kLumaWidth;
+  } while (++y < kLumaHeight);
+
+#undef WRITE_AUTO_REGRESSION_RESULT
+#undef ACCUMULATE_WEIGHTED_GRAIN
+}
+
+template <int bitdepth>
+void InitializeScalingLookupTable_NEON(int num_points,
+                                       const uint8_t point_value[],
+                                       const uint8_t point_scaling[],
+                                       int16_t* scaling_lut,
+                                       const int scaling_lut_length) {
+  static_assert(bitdepth < kBitdepth12,
+                "NEON Scaling lookup table only supports 8bpp and 10bpp.");
+  if (num_points == 0) {
+    memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length);
+    return;
+  }
+  static_assert(sizeof(scaling_lut[0]) == 2, "");
+  Memset(scaling_lut, point_scaling[0],
+         (static_cast<int>(point_value[0]) + 1) << (bitdepth - kBitdepth8));
+  const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000));
+  const int32x4_t rounding = vdupq_n_s32(32768);
+  for (int i = 0; i < num_points - 1; ++i) {
+    const int delta_y = point_scaling[i + 1] - point_scaling[i];
+    const int delta_x = point_value[i + 1] - point_value[i];
+    // |delta| corresponds to b, for the function y = a + b*x.
+    const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+    const int delta4 = delta << 2;
+    // vmull_n_u16 will not work here because |delta| typically exceeds the
+    // range of uint16_t.
+    int32x4_t upscaled_points0 = vmlaq_n_s32(rounding, steps, delta);
+    const int32x4_t line_increment4 = vdupq_n_s32(delta4);
+    // Get the second set of 4 points by adding 4 steps to the first set.
+    int32x4_t upscaled_points1 = vaddq_s32(upscaled_points0, line_increment4);
+    // We obtain the next set of 8 points by adding 8 steps to each of the
+    // current 8 points.
+    const int32x4_t line_increment8 = vshlq_n_s32(line_increment4, 1);
+    const int16x8_t base_point = vdupq_n_s16(point_scaling[i]);
+    int x = 0;
+    // Derive and write 8 values (or 32 values, for 10bpp).
+    do {
+      const int16x4_t interp_points0 = vshrn_n_s32(upscaled_points0, 16);
+      const int16x4_t interp_points1 = vshrn_n_s32(upscaled_points1, 16);
+      const int16x8_t interp_points =
+          vcombine_s16(interp_points0, interp_points1);
+      // The spec guarantees that the max value of |point_value[i]| + x is 255.
+      // Writing 8 values starting at the final table entry leaves 7 values of
+      // required padding.
+      const int16x8_t full_interp = vaddq_s16(interp_points, base_point);
+      const int x_base = (point_value[i] + x) << (bitdepth - kBitdepth8);
+      if (bitdepth == kBitdepth10) {
+        const int16x8_t next_val = vaddq_s16(
+            base_point,
+            vdupq_n_s16((vgetq_lane_s32(upscaled_points1, 3) + delta) >> 16));
+        const int16x8_t start = full_interp;
+        const int16x8_t end = vextq_s16(full_interp, next_val, 1);
+        // lut[i << 2] = start;
+        // lut[(i << 2) + 1] = start + RightShiftWithRounding(end - start, 2)
+        // lut[(i << 2) + 2] = start +
+        //                      RightShiftWithRounding(2 * (end - start), 2)
+        // lut[(i << 2) + 3] = start +
+        //                      RightShiftWithRounding(3 * (end - start), 2)
+        const int16x8_t delta = vsubq_s16(end, start);
+        const int16x8_t double_delta = vshlq_n_s16(delta, 1);
+        const int16x8_t delta2 = vrshrq_n_s16(double_delta, 2);
+        const int16x8_t delta3 =
+            vrshrq_n_s16(vaddq_s16(delta, double_delta), 2);
+        const int16x8x4_t result = {
+            start, vaddq_s16(start, vrshrq_n_s16(delta, 2)),
+            vaddq_s16(start, delta2), vaddq_s16(start, delta3)};
+        Store4QMsanS16(&scaling_lut[x_base], result);
+      } else {
+        vst1q_s16(&scaling_lut[x_base], full_interp);
+      }
+      upscaled_points0 = vaddq_s32(upscaled_points0, line_increment8);
+      upscaled_points1 = vaddq_s32(upscaled_points1, line_increment8);
+      x += 8;
+    } while (x < delta_x);
+  }
+  const int16_t last_point_value = point_value[num_points - 1];
+  const int x_base = last_point_value << (bitdepth - kBitdepth8);
+  Memset(&scaling_lut[x_base], point_scaling[num_points - 1],
+         scaling_lut_length - x_base);
+  if (bitdepth == kBitdepth10 && x_base > 0) {
+    const int start = scaling_lut[x_base - 4];
+    const int end = point_scaling[num_points - 1];
+    const int delta = end - start;
+    scaling_lut[x_base - 3] = start + RightShiftWithRounding(delta, 2);
+    scaling_lut[x_base - 2] = start + RightShiftWithRounding(2 * delta, 2);
+    scaling_lut[x_base - 1] = start + RightShiftWithRounding(3 * delta, 2);
+  }
+}
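+
+// The slope computation above is fixed-point Q16 arithmetic:
+// delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x), i.e. delta_y times
+// the rounded value of 65536 / delta_x. For example, interpolating from point
+// (0, 0) to (64, 32) gives delta = 32 * 1024 = 32768, a slope of 0.5 in Q16,
+// so each lane of |steps| advances the interpolated value by half a scaling
+// unit before the final >> 16 in vshrn_n_s32.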
+
+inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
+                       const int16x8_t high) {
+  const int16x8_t clipped_to_ceiling = vminq_s16(high, value);
+  return vmaxq_s16(low, clipped_to_ceiling);
+}
+
+template <int bitdepth, typename Pixel>
+inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
+                                   const Pixel* source,
+                                   const int valid_range = 8) {
+  int16_t start_vals[8];
+  static_assert(bitdepth <= kBitdepth10,
+                "NEON Film Grain is not yet implemented for 12bpp.");
+#if LIBGAV1_MSAN
+  if (valid_range < 8) memset(start_vals, 0, sizeof(start_vals));
+#endif
+  for (int i = 0; i < valid_range; ++i) {
+    assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
+    start_vals[i] = scaling_lut[source[i]];
+  }
+  return vld1q_s16(start_vals);
+}
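+
+// The scalar loop above is effectively a table gather; NEON has no memory
+// gather instruction, so each lane is looked up individually through a small
+// stack buffer and then reloaded as one vector.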
+
+template <int bitdepth>
+inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
+                            const int16x8_t scaling_shift_vect) {
+  if (bitdepth == kBitdepth8) {
+    const int16x8_t upscaled_noise = vmulq_s16(noise, scaling);
+    return vrshlq_s16(upscaled_noise, scaling_shift_vect);
+  }
+  // Scaling shift is in the range [8, 11]. The doubling multiply returning high
+  // half is equivalent to a right shift by 15, so |scaling_shift_vect| should
+  // provide a left shift equal to 15 - s, where s is the original shift
+  // parameter.
+  const int16x8_t scaling_up = vshlq_s16(scaling, scaling_shift_vect);
+  return vqrdmulhq_s16(noise, scaling_up);
+}
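+
+// In the 10bpp path above, vqrdmulhq_s16 computes
+// (2 * a * b + (1 << 15)) >> 16. With b = scaling << (15 - s), this equals
+// (noise * scaling + (1 << (s - 1))) >> s, i.e. a rounded right shift of the
+// upscaled noise by the original shift s, without widening to 32 bits.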
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_NEON(
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+    int scaling_shift, int width, int height, int start_height,
+    const int16_t* scaling_lut_y, const void* source_plane_y,
+    ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+  dest_stride_y /= sizeof(Pixel);
+  const int16x8_t floor = vdupq_n_s16(min_value);
+  const int16x8_t ceiling = vdupq_n_s16(max_luma);
+  // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+  // for 16 bit signed integers. In higher bitdepths, however, we have to
+  // expand to 32 to protect the sign bit.
+  const int16x8_t scaling_shift_vect = vdupq_n_s16(
+      (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);
+
+  const int safe_width = width & ~15;
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x + 8 <= safe_width; x += 8) {
+      // This operation on the unsigned input is safe in 8bpp because the vector
+      // is widened before it is reinterpreted.
+      const int16x8_t orig0 = GetSignedSource8(&in_y_row[x]);
+      const int16x8_t scaling0 =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+      int16x8_t noise =
+          GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+      noise = ScaleNoise<bitdepth>(noise, scaling0, scaling_shift_vect);
+      const int16x8_t combined0 = vaddq_s16(orig0, noise);
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+      // clipping with vqmovun_s16, but it's not likely to be worth copying the
+      // function for just that case, though the gain would be very small.
+      StoreUnsigned8(&out_y_row[x],
+                     vreinterpretq_u16_s16(Clip3(combined0, floor, ceiling)));
+      x += 8;
+
+      // This operation on the unsigned input is safe in 8bpp because the vector
+      // is widened before it is reinterpreted.
+      const int16x8_t orig1 = GetSignedSource8(&in_y_row[x]);
+      const int16x8_t scaling1 =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+      noise = GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+      noise = ScaleNoise<bitdepth>(noise, scaling1, scaling_shift_vect);
+      const int16x8_t combined1 = vaddq_s16(orig1, noise);
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+      // clipping with vqmovun_s16, but it's not likely to be worth copying the
+      // function for just that case, though the gain would be very small.
+      StoreUnsigned8(&out_y_row[x],
+                     vreinterpretq_u16_s16(Clip3(combined1, floor, ceiling)));
+    }
+
+    if (x < width) {
+      assert(width - x < 16);
+      if (x < width - 8) {
+        const int16x8_t orig = GetSignedSource8(&in_y_row[x]);
+        const int16x8_t scaling =
+            GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+        int16x8_t noise =
+            GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+        noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+        const int16x8_t combined = vaddq_s16(orig, noise);
+        // In 8bpp, when params_.clip_to_restricted_range == false, we can
+        // replace clipping with vqmovun_s16, but it's not likely to be worth
+        // copying the function for just that case, though the gain would be
+        // very small.
+        StoreUnsigned8(&out_y_row[x],
+                       vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+        x += 8;
+      }
+      const int valid_range_pixels = width - x;
+      const int valid_range_bytes = (width - x) * sizeof(in_y_row[0]);
+      const int16x8_t orig =
+          GetSignedSource8Msan(&in_y_row[x], valid_range_bytes);
+      const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
+          scaling_lut_y, &in_y_row[x], valid_range_pixels);
+      int16x8_t noise =
+          GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+      noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+
+      const int16x8_t combined = vaddq_s16(orig, noise);
+      StoreUnsigned8(&out_y_row[x],
+                     vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+    }
+    in_y_row += source_stride_y;
+    out_y_row += dest_stride_y;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+inline int16x8_t BlendChromaValsWithCfl(
+    const Pixel* LIBGAV1_RESTRICT chroma_cursor,
+    const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
+    const int16x8_t scaling, const int16x8_t scaling_shift_vect) {
+  const int16x8_t orig = GetSignedSource8(chroma_cursor);
+  int16x8_t noise = GetSignedSource8(noise_image_cursor);
+  noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+  return vaddq_s16(orig, noise);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
+    const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, int scaling_shift,
+    const int16_t* LIBGAV1_RESTRICT scaling_lut,
+    const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+    const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
+    Pixel* out_chroma_row, ptrdiff_t dest_stride) {
+  const int16x8_t floor = vdupq_n_s16(min_value);
+  const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+  Pixel luma_buffer[16];
+  // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+  // for 16 bit signed integers. In higher bitdepths, however, we have to
+  // expand to 32 bits to protect the sign bit.
+  const int16x8_t scaling_shift_vect = vdupq_n_s16(
+      (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);
+
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  const int safe_chroma_width = chroma_width & ~7;
+
+  // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
+  // in GetScalingFactors.
+  Pixel average_luma_buffer[8];
+  assert(start_height % 2 == 0);
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x + 8 <= safe_chroma_width; x += 8) {
+      const int luma_x = x << subsampling_x;
+      const uint16x8_t average_luma =
+          GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+      StoreUnsigned8(average_luma_buffer, average_luma);
+
+      const int16x8_t scaling =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+      const int16x8_t blended =
+          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+              &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
+              scaling_shift_vect);
+
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+      // clipping with vqmovun_s16, but it's not likely to be worth copying the
+      // function for just that case.
+      StoreUnsigned8(&out_chroma_row[x],
+                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+    }
+
+    if (x < chroma_width) {
+      const int luma_x = x << subsampling_x;
+      const int valid_range_pixels = width - luma_x;
+      const int valid_range_chroma_pixels = chroma_width - x;
+      const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+      assert(valid_range_pixels < 16);
+      memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
+      luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+      const uint16x8_t average_luma = GetAverageLumaMsan(
+          luma_buffer, subsampling_x, valid_range_chroma_pixels << 1);
+
+      StoreUnsigned8(average_luma_buffer, average_luma);
+
+      const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
+          scaling_lut, average_luma_buffer, valid_range_chroma_pixels);
+      const int16x8_t blended =
+          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+              &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
+              scaling_shift_vect);
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+      // clipping with vqmovun_s16, but it's not likely to be worth copying the
+      // function for just that case.
+      StoreUnsigned8(&out_chroma_row[x],
+                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+    }
+
+    in_y_row += source_stride_y << subsampling_y;
+    in_chroma_row += source_stride_chroma;
+    out_chroma_row += dest_stride;
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_NEON(
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+
+  const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+  source_stride_uv /= sizeof(Pixel);
+  auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+  dest_stride_uv /= sizeof(Pixel);
+  // Looping over one plane at a time is faster in higher resolutions, despite
+  // re-computing luma.
+  BlendChromaPlaneWithCfl_NEON<bitdepth, GrainType, Pixel>(
+      noise_image[plane], min_value, max_chroma, width, height, start_height,
+      subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
+      source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline int16x8_t BlendChromaValsNoCfl(
+    const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
+    const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
+    const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
+    const int16x8_t& offset, int luma_multiplier, int chroma_multiplier,
+    bool restrict_scaling_lookup, int valid_range_pixels = 0) {
+  uint8_t merged_buffer[8];
+  const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier);
+  const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier);
+  // Maximum value of |combined| is 127*255 = 0x7E81.
+  const int16x8_t combined = vhaddq_s16(weighted_luma, weighted_chroma);
+  // Maximum value of |offset| is (255 << 5) = 0x1FE0.
+  // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required.
+  const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4);
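+  // Net effect, ignoring truncation in the halving adds: the weighted sum is
+  // divided by 64 and |offset| by 32, i.e. merged = (sum >> 6) +
+  // chroma_offset, with vqshrun_n_s16 clamping the result to [0, 255].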
+  vst1_u8(merged_buffer, merged);
+
+  const int16x8_t scaling =
+      restrict_scaling_lookup
+          ? GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer,
+                                                   valid_range_pixels)
+          : GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
+  int16x8_t noise = GetSignedSource8(noise_image_cursor);
+  noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift_vect);
+  return vaddq_s16(orig, noise);
+}
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
+    const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, int scaling_shift, int chroma_offset,
+    int chroma_multiplier, int luma_multiplier,
+    const int16_t* LIBGAV1_RESTRICT scaling_lut,
+    const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+    const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
+    uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
+  const int16x8_t floor = vdupq_n_s16(min_value);
+  const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+  // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+  // for 16 bit signed integers. In higher bitdepths, however, we have to
+  // expand to 32 bits to protect the sign bit.
+  const int16x8_t scaling_shift_vect = vdupq_n_s16(-scaling_shift);
+
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  const int safe_chroma_width = chroma_width & ~7;
+  uint8_t luma_buffer[16];
+  const int16x8_t offset = vdupq_n_s16(chroma_offset << 5);
+
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x + 8 <= safe_chroma_width; x += 8) {
+      const int luma_x = x << subsampling_x;
+      const int valid_range_chroma_pixels = chroma_width - x;
+
+      const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
+      const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
+          &in_y_row[luma_x], subsampling_x, valid_range_chroma_pixels << 1));
+      const int16x8_t blended = BlendChromaValsNoCfl(
+          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+          average_luma, scaling_shift_vect, offset, luma_multiplier,
+          chroma_multiplier, /*restrict_scaling_lookup=*/false);
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can
+      // replace clipping with vqmovun_s16, but the gain would be small.
+      StoreUnsigned8(&out_chroma_row[x],
+                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+    }
+
+    if (x < chroma_width) {
+      // Begin right edge iteration. Same as the normal iterations, but the
+      // |average_luma| computation requires a duplicated luma value at the
+      // end.
+      const int luma_x = x << subsampling_x;
+      const int valid_range_pixels = width - luma_x;
+      const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+      assert(valid_range_pixels < 16);
+      memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
+      luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+      const int valid_range_chroma_pixels = chroma_width - x;
+
+      const int16x8_t orig_chroma =
+          GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_pixels);
+      const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
+          luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
+      const int16x8_t blended = BlendChromaValsNoCfl(
+          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+          average_luma, scaling_shift_vect, offset, luma_multiplier,
+          chroma_multiplier, /*restrict_scaling_lookup=*/true,
+          valid_range_chroma_pixels);
+      StoreUnsigned8(&out_chroma_row[x],
+                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+      // End of right edge iteration.
+    }
+
+    in_y_row += source_stride_y << subsampling_y;
+    in_chroma_row += source_stride_chroma;
+    out_chroma_row += dest_stride;
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma8bpp_NEON(
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  assert(plane == kPlaneU || plane == kPlaneV);
+  const auto* noise_image =
+      static_cast<const Array2D<int8_t>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
+  const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
+  auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
+
+  const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+  const int luma_multiplier =
+      (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+  const int multiplier =
+      (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+  BlendChromaPlane8bpp_NEON(noise_image[plane], min_value, max_chroma, width,
+                            height, start_height, subsampling_x, subsampling_y,
+                            params.chroma_scaling, offset, multiplier,
+                            luma_multiplier, scaling_lut, in_y, source_stride_y,
+                            in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+inline void WriteOverlapLine8bpp_NEON(
+    const int8_t* LIBGAV1_RESTRICT noise_stripe_row,
+    const int8_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+    const int8x8_t grain_coeff, const int8x8_t old_coeff,
+    int8_t* LIBGAV1_RESTRICT noise_image_row) {
+  int x = 0;
+  do {
+    // Note that these reads may exceed noise_stripe_row's width by up to 7
+    // bytes.
+    const int8x8_t source_grain = vld1_s8(noise_stripe_row + x);
+    const int8x8_t source_old = vld1_s8(noise_stripe_row_prev + x);
+    const int16x8_t weighted_grain = vmull_s8(grain_coeff, source_grain);
+    const int16x8_t grain = vmlal_s8(weighted_grain, old_coeff, source_old);
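+    // The coefficient pairs sum to more than 32 (17 + 27 and 22 + 23), so the
+    // rounding narrow shift below must saturate to keep the blended grain
+    // within the int8 range.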
+    // Note that this write may exceed noise_image_row's width by up to 7 bytes.
+    vst1_s8(noise_image_row + x, vqrshrn_n_s16(grain, 5));
+    x += 8;
+  } while (x < plane_width);
+}
+
+void ConstructNoiseImageOverlap8bpp_NEON(
+    const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
+    int subsampling_x, int subsampling_y,
+    void* LIBGAV1_RESTRICT noise_image_buffer) {
+  const auto* noise_stripes =
+      static_cast<const Array2DView<int8_t>*>(noise_stripes_buffer);
+  auto* noise_image = static_cast<Array2D<int8_t>*>(noise_image_buffer);
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  const int plane_height = (height + subsampling_y) >> subsampling_y;
+  const int stripe_height = 32 >> subsampling_y;
+  const int stripe_mask = stripe_height - 1;
+  int y = stripe_height;
+  int luma_num = 1;
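+  // Noise stripes carry extra overlap rows: per the indexing below, rows 32
+  // and 33 of each stripe blend with rows 0 and 1 of the next stripe (row 16
+  // with row 0 when vertically subsampled).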
+  if (subsampling_y == 0) {
+    const int8x8_t first_row_grain_coeff = vdup_n_s8(17);
+    const int8x8_t first_row_old_coeff = vdup_n_s8(27);
+    const int8x8_t second_row_grain_coeff = first_row_old_coeff;
+    const int8x8_t second_row_old_coeff = first_row_grain_coeff;
+    for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+      const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+      const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      WriteOverlapLine8bpp_NEON(
+          noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+          first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+      WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
+                                &noise_stripe_prev[(32 + 1) * plane_width],
+                                plane_width, second_row_grain_coeff,
+                                second_row_old_coeff, (*noise_image)[y + 1]);
+    }
+    // Either one partial stripe remains (remaining_height > 0),
+    // OR image is less than one stripe high (remaining_height < 0),
+    // OR all stripes are completed (remaining_height == 0).
+    const int remaining_height = plane_height - y;
+    if (remaining_height <= 0) {
+      return;
+    }
+    const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+    const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+    WriteOverlapLine8bpp_NEON(
+        noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+        first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+    if (remaining_height > 1) {
+      WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
+                                &noise_stripe_prev[(32 + 1) * plane_width],
+                                plane_width, second_row_grain_coeff,
+                                second_row_old_coeff, (*noise_image)[y + 1]);
+    }
+  } else {  // subsampling_y == 1
+    const int8x8_t first_row_grain_coeff = vdup_n_s8(22);
+    const int8x8_t first_row_old_coeff = vdup_n_s8(23);
+    for (; y < plane_height; ++luma_num, y += stripe_height) {
+      const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+      const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      WriteOverlapLine8bpp_NEON(
+          noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
+          first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+    }
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+
+  // LumaAutoRegressionFunc
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 1>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 2>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 3>;
+
+  // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag]
+  // Chroma autoregression should never be called when lag is 0 and use_luma
+  // is false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3, true>;
+
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap8bpp_NEON;
+
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_NEON<kBitdepth8>;
+
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_NEON<kBitdepth8, int8_t, uint8_t>;
+  dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_NEON;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth8, int8_t, uint8_t>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline void WriteOverlapLine10bpp_NEON(
+    const int16_t* LIBGAV1_RESTRICT noise_stripe_row,
+    const int16_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+    const int16x8_t grain_coeff, const int16x8_t old_coeff,
+    int16_t* LIBGAV1_RESTRICT noise_image_row) {
+  int x = 0;
+  do {
+    // Note that these reads may exceed noise_stripe_row's width by up to 7
+    // values.
+    const int16x8_t source_grain = vld1q_s16(noise_stripe_row + x);
+    const int16x8_t source_old = vld1q_s16(noise_stripe_row_prev + x);
+    // Maximum product is 511 * 27 = 0x35E5.
+    const int16x8_t weighted_grain = vmulq_s16(grain_coeff, source_grain);
+    // Maximum sum is 511 * (22 + 23) = 0x59D3.
+    const int16x8_t grain_sum =
+        vmlaq_s16(weighted_grain, old_coeff, source_old);
+    // Note that this write may exceed noise_image_row's width by up to 7
+    // values.
+    const int16x8_t grain = Clip3S16(vrshrq_n_s16(grain_sum, 5),
+                                     vdupq_n_s16(GetGrainMin<kBitdepth10>()),
+                                     vdupq_n_s16(GetGrainMax<kBitdepth10>()));
+    vst1q_s16(noise_image_row + x, grain);
+    x += 8;
+  } while (x < plane_width);
+}
+
+void ConstructNoiseImageOverlap10bpp_NEON(
+    const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
+    int subsampling_x, int subsampling_y,
+    void* LIBGAV1_RESTRICT noise_image_buffer) {
+  const auto* noise_stripes =
+      static_cast<const Array2DView<int16_t>*>(noise_stripes_buffer);
+  auto* noise_image = static_cast<Array2D<int16_t>*>(noise_image_buffer);
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  const int plane_height = (height + subsampling_y) >> subsampling_y;
+  const int stripe_height = 32 >> subsampling_y;
+  const int stripe_mask = stripe_height - 1;
+  int y = stripe_height;
+  int luma_num = 1;
+  if (subsampling_y == 0) {
+    const int16x8_t first_row_grain_coeff = vdupq_n_s16(17);
+    const int16x8_t first_row_old_coeff = vdupq_n_s16(27);
+    const int16x8_t second_row_grain_coeff = first_row_old_coeff;
+    const int16x8_t second_row_old_coeff = first_row_grain_coeff;
+    for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+      const int16_t* noise_stripe = (*noise_stripes)[luma_num];
+      const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      WriteOverlapLine10bpp_NEON(
+          noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+          first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+      WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width],
+                                 &noise_stripe_prev[(32 + 1) * plane_width],
+                                 plane_width, second_row_grain_coeff,
+                                 second_row_old_coeff, (*noise_image)[y + 1]);
+    }
+    // Either one partial stripe remains (remaining_height > 0),
+    // OR image is less than one stripe high (remaining_height < 0),
+    // OR all stripes are completed (remaining_height == 0).
+    const int remaining_height = plane_height - y;
+    if (remaining_height <= 0) {
+      return;
+    }
+    const int16_t* noise_stripe = (*noise_stripes)[luma_num];
+    const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+    WriteOverlapLine10bpp_NEON(
+        noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+        first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+    if (remaining_height > 1) {
+      WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width],
+                                 &noise_stripe_prev[(32 + 1) * plane_width],
+                                 plane_width, second_row_grain_coeff,
+                                 second_row_old_coeff, (*noise_image)[y + 1]);
+    }
+  } else {  // subsampling_y == 1
+    const int16x8_t first_row_grain_coeff = vdupq_n_s16(22);
+    const int16x8_t first_row_old_coeff = vdupq_n_s16(23);
+    for (; y < plane_height; ++luma_num, y += stripe_height) {
+      const int16_t* noise_stripe = (*noise_stripes)[luma_num];
+      const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      WriteOverlapLine10bpp_NEON(
+          noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
+          first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+    }
+  }
+}
+
+inline int16x8_t BlendChromaValsNoCfl(
+    const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
+    const int16_t* LIBGAV1_RESTRICT noise_image_cursor,
+    const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
+    const int32x4_t& offset, int luma_multiplier, int chroma_multiplier,
+    bool restrict_scaling_lookup, int valid_range_pixels = 0) {
+  uint16_t merged_buffer[8];
+  const int32x4_t weighted_luma_low =
+      vmull_n_s16(vget_low_s16(average_luma), luma_multiplier);
+  const int32x4_t weighted_luma_high =
+      vmull_n_s16(vget_high_s16(average_luma), luma_multiplier);
+  // Maximum value of combined is 127 * 1023 = 0x1FB81.
+  const int32x4_t combined_low =
+      vmlal_n_s16(weighted_luma_low, vget_low_s16(orig), chroma_multiplier);
+  const int32x4_t combined_high =
+      vmlal_n_s16(weighted_luma_high, vget_high_s16(orig), chroma_multiplier);
+  // Maximum value of offset is (255 << 8) = 0xFF00. Offset may be negative.
+  const uint16x4_t merged_low =
+      vqshrun_n_s32(vaddq_s32(offset, combined_low), 6);
+  const uint16x4_t merged_high =
+      vqshrun_n_s32(vaddq_s32(offset, combined_high), 6);
+  const uint16x8_t max_pixel = vdupq_n_u16((1 << kBitdepth10) - 1);
+  vst1q_u16(merged_buffer,
+            vminq_u16(vcombine_u16(merged_low, merged_high), max_pixel));
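+  // Clamping to the 10-bit maximum keeps |merged_buffer| within the scaling
+  // LUT's index range for the lookups below.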
+  const int16x8_t scaling =
+      restrict_scaling_lookup
+          ? GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer,
+                                                     valid_range_pixels)
+          : GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut,
+                                                     merged_buffer);
+  const int16x8_t noise = GetSignedSource8(noise_image_cursor);
+  const int16x8_t scaled_noise =
+      ScaleNoise<kBitdepth10>(noise, scaling, scaling_shift_vect);
+  return vaddq_s16(orig, scaled_noise);
+}
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
+    const Array2D<int16_t>& noise_image, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, int scaling_shift, int chroma_offset,
+    int chroma_multiplier, int luma_multiplier,
+    const int16_t* LIBGAV1_RESTRICT scaling_lut,
+    const uint16_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+    const uint16_t* in_chroma_row, ptrdiff_t source_stride_chroma,
+    uint16_t* out_chroma_row, ptrdiff_t dest_stride) {
+  const int16x8_t floor = vdupq_n_s16(min_value);
+  const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+  const int16x8_t scaling_shift_vect = vdupq_n_s16(15 - scaling_shift);
+
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  const int safe_chroma_width = chroma_width & ~7;
+  uint16_t luma_buffer[16];
+  // Offset is added before downshifting in order to take advantage of
+  // saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp.
+  const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2));
+
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x + 8 <= safe_chroma_width; x += 8) {
+      const int luma_x = x << subsampling_x;
+      const int16x8_t average_luma = vreinterpretq_s16_u16(
+          GetAverageLuma(&in_y_row[luma_x], subsampling_x));
+      const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
+      const int16x8_t blended = BlendChromaValsNoCfl(
+          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+          average_luma, scaling_shift_vect, offset, luma_multiplier,
+          chroma_multiplier, /*restrict_scaling_lookup=*/false);
+      StoreUnsigned8(&out_chroma_row[x],
+                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+    }
+
+    if (x < chroma_width) {
+      // Begin right edge iteration. Same as the normal iterations, but the
+      // |average_luma| computation requires a duplicated luma value at the
+      // end.
+      const int luma_x = x << subsampling_x;
+      const int valid_range_pixels = width - luma_x;
+      const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+      assert(valid_range_pixels < 16);
+      memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
+      luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+      const int valid_range_chroma_pixels = chroma_width - x;
+      const int valid_range_chroma_bytes =
+          (chroma_width - x) * sizeof(in_chroma_row[0]);
+      const int16x8_t orig_chroma =
+          GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);
+
+      const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
+          luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
+      const int16x8_t blended = BlendChromaValsNoCfl(
+          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+          average_luma, scaling_shift_vect, offset, luma_multiplier,
+          chroma_multiplier, /*restrict_scaling_lookup=*/true,
+          valid_range_chroma_pixels);
+      StoreUnsigned8(&out_chroma_row[x],
+                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+      // End of right edge iteration.
+    }
+
+    in_y_row = AddByteStride(in_y_row, source_stride_y << subsampling_y);
+    in_chroma_row = AddByteStride(in_chroma_row, source_stride_chroma);
+    out_chroma_row = AddByteStride(out_chroma_row, dest_stride);
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma10bpp_NEON(
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  assert(plane == kPlaneU || plane == kPlaneV);
+  const auto* noise_image =
+      static_cast<const Array2D<int16_t>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const uint16_t*>(source_plane_y);
+  const auto* in_uv = static_cast<const uint16_t*>(source_plane_uv);
+  auto* out_uv = static_cast<uint16_t*>(dest_plane_uv);
+
+  const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+  const int luma_multiplier =
+      (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+  const int multiplier =
+      (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+  BlendChromaPlane10bpp_NEON(
+      noise_image[plane], min_value, max_chroma, width, height, start_height,
+      subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
+      luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
+      source_stride_uv, out_uv, dest_stride_uv);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+  // LumaAutoRegressionFunc
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 1>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 2>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 3>;
+
+  // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag]
+  // Chroma autoregression should never be called when lag is 0 and use_luma
+  // is false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 0,
+                                                   true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
+                                                   true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
+                                                   true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
+                                                   true>;
+
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap10bpp_NEON;
+
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_NEON<kBitdepth10>;
+
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace film_grain
+
+void FilmGrainInit_NEON() {
+  film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  film_grain::high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/film_grain_neon.h b/src/dsp/arm/film_grain_neon.h
new file mode 100644 (file)
index 0000000..09596e2
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initialize members of Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseImageOverlap LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc
new file mode 100644 (file)
index 0000000..9b20e29
--- /dev/null
@@ -0,0 +1,523 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Simplified version of intra_edge.cc:kKernels[][]. Only |strength| 1 and 2 are
+// required.
+constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}};
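+// |strength| 3 uses the 5 tap kernel {2, 4, 4, 4, 2}, implemented with shifts
+// in the filter below.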
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+  assert(strength == 1 || strength == 2 || strength == 3);
+  const int kernel_index = strength - 1;
+  auto* const dst_buffer = static_cast<uint8_t*>(buffer);
+
+  // The first element is not written out (but it is input) so the number of
+  // elements written is |size| - 1.
+  if (size == 1) return;
+
+  const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100),
+                                         vcreate_u8(0x0f0e0d0c0b0a0908));
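+  // |v_index| holds the lane indices 0..15; it is compared against the
+  // remainder count when masking the tail store.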
+  // |strength| 1 and 2 use a 3 tap filter.
+  if (strength < 3) {
+    // The last value requires extending the buffer (duplicating
+    // |dst_buffer[size - 1]). Calculate it here to avoid extra processing in
+    // neon.
+    const uint8_t last_val = RightShiftWithRounding(
+        kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+            kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+            kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+        4);
+
+    const uint8x8_t krn1 = vdup_n_u8(kKernelsNEON[kernel_index][1]);
+
+    // The first value we need gets overwritten by the output from the
+    // previous iteration.
+    uint8x16_t src_0 = vld1q_u8(dst_buffer);
+    int i = 1;
+
+    // Process blocks until there are less than 16 values remaining.
+    for (; i < size - 15; i += 16) {
+      // Loading these at the end of the block with |src_0| will read past the
+      // end of |top_row_data[160]|, the source of |buffer|.
+      const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
+      const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
+      uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
+      sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
+      sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
+      uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
+      sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
+      sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);
+
+      const uint8x16_t result =
+          vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+      // Load the next row before overwriting. This loads an extra 15 values
+      // past |size| on the trailing iteration.
+      src_0 = vld1q_u8(dst_buffer + i + 15);
+
+      vst1q_u8(dst_buffer + i, result);
+    }
+
+    // The last output value |last_val| was already calculated so if
+    // |remainder| == 1 then we don't have to do anything.
+    const int remainder = (size - 1) & 0xf;
+    if (remainder > 1) {
+      const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
+      const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
+
+      uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
+      sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
+      sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
+      uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
+      sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
+      sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);
+
+      const uint8x16_t result =
+          vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+      const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+      // Create the overwrite mask: lanes below |remainder| take the filtered
+      // result; lanes at or beyond it keep the original values.
+      const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+      const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result);
+      vst1q_u8(dst_buffer + i, dst_remainder);
+    }
+
+    dst_buffer[size - 1] = last_val;
+    return;
+  }
+
+  assert(strength == 3);
+  // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+  // last two elements require duplicating |buffer[size - 1]|.
+  uint8_t special_vals[3];
+  special_vals[0] = RightShiftWithRounding(
+      (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+          (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+      4);
+  // Clamp index for very small |size| values.
+  const int first_index_min = std::max(size - 4, 0);
+  const int second_index_min = std::max(size - 3, 0);
+  const int third_index_min = std::max(size - 2, 0);
+  special_vals[1] = RightShiftWithRounding(
+      (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
+          (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+          (dst_buffer[size - 1] << 1),
+      4);
+  special_vals[2] = RightShiftWithRounding(
+      (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+          // x << 2 + x << 2 == x << 3
+          (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+      4);
+
+  // The first two values we need get overwritten by the output from the
+  // previous iteration.
+  uint8x16_t src_0 = vld1q_u8(dst_buffer - 1);
+  uint8x16_t src_1 = vld1q_u8(dst_buffer);
+  int i = 1;
+
+  for (; i < size - 15; i += 16) {
+    // Loading these at the end of the block with |src_[01]| will read past
+    // the end of |top_row_data[160]|, the source of |buffer|.
+    const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
+    const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
+    const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
+
+    uint16x8_t sum_lo =
+        vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
+    const uint16x8_t sum_123_lo = vaddw_u8(
+        vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
+    sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));
+
+    uint16x8_t sum_hi =
+        vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
+    const uint16x8_t sum_123_hi =
+        vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
+                 vget_high_u8(src_3));
+    sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));
+
+    const uint8x16_t result =
+        vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+    src_0 = vld1q_u8(dst_buffer + i + 14);
+    src_1 = vld1q_u8(dst_buffer + i + 15);
+
+    vst1q_u8(dst_buffer + i, result);
+  }
+
+  const int remainder = (size - 1) & 0xf;
+  // Like the 3 tap but if there are two remaining values we have already
+  // calculated them.
+  if (remainder > 2) {
+    const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
+    const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
+    const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
+
+    uint16x8_t sum_lo =
+        vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
+    const uint16x8_t sum_123_lo = vaddw_u8(
+        vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
+    sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));
+
+    uint16x8_t sum_hi =
+        vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
+    const uint16x8_t sum_123_hi =
+        vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
+                 vget_high_u8(src_3));
+    sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));
+
+    const uint8x16_t result =
+        vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+    const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+    // Create the overwrite mask: lanes below |remainder| take the filtered
+    // result; lanes at or beyond it keep the original values.
+    const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+    const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result);
+    vst1q_u8(dst_buffer + i, dst_remainder);
+  }
+
+  dst_buffer[1] = special_vals[0];
+  // Avoid overwriting |dst_buffer[0]|.
+  if (size > 2) dst_buffer[size - 2] = special_vals[1];
+  dst_buffer[size - 1] = special_vals[2];
+}
+
+// (-|src0| + |src1| * 9 + |src2| * 9 - |src3|) >> 4
+uint8x8_t Upsample(const uint8x8_t src0, const uint8x8_t src1,
+                   const uint8x8_t src2, const uint8x8_t src3) {
+  const uint16x8_t middle = vmulq_n_u16(vaddl_u8(src1, src2), 9);
+  const uint16x8_t ends = vaddl_u8(src0, src3);
+  const int16x8_t sum =
+      vsubq_s16(vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(ends));
+  return vqrshrun_n_s16(sum, 4);
+}
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+  assert(size % 4 == 0 && size <= 16);
+  auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
+  // This is OK because this value is not read when |size| is 4 or 8, but
+  // writing |pixel_buffer[size]| and then vld()ing it seems to introduce
+  // some latency.
+  pixel_buffer[-2] = pixel_buffer[-1];
+  if (size == 4) {
+    // This uses one load and two vtbl() which is better than 4x Load{Lo,Hi}4().
+    const uint8x8_t src = vld1_u8(pixel_buffer - 1);
+    // The outside values are negated so put those in the same vector.
+    const uint8x8_t src03 = vtbl1_u8(src, vcreate_u8(0x0404030202010000));
+    // Reverse |src1| and |src2| so we can use |src2| for the interleave at the
+    // end.
+    const uint8x8_t src21 = vtbl1_u8(src, vcreate_u8(0x0302010004030201));
+
+    const uint16x8_t middle = vmull_u8(src21, vdup_n_u8(9));
+    const int16x8_t half_sum = vsubq_s16(
+        vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(vmovl_u8(src03)));
+    const int16x4_t sum =
+        vadd_s16(vget_low_s16(half_sum), vget_high_s16(half_sum));
+    const uint8x8_t result = vqrshrun_n_s16(vcombine_s16(sum, sum), 4);
+
+    vst1_u8(pixel_buffer - 1, InterleaveLow8(result, src21));
+    return;
+  }
+  if (size == 8) {
+    // Likewise, one load + multiple vtbls seems preferred to multiple loads.
+    const uint8x16_t src = vld1q_u8(pixel_buffer - 1);
+    const uint8x8_t src0 = VQTbl1U8(src, vcreate_u8(0x0605040302010000));
+    const uint8x8_t src1 = vget_low_u8(src);
+    const uint8x8_t src2 = VQTbl1U8(src, vcreate_u8(0x0807060504030201));
+    const uint8x8_t src3 = VQTbl1U8(src, vcreate_u8(0x0808070605040302));
+
+    const uint8x8x2_t output = {Upsample(src0, src1, src2, src3), src2};
+    vst2_u8(pixel_buffer - 1, output);
+    return;
+  }
+  assert(size == 12 || size == 16);
+  // Extend the input borders to avoid branching later.
+  pixel_buffer[size] = pixel_buffer[size - 1];
+  const uint8x16_t src0 = vld1q_u8(pixel_buffer - 2);
+  const uint8x16_t src1 = vld1q_u8(pixel_buffer - 1);
+  const uint8x16_t src2 = vld1q_u8(pixel_buffer);
+  const uint8x16_t src3 = vld1q_u8(pixel_buffer + 1);
+
+  const uint8x8_t result_lo = Upsample(vget_low_u8(src0), vget_low_u8(src1),
+                                       vget_low_u8(src2), vget_low_u8(src3));
+
+  const uint8x8x2_t output_lo = {result_lo, vget_low_u8(src2)};
+  vst2_u8(pixel_buffer - 1, output_lo);
+
+  const uint8x8_t result_hi = Upsample(vget_high_u8(src0), vget_high_u8(src1),
+                                       vget_high_u8(src2), vget_high_u8(src3));
+
+  if (size == 12) {
+    vst1_u8(pixel_buffer + 15, InterleaveLow8(result_hi, vget_high_u8(src2)));
+  } else /* size == 16 */ {
+    const uint8x8x2_t output_hi = {result_hi, vget_high_u8(src2)};
+    vst2_u8(pixel_buffer + 15, output_hi);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
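+// kRemainderMask[r] has its first |r| lanes set. vbslq_u16() then keeps the
+// filtered result in those lanes and the original values elsewhere.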
+const uint16_t kRemainderMask[8][8] = {
+    {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000},
+};
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+  assert(strength == 1 || strength == 2 || strength == 3);
+  const int kernel_index = strength - 1;
+  auto* const dst_buffer = static_cast<uint16_t*>(buffer);
+
+  // The first element is not written out (but it is input) so the number of
+  // elements written is |size| - 1.
+  if (size == 1) return;
+
+  // |strength| 1 and 2 use a 3 tap filter.
+  if (strength < 3) {
+    // The last value requires extending the buffer (duplicating
+    // |dst_buffer[size - 1]). Calculate it here to avoid extra processing in
+    // neon.
+    const uint16_t last_val = RightShiftWithRounding(
+        kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+            kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+            kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+        4);
+
+    const uint16_t krn0 = kKernelsNEON[kernel_index][0];
+    const uint16_t krn1 = kKernelsNEON[kernel_index][1];
+
+    // The first value we need gets overwritten by the output from the
+    // previous iteration.
+    uint16x8_t src_0 = vld1q_u16(dst_buffer);
+    int i = 1;
+
+    // Process blocks until there are less than 8 values remaining.
+    for (; i < size - 7; i += 8) {
+      // Loading these at the end of the block with |src_0| will read past the
+      // end of |top_row_data[160]|, the source of |buffer|.
+      const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+      const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+      const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+      const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+      const uint16x8_t result = vrshrq_n_u16(sum, 4);
+      // Load the next row before overwriting. This loads an extra 7 values
+      // past |size| on the trailing iteration.
+      src_0 = vld1q_u16(dst_buffer + i + 7);
+      vst1q_u16(dst_buffer + i, result);
+    }
+
+    // The last output value |last_val| was already calculated so if
+    // |remainder| == 1 then we don't have to do anything.
+    const int remainder = (size - 1) & 0x7;
+    if (remainder > 1) {
+      const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+      const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+      const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+      const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+      const uint16x8_t result = vrshrq_n_u16(sum, 4);
+      const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+      const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1);
+      vst1q_u16(dst_buffer + i, dst_remainder);
+    }
+
+    dst_buffer[size - 1] = last_val;
+    return;
+  }
+
+  assert(strength == 3);
+  // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+  // last two elements require duplicating |buffer[size - 1]|.
+  uint16_t special_vals[3];
+  special_vals[0] = RightShiftWithRounding(
+      (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+          (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+      4);
+  // Clamp index for very small |size| values.
+  const int first_index_min = std::max(size - 4, 0);
+  const int second_index_min = std::max(size - 3, 0);
+  const int third_index_min = std::max(size - 2, 0);
+  special_vals[1] = RightShiftWithRounding(
+      (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
+          (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+          (dst_buffer[size - 1] << 1),
+      4);
+  special_vals[2] = RightShiftWithRounding(
+      (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+          // x << 2 + x << 2 == x << 3
+          (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+      4);
+
+  // The first two values we need get overwritten by the output from the
+  // previous iteration.
+  uint16x8_t src_0 = vld1q_u16(dst_buffer - 1);
+  uint16x8_t src_1 = vld1q_u16(dst_buffer);
+  int i = 1;
+
+  for (; i < size - 7; i += 8) {
+    // Loading these at the end of the block with |src_[01]| will read past
+    // the end of |top_row_data[160]|, the source of |buffer|.
+    const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+    const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+    const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+    const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+    const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+    const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+    const uint16x8_t result = vrshrq_n_u16(sum, 4);
+
+    // Load the next values before overwriting.
+    src_0 = vld1q_u16(dst_buffer + i + 6);
+    src_1 = vld1q_u16(dst_buffer + i + 7);
+
+    vst1q_u16(dst_buffer + i, result);
+  }
+
+  const int remainder = (size - 1) & 0x7;
+  // As in the 3 tap case, but if only two values remain they have already
+  // been calculated in |special_vals|.
+  if (remainder > 2) {
+    const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+    const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+    const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+    const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+    const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+    const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+    const uint16x8_t result = vrshrq_n_u16(sum, 4);
+    const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+    const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2);
+    vst1q_u16(dst_buffer + i, dst_remainder);
+  }
+
+  dst_buffer[1] = special_vals[0];
+  // Avoid overwriting |dst_buffer[0]|.
+  if (size > 2) dst_buffer[size - 2] = special_vals[1];
+  dst_buffer[size - 1] = special_vals[2];
+}
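+
+// For reference, a scalar sketch of the strength == 3 path above (an
+// illustration, not part of the original source). For 1 <= i < size, each
+// output applies the 5 tap kernel {2, 4, 4, 4, 2}:
+//   out[i] = (2 * in[i - 2] + 4 * in[i - 1] + 4 * in[i] + 4 * in[i + 1] +
+//             2 * in[i + 2] + 8) >> 4;
+// with indices clamped to [0, size - 1]. out[0] is left unchanged.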
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+  assert(size % 4 == 0 && size <= 16);
+  auto* const pixel_buffer = static_cast<uint16_t*>(buffer);
+
+  // Extend first/last samples
+  pixel_buffer[-2] = pixel_buffer[-1];
+  pixel_buffer[size] = pixel_buffer[size - 1];
+
+  const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2));
+  const int16x8_t src_hi =
+      vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8));
+  const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3));
+  const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3));
+
+  int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo);
+  sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2));
+  sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3));
+  sum_lo = vrshrq_n_s16(sum_lo, 4);
+
+  uint16x8x2_t result_lo;
+  result_lo.val[0] =
+      vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))),
+                vdupq_n_u16((1 << kBitdepth10) - 1));
+  result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2));
+
+  if (size > 8) {
+    const int16x8_t src_hi_extra =
+        vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2));
+    const int16x8_t src9_hi_extra =
+        vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3));
+
+    int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi);
+    sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2));
+    sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3));
+    sum_hi = vrshrq_n_s16(sum_hi, 4);
+
+    uint16x8x2_t result_hi;
+    result_hi.val[0] =
+        vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))),
+                  vdupq_n_u16((1 << kBitdepth10) - 1));
+    result_hi.val[1] =
+        vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2));
+    vst2q_u16(pixel_buffer - 1, result_lo);
+    vst2q_u16(pixel_buffer + 15, result_hi);
+  } else {
+    vst2q_u16(pixel_buffer - 1, result_lo);
+  }
+}
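+
+// For reference (an illustration, not part of the original source): the
+// upsampler doubles the edge length, interleaving the original samples with
+// new samples built from the 4 tap kernel {-1, 9, 9, -1}:
+//   new = Clip3((-a + 9 * b + 9 * c - d + 8) >> 4, 0, (1 << kBitdepth10) - 1)
+// where b and c are the originals on either side of the new sample and a and
+// d are their outer neighbors, clamped at the edge.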
+
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraEdgeInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraEdgeInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intra_edge_neon.h b/src/dsp/arm/intra_edge_neon.h
new file mode 100644 (file)
index 0000000..28e3494
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc
new file mode 100644 (file)
index 0000000..ad39947
--- /dev/null
@@ -0,0 +1,1327 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Divide by the number of elements.
+inline uint32_t Average(const uint32_t sum, const int width, const int height) {
+  return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
+}
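+
+// E.g. a 16x8 block divides |sum| by 128 (a shift of 4 + 3) with rounding;
+// |width| and |height| are always powers of two here.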
+
+// Subtract |val| from every element in |a|.
+inline void BlockSubtract(const uint32_t val,
+                          int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
+                          const int width, const int height) {
+  assert(val <= INT16_MAX);
+  const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));
+
+  for (int y = 0; y < height; ++y) {
+    if (width == 4) {
+      const int16x4_t b = vld1_s16(a[y]);
+      vst1_s16(a[y], vsub_s16(b, vget_low_s16(val_v)));
+    } else if (width == 8) {
+      const int16x8_t b = vld1q_s16(a[y]);
+      vst1q_s16(a[y], vsubq_s16(b, val_v));
+    } else if (width == 16) {
+      const int16x8_t b = vld1q_s16(a[y]);
+      const int16x8_t c = vld1q_s16(a[y] + 8);
+      vst1q_s16(a[y], vsubq_s16(b, val_v));
+      vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
+    } else /* block_width == 32 */ {
+      const int16x8_t b = vld1q_s16(a[y]);
+      const int16x8_t c = vld1q_s16(a[y] + 8);
+      const int16x8_t d = vld1q_s16(a[y] + 16);
+      const int16x8_t e = vld1q_s16(a[y] + 24);
+      vst1q_s16(a[y], vsubq_s16(b, val_v));
+      vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
+      vst1q_s16(a[y] + 16, vsubq_s16(d, val_v));
+      vst1q_s16(a[y] + 24, vsubq_s16(e, val_v));
+    }
+  }
+}
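+
+// The vector code above is equivalent to the scalar loop:
+//   for (int y = 0; y < height; ++y) {
+//     for (int x = 0; x < width; ++x) a[y][x] -= val;
+//   }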
+
+namespace low_bitdepth {
+namespace {
+
+template <int block_width, int block_height>
+void CflSubsampler420_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride) {
+  const auto* src = static_cast<const uint8_t*>(source);
+  uint32_t sum;
+  if (block_width == 4) {
+    assert(max_luma_width >= 8);
+    uint32x2_t running_sum = vdup_n_u32(0);
+
+    for (int y = 0; y < block_height; ++y) {
+      const uint8x8_t row0 = vld1_u8(src);
+      const uint8x8_t row1 = vld1_u8(src + stride);
+
+      uint16x4_t sum_row = vpadal_u8(vpaddl_u8(row0), row1);
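+      // Each lane of |sum_row| now holds a 2x2 box sum:
+      // row0[2x] + row0[2x + 1] + row1[2x] + row1[2x + 1].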
+      sum_row = vshl_n_u16(sum_row, 1);
+      running_sum = vpadal_u16(running_sum, sum_row);
+      vst1_s16(luma[y], vreinterpret_s16_u16(sum_row));
+
+      if (y << 1 < max_luma_height - 2) {
+        // Stop advancing |src| once the last visible row pair is reached so
+        // the remaining iterations reuse it; the loop could be simplified
+        // once this threshold is met.
+        src += stride << 1;
+      }
+    }
+
+    sum = SumVector(running_sum);
+  } else if (block_width == 8) {
+    const uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+    const uint16x8_t x_max_index =
+        vdupq_n_u16(max_luma_width == 8 ? max_luma_width - 2 : 16);
+    const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
+
+    uint32x4_t running_sum = vdupq_n_u32(0);
+
+    for (int y = 0; y < block_height; ++y) {
+      const uint8x16_t row0 = vld1q_u8(src);
+      const uint8x16_t row1 = vld1q_u8(src + stride);
+      const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+      const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
+
+      // Dup the 2x2 sum at the max luma offset.
+      const uint16x8_t max_luma_sum =
+          vdupq_lane_u16(vget_low_u16(sum_row_shifted), 3);
+      const uint16x8_t final_sum_row =
+          vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+      vst1q_s16(luma[y], vreinterpretq_s16_u16(final_sum_row));
+
+      running_sum = vpadalq_u16(running_sum, final_sum_row);
+
+      if (y << 1 < max_luma_height - 2) {
+        src += stride << 1;
+      }
+    }
+
+    sum = SumVector(running_sum);
+  } else /* block_width >= 16 */ {
+    const uint16x8_t x_max_index = vdupq_n_u16(max_luma_width - 2);
+    uint32x4_t running_sum = vdupq_n_u32(0);
+
+    for (int y = 0; y < block_height; ++y) {
+      // Calculate the 2x2 sum at the max luma offset.
+      const uint8_t a00 = src[max_luma_width - 2];
+      const uint8_t a01 = src[max_luma_width - 1];
+      const uint8_t a10 = src[max_luma_width - 2 + stride];
+      const uint8_t a11 = src[max_luma_width - 1 + stride];
+      // Dup the 2x2 sum at the max luma offset.
+      const uint16x8_t max_luma_sum =
+          vdupq_n_u16(static_cast<uint16_t>((a00 + a01 + a10 + a11) << 1));
+      uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+
+      ptrdiff_t src_x_offset = 0;
+      for (int x = 0; x < block_width; x += 8, src_x_offset += 16) {
+        const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
+        const uint8x16_t row0 = vld1q_u8(src + src_x_offset);
+        const uint8x16_t row1 = vld1q_u8(src + src_x_offset + stride);
+        const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+        const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
+        const uint16x8_t final_sum_row =
+            vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+        vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(final_sum_row));
+
+        running_sum = vpadalq_u16(running_sum, final_sum_row);
+        x_index = vaddq_u16(x_index, vdupq_n_u16(16));
+      }
+
+      if (y << 1 < max_luma_height - 2) {
+        src += stride << 1;
+      }
+    }
+    sum = SumVector(running_sum);
+  }
+
+  const uint32_t average = Average(sum, block_width, block_height);
+  BlockSubtract(average, luma, block_width, block_height);
+}
+
+template <int block_width, int block_height>
+void CflSubsampler444_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride) {
+  const auto* src = static_cast<const uint8_t*>(source);
+  uint32_t sum;
+  if (block_width == 4) {
+    assert(max_luma_width >= 4);
+    assert(max_luma_height <= block_height);
+    assert((max_luma_height % 2) == 0);
+    uint32x4_t running_sum = vdupq_n_u32(0);
+    uint8x8_t row = vdup_n_u8(0);
+
+    uint16x8_t row_shifted;
+    int y = 0;
+    do {
+      row = Load4<0>(src, row);
+      row = Load4<1>(src + stride, row);
+      if (y < (max_luma_height - 1)) {
+        src += stride << 1;
+      }
+
+      row_shifted = vshll_n_u8(row, 3);
+      running_sum = vpadalq_u16(running_sum, row_shifted);
+      vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+      vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+      y += 2;
+    } while (y < max_luma_height);
+
+    row_shifted =
+        vcombine_u16(vget_high_u16(row_shifted), vget_high_u16(row_shifted));
+    for (; y < block_height; y += 2) {
+      running_sum = vpadalq_u16(running_sum, row_shifted);
+      vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+      vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+    }
+
+    sum = SumVector(running_sum);
+  } else if (block_width == 8) {
+    const uint8x8_t x_index = {0, 1, 2, 3, 4, 5, 6, 7};
+    const uint8x8_t x_max_index = vdup_n_u8(max_luma_width - 1);
+    const uint8x8_t x_mask = vclt_u8(x_index, x_max_index);
+
+    uint32x4_t running_sum = vdupq_n_u32(0);
+
+    for (int y = 0; y < block_height; ++y) {
+      const uint8x8_t x_max = vdup_n_u8(src[max_luma_width - 1]);
+      const uint8x8_t row = vbsl_u8(x_mask, vld1_u8(src), x_max);
+
+      const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+      running_sum = vpadalq_u16(running_sum, row_shifted);
+      vst1q_s16(luma[y], vreinterpretq_s16_u16(row_shifted));
+
+      if (y < max_luma_height - 1) {
+        src += stride;
+      }
+    }
+
+    sum = SumVector(running_sum);
+  } else /* block_width >= 16 */ {
+    const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 1);
+    uint32x4_t running_sum = vdupq_n_u32(0);
+
+    for (int y = 0; y < block_height; ++y) {
+      uint8x16_t x_index = {0, 1, 2,  3,  4,  5,  6,  7,
+                            8, 9, 10, 11, 12, 13, 14, 15};
+      const uint8x16_t x_max = vdupq_n_u8(src[max_luma_width - 1]);
+      for (int x = 0; x < block_width; x += 16) {
+        const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+        const uint8x16_t row = vbslq_u8(x_mask, vld1q_u8(src + x), x_max);
+
+        const uint16x8_t row_shifted_low = vshll_n_u8(vget_low_u8(row), 3);
+        const uint16x8_t row_shifted_high = vshll_n_u8(vget_high_u8(row), 3);
+        running_sum = vpadalq_u16(running_sum, row_shifted_low);
+        running_sum = vpadalq_u16(running_sum, row_shifted_high);
+        vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(row_shifted_low));
+        vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(row_shifted_high));
+
+        x_index = vaddq_u8(x_index, vdupq_n_u8(16));
+      }
+      if (y < max_luma_height - 1) {
+        src += stride;
+      }
+    }
+    sum = SumVector(running_sum);
+  }
+
+  const uint32_t average = Average(sum, block_width, block_height);
+  BlockSubtract(average, luma, block_width, block_height);
+}
+
+// Saturate |dc + ((alpha * luma) >> 6)| to uint8_t.
+inline uint8x8_t Combine8(const int16x8_t luma, const int alpha,
+                          const int16x8_t dc) {
+  const int16x8_t la = vmulq_n_s16(luma, alpha);
+  // Subtract the sign bit to round towards zero.
+  const int16x8_t sub_sign = vsraq_n_s16(la, la, 15);
+  // Shift and accumulate.
+  const int16x8_t result = vrsraq_n_s16(dc, sub_sign, 6);
+  return vqmovun_s16(result);
+}
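+
+// A worked tie example for the sign fix-up in Combine8() above (an
+// illustration, not part of the original source), with la == -32:
+//   without the fix-up: (-32 + 32) >> 6 == 0
+//   with the fix-up:    ((-32 - 1) + 32) >> 6 == -1
+// which matches the scalar RightShiftWithRoundingSigned(-32, 6) == -1.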
+
+// The exact range of |luma| and |alpha| is not important because the result
+// is saturated to uint8_t; even a saturated int16_t shifted right by 6 still
+// exceeds the uint8_t range.
+template <int block_height>
+inline void CflIntraPredictor4xN_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint8_t*>(dest);
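+  // |dst[0]| is assumed to already hold this block's DC prediction; CfL
+  // layers the scaled luma contribution on top of it.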
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; y += 2) {
+    const int16x4_t luma_row0 = vld1_s16(luma[y]);
+    const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+    const uint8x8_t sum =
+        Combine8(vcombine_s16(luma_row0, luma_row1), alpha, dc);
+    StoreLo4(dst, sum);
+    dst += stride;
+    StoreHi4(dst, sum);
+    dst += stride;
+  }
+}
+
+template <int block_height>
+inline void CflIntraPredictor8xN_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row = vld1q_s16(luma[y]);
+    const uint8x8_t sum = Combine8(luma_row, alpha, dc);
+    vst1_u8(dst, sum);
+    dst += stride;
+  }
+}
+
+template <int block_height>
+inline void CflIntraPredictor16xN_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+    const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+    const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
+    const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
+    vst1_u8(dst, sum_0);
+    vst1_u8(dst + 8, sum_1);
+    dst += stride;
+  }
+}
+
+template <int block_height>
+inline void CflIntraPredictor32xN_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+    const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+    const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+    const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+    const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
+    const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
+    const uint8x8_t sum_2 = Combine8(luma_row_2, alpha, dc);
+    const uint8x8_t sum_3 = Combine8(luma_row_3, alpha, dc);
+    vst1_u8(dst, sum_0);
+    vst1_u8(dst + 8, sum_1);
+    vst1_u8(dst + 16, sum_2);
+    vst1_u8(dst + 24, sum_3);
+    dst += stride;
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler420_NEON<4, 4>;
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler420_NEON<4, 8>;
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler420_NEON<4, 16>;
+
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler420_NEON<8, 4>;
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler420_NEON<8, 8>;
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler420_NEON<8, 16>;
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler420_NEON<8, 32>;
+
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler420_NEON<16, 4>;
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler420_NEON<16, 8>;
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler420_NEON<16, 16>;
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler420_NEON<16, 32>;
+
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler420_NEON<32, 8>;
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler420_NEON<32, 16>;
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler420_NEON<32, 32>;
+
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler444_NEON<4, 4>;
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler444_NEON<4, 8>;
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler444_NEON<4, 16>;
+
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler444_NEON<8, 4>;
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler444_NEON<8, 8>;
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler444_NEON<8, 16>;
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler444_NEON<8, 32>;
+
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler444_NEON<16, 4>;
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler444_NEON<16, 8>;
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler444_NEON<16, 16>;
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler444_NEON<16, 32>;
+
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler444_NEON<32, 8>;
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler444_NEON<32, 16>;
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler444_NEON<32, 32>;
+
+  dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;
+
+  dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;
+
+  dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor16xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor16xN_NEON<32>;
+
+  dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor32xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor32xN_NEON<32>;
+  // Max Cfl predictor size is 32x32.
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflSubsampler
+#ifndef __aarch64__
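+// vpaddq_u16 is an A64-only intrinsic; provide a 32-bit ARM equivalent built
+// from 64-bit pairwise adds.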
+uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+  return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
+                      vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
+}
+#endif
+
+// This duplicates the last two 16-bit values in |row|.
+inline uint16x8_t LastRowSamples(const uint16x8_t row) {
+  const uint32x2_t a = vget_high_u32(vreinterpretq_u32_u16(row));
+  const uint32x4_t b = vdupq_lane_u32(a, 1);
+  return vreinterpretq_u16_u32(b);
+}
+
+// This duplicates the last unsigned 16-bit value in |row|.
+inline uint16x8_t LastRowResult(const uint16x8_t row) {
+  const uint16x4_t a = vget_high_u16(row);
+  const uint16x8_t b = vdupq_lane_u16(a, 0x3);
+  return b;
+}
+
+// This duplicates the last signed 16-bit value in |row|.
+inline int16x8_t LastRowResult(const int16x8_t row) {
+  const int16x4_t a = vget_high_s16(row);
+  const int16x8_t b = vdupq_lane_s16(a, 0x3);
+  return b;
+}
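+
+// For illustration (not part of the original source), with
+// row == {0, 1, 2, 3, 4, 5, 6, 7}:
+//   LastRowSamples(row) == {6, 7, 6, 7, 6, 7, 6, 7}
+//   LastRowResult(row)  == {7, 7, 7, 7, 7, 7, 7, 7}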
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline uint16x8_t StoreLumaResults4_420(const uint16x8_t vertical_sum0,
+                                        const uint16x8_t vertical_sum1,
+                                        int16_t* luma_ptr) {
+  const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+  const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+  vst1_s16(luma_ptr, vreinterpret_s16_u16(vget_low_u16(result_shifted)));
+  vst1_s16(luma_ptr + kCflLumaBufferStride,
+           vreinterpret_s16_u16(vget_high_u16(result_shifted)));
+  return result_shifted;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0,
+                                        const uint16x8_t vertical_sum1,
+                                        int16_t* luma_ptr) {
+  const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+  const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+  vst1q_s16(luma_ptr, vreinterpretq_s16_u16(result_shifted));
+  return result_shifted;
+}
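+
+// In both helpers above the stored value is the 2x2 sum doubled, i.e. the
+// 2x2 average left-shifted by 3, matching the precision of the 444 path.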
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
+  static_assert(block_height_log2 <= 4, "");
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  uint16x4_t sum = vdup_n_u16(0);
+  uint16x4_t samples[2];
+  int y = visible_height;
+
+  do {
+    samples[0] = vld1_u16(src);
+    samples[1] = vld1_u16(src + src_stride);
+    src += src_stride << 1;
+    sum = vadd_u16(sum, samples[0]);
+    sum = vadd_u16(sum, samples[1]);
+    y -= 2;
+  } while (y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    samples[1] = vshl_n_u16(samples[1], 1);
+    do {
+      sum = vadd_u16(sum, samples[1]);
+      y += 2;
+    } while (y < block_height);
+  }
+
+  // Here the left shift by 3 (to increase precision) cancels 3 bits of the
+  // right shift, (log2 of width 4) + 1, leaving a shift of
+  // block_height_log2 - 1.
+  const uint32_t average_sum =
+      RightShiftWithRounding(SumVector(vpaddl_u16(sum)), block_height_log2 - 1);
+  const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+
+  const auto* ssrc = static_cast<const int16_t*>(source);
+  int16x4_t ssample;
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    ssample = vld1_s16(ssrc);
+    ssample = vshl_n_s16(ssample, 3);
+    vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+    ssrc += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  static_cast<void>(max_luma_width);
+  static_cast<void>(max_luma_height);
+  static_assert(block_height_log2 <= 4, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const int block_height = 1 << block_height_log2;
+
+  if (block_height <= max_luma_height) {
+    CflSubsampler444_4xH_NEON<block_height_log2, true>(luma, max_luma_height,
+                                                       source, stride);
+  } else {
+    CflSubsampler444_4xH_NEON<block_height_log2, false>(luma, max_luma_height,
+                                                        source, stride);
+  }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  uint32x4_t sum = vdupq_n_u32(0);
+  uint16x8_t samples;
+  int y = visible_height;
+
+  do {
+    samples = vld1q_u16(src);
+    src += src_stride;
+    sum = vpadalq_u16(sum, samples);
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    do {
+      sum = vpadalq_u16(sum, samples);
+    } while (++y < block_height);
+  }
+
+  // Here the left shift by 3 (to increase precision) cancels the log2 of
+  // width 8 in the right shift, leaving a shift of block_height_log2.
+  const uint32_t average_sum =
+      RightShiftWithRounding(SumVector(sum), block_height_log2);
+  const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+  const auto* ssrc = static_cast<const int16_t*>(source);
+  int16x8_t ssample;
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    ssample = vld1q_s16(ssrc);
+    ssample = vshlq_n_s16(ssample, 3);
+    vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+    ssrc += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  static_cast<void>(max_luma_width);
+  static_cast<void>(max_luma_height);
+  static_assert(block_height_log2 <= 5, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const int block_height = 1 << block_height_log2;
+  const int block_width = 8;
+
+  const int horz_inside = block_width <= max_luma_width;
+  const int vert_inside = block_height <= max_luma_height;
+  if (horz_inside && vert_inside) {
+    CflSubsampler444_8xH_NEON<block_height_log2, true>(luma, max_luma_height,
+                                                       source, stride);
+  } else {
+    CflSubsampler444_8xH_NEON<block_height_log2, false>(luma, max_luma_height,
+                                                        source, stride);
+  }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const int block_width = 1 << block_width_log2;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  uint32x4_t sum = vdupq_n_u32(0);
+  uint16x8_t samples[4];
+  int y = visible_height;
+
+  do {
+    samples[0] = vld1q_u16(src);
+    samples[1] =
+        (max_luma_width >= 16) ? vld1q_u16(src + 8) : LastRowResult(samples[0]);
+    uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+    if (block_width == 32) {
+      samples[2] = (max_luma_width >= 24) ? vld1q_u16(src + 16)
+                                          : LastRowResult(samples[1]);
+      samples[3] = (max_luma_width == 32) ? vld1q_u16(src + 24)
+                                          : LastRowResult(samples[2]);
+      inner_sum = vaddq_u16(samples[2], inner_sum);
+      inner_sum = vaddq_u16(samples[3], inner_sum);
+    }
+    sum = vpadalq_u16(sum, inner_sum);
+    src += src_stride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+    if (block_width == 32) {
+      inner_sum = vaddq_u16(samples[2], inner_sum);
+      inner_sum = vaddq_u16(samples[3], inner_sum);
+    }
+    do {
+      sum = vpadalq_u16(sum, inner_sum);
+    } while (++y < block_height);
+  }
+
+  // Here the left shift by 3 (to increase precision) is subtracted from the
+  // right shift amount: block_width_log2 + block_height_log2 - 3.
+  const uint32_t average_sum = RightShiftWithRounding(
+      SumVector(sum), block_width_log2 + block_height_log2 - 3);
+  const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+  const auto* ssrc = static_cast<const int16_t*>(source);
+  int16x8_t ssamples_ext = vdupq_n_s16(0);
+  int16x8_t ssamples[4];
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    int idx = 0;
+    for (int x = 0; x < block_width; x += 8) {
+      if (max_luma_width > x) {
+        ssamples[idx] = vld1q_s16(&ssrc[x]);
+        ssamples[idx] = vshlq_n_s16(ssamples[idx], 3);
+        ssamples_ext = ssamples[idx];
+      } else {
+        ssamples[idx] = LastRowResult(ssamples_ext);
+      }
+      vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+    }
+    ssrc += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      int idx = 0;
+      for (int x = 0; x < block_width; x += 8) {
+        vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+      }
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+                "This function will only work for block_width 16 and 32.");
+  static_assert(block_height_log2 <= 5, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+
+  const int block_height = 1 << block_height_log2;
+  const int vert_inside = block_height <= max_luma_height;
+  if (vert_inside) {
+    CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, true>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  } else {
+    CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, false>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int /*max_luma_width*/, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int y = luma_height;
+
+  uint32x4_t final_sum = vdupq_n_u32(0);
+  do {
+    const uint16x8_t samples_row0 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t samples_row1 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t luma_sum01 = vaddq_u16(samples_row0, samples_row1);
+
+    const uint16x8_t samples_row2 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t samples_row3 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t luma_sum23 = vaddq_u16(samples_row2, samples_row3);
+    uint16x8_t sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+    luma_ptr += kCflLumaBufferStride << 1;
+
+    const uint16x8_t samples_row4 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t samples_row5 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t luma_sum45 = vaddq_u16(samples_row4, samples_row5);
+
+    const uint16x8_t samples_row6 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t samples_row7 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t luma_sum67 = vaddq_u16(samples_row6, samples_row7);
+    sum =
+        vaddq_u16(sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+    luma_ptr += kCflLumaBufferStride << 1;
+
+    final_sum = vpadalq_u16(final_sum, sum);
+    y -= 4;
+  } while (y != 0);
+
+  const uint16x4_t final_fill =
+      vreinterpret_u16_s16(vld1_s16(luma_ptr - kCflLumaBufferStride));
+  const uint32x4_t final_fill_to_sum = vmovl_u16(final_fill);
+  for (y = luma_height; y < block_height; ++y) {
+    vst1_s16(luma_ptr, vreinterpret_s16_u16(final_fill));
+    luma_ptr += kCflLumaBufferStride;
+    final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+  }
+  const uint32_t average_sum = RightShiftWithRounding(
+      SumVector(final_sum), block_height_log2 + 2 /*log2 of width 4*/);
+  const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const int16x4_t samples = vld1_s16(luma_ptr);
+    vst1_s16(luma_ptr, vsub_s16(samples, averages));
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int y = luma_height;
+
+  uint32x4_t final_sum = vdupq_n_u32(0);
+  do {
+    const uint16x8_t samples_row00 = vld1q_u16(src);
+    const uint16x8_t samples_row01 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row00);
+    src += src_stride;
+    const uint16x8_t samples_row10 = vld1q_u16(src);
+    const uint16x8_t samples_row11 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row10);
+    src += src_stride;
+    const uint16x8_t luma_sum00 = vaddq_u16(samples_row00, samples_row10);
+    const uint16x8_t luma_sum01 = vaddq_u16(samples_row01, samples_row11);
+    uint16x8_t sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+    luma_ptr += kCflLumaBufferStride;
+
+    const uint16x8_t samples_row20 = vld1q_u16(src);
+    const uint16x8_t samples_row21 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row20);
+    src += src_stride;
+    const uint16x8_t samples_row30 = vld1q_u16(src);
+    const uint16x8_t samples_row31 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row30);
+    src += src_stride;
+    const uint16x8_t luma_sum10 = vaddq_u16(samples_row20, samples_row30);
+    const uint16x8_t luma_sum11 = vaddq_u16(samples_row21, samples_row31);
+    sum =
+        vaddq_u16(sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    const uint16x8_t samples_row40 = vld1q_u16(src);
+    const uint16x8_t samples_row41 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row40);
+    src += src_stride;
+    const uint16x8_t samples_row50 = vld1q_u16(src);
+    const uint16x8_t samples_row51 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row50);
+    src += src_stride;
+    const uint16x8_t luma_sum20 = vaddq_u16(samples_row40, samples_row50);
+    const uint16x8_t luma_sum21 = vaddq_u16(samples_row41, samples_row51);
+    sum =
+        vaddq_u16(sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    const uint16x8_t samples_row60 = vld1q_u16(src);
+    const uint16x8_t samples_row61 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row60);
+    src += src_stride;
+    const uint16x8_t samples_row70 = vld1q_u16(src);
+    const uint16x8_t samples_row71 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row70);
+    src += src_stride;
+    const uint16x8_t luma_sum30 = vaddq_u16(samples_row60, samples_row70);
+    const uint16x8_t luma_sum31 = vaddq_u16(samples_row61, samples_row71);
+    sum =
+        vaddq_u16(sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    final_sum = vpadalq_u16(final_sum, sum);
+    y -= 4;
+  } while (y != 0);
+
+  // Duplicate the final row downward to fill the remaining rows of the
+  // block.
+  const uint16x8_t final_fill =
+      vreinterpretq_u16_s16(vld1q_s16(luma_ptr - kCflLumaBufferStride));
+  const uint32x4_t final_fill_to_sum =
+      vaddl_u16(vget_low_u16(final_fill), vget_high_u16(final_fill));
+
+  for (y = luma_height; y < block_height; ++y) {
+    vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill));
+    luma_ptr += kCflLumaBufferStride;
+    final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+  }
+
+  const uint32_t average_sum = RightShiftWithRounding(
+      SumVector(final_sum), block_height_log2 + 3 /*log2 of width 8*/);
+  const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const int16x8_t samples = vld1q_s16(luma_ptr);
+    vst1q_s16(luma_ptr, vsubq_s16(samples, averages));
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  if (max_luma_width == 8) {
+    CflSubsampler420Impl_8xH_NEON<block_height_log2, 8>(luma, max_luma_height,
+                                                        source, stride);
+  } else {
+    CflSubsampler420Impl_8xH_NEON<block_height_log2, 16>(luma, max_luma_height,
+                                                         source, stride);
+  }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  const int block_height = 1 << block_height_log2;
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int16_t* luma_ptr = luma[0];
+  // Begin first y section, covering width up to 32.
+  int y = luma_height;
+
+  uint16x8_t final_fill0, final_fill1;
+  uint32x4_t final_sum = vdupq_n_u32(0);
+  do {
+    const uint16_t* src_next = src + src_stride;
+    const uint16x8_t samples_row00 = vld1q_u16(src);
+    const uint16x8_t samples_row01 = (max_luma_width >= 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row00);
+    const uint16x8_t samples_row02 = (max_luma_width >= 24)
+                                         ? vld1q_u16(src + 16)
+                                         : LastRowSamples(samples_row01);
+    const uint16x8_t samples_row03 = (max_luma_width == 32)
+                                         ? vld1q_u16(src + 24)
+                                         : LastRowSamples(samples_row02);
+    const uint16x8_t samples_row10 = vld1q_u16(src_next);
+    const uint16x8_t samples_row11 = (max_luma_width >= 16)
+                                         ? vld1q_u16(src_next + 8)
+                                         : LastRowSamples(samples_row10);
+    const uint16x8_t samples_row12 = (max_luma_width >= 24)
+                                         ? vld1q_u16(src_next + 16)
+                                         : LastRowSamples(samples_row11);
+    const uint16x8_t samples_row13 = (max_luma_width == 32)
+                                         ? vld1q_u16(src_next + 24)
+                                         : LastRowSamples(samples_row12);
+    const uint16x8_t luma_sum0 = vaddq_u16(samples_row00, samples_row10);
+    const uint16x8_t luma_sum1 = vaddq_u16(samples_row01, samples_row11);
+    const uint16x8_t luma_sum2 = vaddq_u16(samples_row02, samples_row12);
+    const uint16x8_t luma_sum3 = vaddq_u16(samples_row03, samples_row13);
+    final_fill0 = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+    final_fill1 = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+    const uint16x8_t sum = vaddq_u16(final_fill0, final_fill1);
+
+    final_sum = vpadalq_u16(final_sum, sum);
+
+    // Because max_luma_width is at most 32, any values beyond x=16 will
+    // necessarily be duplicated.
+    if (block_width_log2 == 5) {
+      const uint16x8_t wide_fill = LastRowResult(final_fill1);
+      final_sum = vpadalq_u16(final_sum, vshlq_n_u16(wide_fill, 1));
+    }
+    src += src_stride << 1;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  // Begin second y section.
+  y = luma_height;
+  if (y < block_height) {
+    uint32x4_t wide_fill;
+    if (block_width_log2 == 5) {
+      // Each row contributes 16 16-bit fill values but only 4 lanes are
+      // summed here, so shift left by 2 (and widen to 32-bit). This matches
+      // the accumulation in the loop above: (a + a) << 1 == a << 2.
+      wide_fill = vshll_n_u16(vget_low_u16(LastRowResult(final_fill1)), 2);
+    }
+    const uint16x8_t final_inner_sum = vaddq_u16(final_fill0, final_fill1);
+    const uint32x4_t final_fill_to_sum = vaddl_u16(
+        vget_low_u16(final_inner_sum), vget_high_u16(final_inner_sum));
+
+    do {
+      vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill0));
+      vst1q_s16(luma_ptr + 8, vreinterpretq_s16_u16(final_fill1));
+      if (block_width_log2 == 5) {
+        final_sum = vaddq_u32(final_sum, wide_fill);
+      }
+      luma_ptr += kCflLumaBufferStride;
+      final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+    } while (++y < block_height);
+  }  // End second y section.
+
+  const uint32_t average_sum = RightShiftWithRounding(
+      SumVector(final_sum), block_width_log2 + block_height_log2);
+  const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const int16x8_t samples0 = vld1q_s16(luma_ptr);
+    vst1q_s16(luma_ptr, vsubq_s16(samples0, averages));
+    const int16x8_t samples1 = vld1q_s16(luma_ptr + 8);
+    const int16x8_t final_row_result = vsubq_s16(samples1, averages);
+    vst1q_s16(luma_ptr + 8, final_row_result);
+
+    if (block_width_log2 == 5) {
+      const int16x8_t wide_fill = LastRowResult(final_row_result);
+      vst1q_s16(luma_ptr + 16, wide_fill);
+      vst1q_s16(luma_ptr + 24, wide_fill);
+    }
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+//------------------------------------------------------------------------------
+// Choose subsampler based on max_luma_width
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  switch (max_luma_width) {
+    case 8:
+      CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 8>(
+          luma, max_luma_height, source, stride);
+      return;
+    case 16:
+      CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 16>(
+          luma, max_luma_height, source, stride);
+      return;
+    case 24:
+      CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 24>(
+          luma, max_luma_height, source, stride);
+      return;
+    default:
+      assert(max_luma_width == 32);
+      CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 32>(
+          luma, max_luma_height, source, stride);
+      return;
+  }
+}
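+
+// Dispatching on |max_luma_width| as a template parameter presumably lets the
+// per-row width comparisons in CflSubsampler420Impl_WxH_NEON() fold into
+// compile-time constants.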
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+// Clip |dc + ((alpha * luma) >> 6)| to [0, (1 << bitdepth) - 1].
+inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs,
+                           const int16x8_t alpha_signed, const int16x8_t dc,
+                           const uint16x8_t max_value) {
+  const int16x8_t luma_abs = vabsq_s16(luma);
+  const int16x8_t luma_alpha_sign =
+      vshrq_n_s16(veorq_s16(luma, alpha_signed), 15);
+  // (alpha * luma) >> 6
+  const int16x8_t la_abs = vqrdmulhq_s16(luma_abs, alpha_abs);
+  // Convert back to signed values.
+  const int16x8_t la =
+      vsubq_s16(veorq_s16(la_abs, luma_alpha_sign), luma_alpha_sign);
+  const int16x8_t result = vaddq_s16(la, dc);
+  const int16x8_t zero = vdupq_n_s16(0);
+  // Clip.
+  return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(result, zero)), max_value);
+}
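+
+// A note on the vqrdmulhq_s16() use above (an illustration, not part of the
+// original source): vqrdmulh computes (2 * a * b + (1 << 15)) >> 16, so with
+// b == |alpha| << 9 the result is
+//   ((luma_abs * |alpha| << 10) + (1 << 15)) >> 16
+//     == (luma_abs * |alpha| + (1 << 5)) >> 6
+// which is RightShiftWithRounding(luma_abs * |alpha|, 6). Working on absolute
+// values and restoring the sign afterwards keeps the rounding symmetric
+// about zero.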
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor4xN_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = stride >> 1;
+  const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+  const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+  const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; y += 2) {
+    const int16x4_t luma_row0 = vld1_s16(luma[y]);
+    const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+    const int16x8_t combined_luma = vcombine_s16(luma_row0, luma_row1);
+    const uint16x8_t sum =
+        Combine8(combined_luma, alpha_abs, alpha_signed, dc, max_value);
+    vst1_u16(dst, vget_low_u16(sum));
+    dst += dst_stride;
+    vst1_u16(dst, vget_high_u16(sum));
+    dst += dst_stride;
+  }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor8xN_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = stride >> 1;
+  const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+  const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+  const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row = vld1q_s16(luma[y]);
+    const uint16x8_t sum =
+        Combine8(luma_row, alpha_abs, alpha_signed, dc, max_value);
+    vst1q_u16(dst, sum);
+    dst += dst_stride;
+  }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor16xN_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = stride >> 1;
+  const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+  const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+  const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+    const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+    const uint16x8_t sum_0 =
+        Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+    const uint16x8_t sum_1 =
+        Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+    vst1q_u16(dst, sum_0);
+    vst1q_u16(dst + 8, sum_1);
+    dst += dst_stride;
+  }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor32xN_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = stride >> 1;
+  const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+  const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+  const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+    const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+    const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+    const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+    const uint16x8_t sum_0 =
+        Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+    const uint16x8_t sum_1 =
+        Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+    const uint16x8_t sum_2 =
+        Combine8(luma_row_2, alpha_abs, alpha_signed, dc, max_value);
+    const uint16x8_t sum_3 =
+        Combine8(luma_row_3, alpha_abs, alpha_signed, dc, max_value);
+    vst1q_u16(dst, sum_0);
+    vst1q_u16(dst + 8, sum_1);
+    vst1q_u16(dst + 16, sum_2);
+    vst1q_u16(dst + 24, sum_3);
+    dst += dst_stride;
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler420_4xH_NEON<2>;
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler420_4xH_NEON<3>;
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler420_4xH_NEON<4>;
+
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler420_8xH_NEON<2>;
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler420_8xH_NEON<3>;
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler420_8xH_NEON<4>;
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler420_8xH_NEON<5>;
+
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<4, 2>;
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<4, 3>;
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<4, 4>;
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<4, 5>;
+
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<5, 3>;
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<5, 4>;
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<5, 5>;
+
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler444_4xH_NEON<2>;
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler444_4xH_NEON<3>;
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler444_4xH_NEON<4>;
+
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler444_8xH_NEON<2>;
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler444_8xH_NEON<3>;
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler444_8xH_NEON<4>;
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler444_8xH_NEON<5>;
+
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<4, 2>;
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<4, 3>;
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<4, 4>;
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<4, 5>;
+
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<5, 3>;
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<5, 4>;
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<5, 5>;
+
+  dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;
+
+  dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;
+
+  dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor16xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor16xN_NEON<32>;
+  dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor32xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor32xN_NEON<32>;
+  // Max Cfl predictor size is 32x32.
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredCflInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_cfl_neon.h b/src/dsp/arm/intrapred_cfl_neon.h
new file mode 100644 (file)
index 0000000..b4f983a
--- /dev/null
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the
+// defines below for specifics. These functions are not thread-safe.
+void IntraPredCflInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+// 4x4
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// -----------------------------------------------------------------------------
+// 10bpp
+
+// 4x4
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
new file mode 100644 (file)
index 0000000..d36ef5f
--- /dev/null
@@ -0,0 +1,2311 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+                               const uint8x8_t a_weight,
+                               const uint8x8_t b_weight) {
+  const uint16x8_t a_product = vmull_u8(a, a_weight);
+  const uint16x8_t sum = vmlal_u8(a_product, b, b_weight);
+
+  return vrshrn_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// For vertical operations the weights are one constant value.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+                               const uint8_t weight) {
+  return WeightedBlend(a, b, vdup_n_u8(32 - weight), vdup_n_u8(weight));
+}
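+
+// A minimal scalar sketch, ours, of the blends above: each lane computes the
+// rounded fixed-point interpolation (a * (32 - w) + b * w + 16) >> 5, which
+// is what vmull/vmlal followed by vrshrn_n_u16(sum, 5) produces. The helper
+// name is hypothetical.
+inline uint8_t WeightedBlendScalarSketch(const uint8_t a, const uint8_t b,
+                                         const uint8_t weight) {
+  assert(weight <= 32);
+  return static_cast<uint8_t>((a * (32 - weight) + b * weight + 16) >> 5);
+}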
+
+// Fill |left| and |right| with the appropriate values for a given |base_step|.
+inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
+                         const uint8x8_t left_step, const uint8x8_t right_step,
+                         uint8x8_t* left, uint8x8_t* right) {
+  const uint8x16_t mixed = vld1q_u8(source);
+  *left = VQTbl1U8(mixed, left_step);
+  *right = VQTbl1U8(mixed, right_step);
+}
+
+// Handle signed step arguments by ignoring the sign. Negative values are
+// considered out of range and overwritten later.
+inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
+                         const int8x8_t left_step, const int8x8_t right_step,
+                         uint8x8_t* left, uint8x8_t* right) {
+  LoadStepwise(source, vreinterpret_u8_s8(left_step),
+               vreinterpret_u8_s8(right_step), left, right);
+}
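+
+// A scalar model, ours, of the table lookups above: with the identity step,
+// lane i reads source[i] and source[i + 1]; with the every-other step used
+// for upsampled edges it reads source[2 * i] and source[2 * i + 1]. The
+// helper name is hypothetical.
+inline void LoadStepwiseScalarSketch(const uint8_t* const source,
+                                     const bool upsampled, uint8_t left[8],
+                                     uint8_t right[8]) {
+  const int step = upsampled ? 2 : 1;
+  for (int i = 0; i < 8; ++i) {
+    left[i] = source[i * step];
+    right[i] = source[i * step + 1];
+  }
+}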
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int height,
+                                 const uint8_t* LIBGAV1_RESTRICT const top,
+                                 const int xstep, const bool upsampled) {
+  assert(width == 4 || width == 8);
+
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+
+  const int max_base_x = (width + height - 1) << upsample_shift;
+  const int8x8_t max_base = vdup_n_s8(max_base_x);
+  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+  const int8x8_t all = vcreate_s8(0x0706050403020100);
+  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+  const int8x8_t base_step = upsampled ? even : all;
+  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
+
+  int top_x = xstep;
+  int y = 0;
+  do {
+    const int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        memset(dst, top[max_base_x], width);
+        dst += stride;
+      }
+      return;
+    }
+
+    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+
+    // Zone2 uses negative values for xstep. Use signed values to compare
+    // |top_base_x| to |max_base_x|.
+    const int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+    const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
+
+    // 4 wide subsamples the output. 8 wide subsamples the input.
+    if (width == 4) {
+      const uint8x8_t left_values = vld1_u8(top + top_base_x);
+      const uint8x8_t right_values = RightShiftVector<8>(left_values);
+      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+
+      // If |upsampled| is true then extract every other value for output.
+      const uint8x8_t value_stepped =
+          vtbl1_u8(value, vreinterpret_u8_s8(base_step));
+      const uint8x8_t masked_value =
+          vbsl_u8(max_base_mask, value_stepped, top_max_base);
+
+      StoreLo4(dst, masked_value);
+    } else /* width == 8 */ {
+      uint8x8_t left_values, right_values;
+      // WeightedBlend() steps up to Q registers. Downsample the input to avoid
+      // doing extra calculations.
+      LoadStepwise(top + top_base_x, base_step, right_step, &left_values,
+                   &right_values);
+
+      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+      const uint8x8_t masked_value =
+          vbsl_u8(max_base_mask, value, top_max_base);
+
+      vst1_u8(dst, masked_value);
+    }
+    dst += stride;
+    top_x += xstep;
+  } while (++y < height);
+}
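+
+// An unoptimized scalar reference, ours, for the Zone 1 kernels, assuming no
+// upsampling: row y projects onto the top edge at top_x = (y + 1) * xstep
+// (64 units per pixel), and each output pixel blends the two neighboring
+// |top| samples, with indices clamped at |max_base_x|.
+inline void Zone1ScalarSketch(uint8_t* dst, const ptrdiff_t stride,
+                              const int width, const int height,
+                              const uint8_t* const top, const int xstep,
+                              const int max_base_x) {
+  for (int y = 0; y < height; ++y) {
+    const int top_x = (y + 1) * xstep;
+    const int base = top_x >> 6;
+    const int shift = (top_x & 0x3F) >> 1;
+    for (int x = 0; x < width; ++x) {
+      const int left = std::min(base + x, max_base_x);
+      const int right = std::min(base + x + 1, max_base_x);
+      dst[x] = static_cast<uint8_t>(
+          (top[left] * (32 - shift) + top[right] * shift + 16) >> 5);
+    }
+    dst += stride;
+  }
+}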
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int width,
+                                 const int height,
+                                 const uint8_t* LIBGAV1_RESTRICT const top,
+                                 const int xstep, const bool upsampled) {
+  assert(width % 8 == 0);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+
+  const int max_base_x = (width + height - 1) << upsample_shift;
+  const int8x8_t max_base = vdup_n_s8(max_base_x);
+  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+  const int8x8_t all = vcreate_s8(0x0706050403020100);
+  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+  const int8x8_t base_step = upsampled ? even : all;
+  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
+  const int8x8_t block_step = vdup_n_s8(8 << upsample_shift);
+
+  int top_x = xstep;
+  int y = 0;
+  do {
+    const int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        memset(dst, top[max_base_x], width);
+        dst += stride;
+      }
+      return;
+    }
+
+    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+
+    // Zone2 uses negative values for xstep. Use signed values to compare
+    // |top_base_x| to |max_base_x|.
+    int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+    int x = 0;
+    do {
+      const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
+
+      // Extract the input values based on |upsampled| here to avoid doing twice
+      // as many calculations.
+      uint8x8_t left_values, right_values;
+      LoadStepwise(top + top_base_x + x, base_step, right_step, &left_values,
+                   &right_values);
+
+      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+      const uint8x8_t masked_value =
+          vbsl_u8(max_base_mask, value, top_max_base);
+
+      vst1_u8(dst + x, masked_value);
+
+      base_v = vadd_s8(base_v, block_step);
+      x += 8;
+    } while (x < width);
+    top_x += xstep;
+    dst += stride;
+  } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const int width,
+    const int height, const int xstep, const bool upsampled_top) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  assert(xstep > 0);
+
+  const int upsample_shift = static_cast<int>(upsampled_top);
+
+  const uint8x8_t all = vcreate_u8(0x0706050403020100);
+
+  if (xstep == 64) {
+    assert(!upsampled_top);
+    const uint8_t* top_ptr = top + 1;
+    int y = 0;
+    do {
+      memcpy(dst, top_ptr, width);
+      memcpy(dst + stride, top_ptr + 1, width);
+      memcpy(dst + 2 * stride, top_ptr + 2, width);
+      memcpy(dst + 3 * stride, top_ptr + 3, width);
+      dst += 4 * stride;
+      top_ptr += 4;
+      y += 4;
+    } while (y < height);
+  } else if (width == 4) {
+    DirectionalZone1_WxH<4>(dst, stride, height, top, xstep, upsampled_top);
+  } else if (xstep > 51) {
+    // 7.11.2.10. Intra edge upsample selection process
+    // if ( d <= 0 || d >= 40 ) useUpsample = 0
+    // For |upsample_top| the delta is from vertical so |prediction_angle - 90|.
+    // In |kDirectionalIntraPredictorDerivative[]|, angles less than 51 meet
+    // this criterion. The |xstep| value for angle 51 happens to be 51 as well.
+    // Shallower angles have greater xstep values.
+    assert(!upsampled_top);
+    const int max_base_x = ((width + height) - 1);
+    const uint8x8_t max_base = vdup_n_u8(max_base_x);
+    const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+    const uint8x8_t block_step = vdup_n_u8(8);
+
+    int top_x = xstep;
+    int y = 0;
+    do {
+      const int top_base_x = top_x >> 6;
+      const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+      uint8x8_t base_v = vadd_u8(vdup_n_u8(top_base_x), all);
+      int x = 0;
+      // Only calculate a block of 8 when at least one of the output values is
+      // within range. Otherwise it can read off the end of |top|.
+      const int must_calculate_width =
+          std::min(width, max_base_x - top_base_x + 7) & ~7;
+      for (; x < must_calculate_width; x += 8) {
+        const uint8x8_t max_base_mask = vclt_u8(base_v, max_base);
+
+        // Since these |xstep| values cannot be upsampled, the load is
+        // simplified.
+        const uint8x8_t left_values = vld1_u8(top + top_base_x + x);
+        const uint8x8_t right_values = vld1_u8(top + top_base_x + x + 1);
+        const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+        const uint8x8_t masked_value =
+            vbsl_u8(max_base_mask, value, top_max_base);
+
+        vst1_u8(dst + x, masked_value);
+        base_v = vadd_u8(base_v, block_step);
+      }
+      memset(dst + x, top[max_base_x], width - x);
+      dst += stride;
+      top_x += xstep;
+    } while (++y < height);
+  } else {
+    DirectionalZone1_WxH(dst, stride, width, height, top, xstep, upsampled_top);
+  }
+}
+
+// Process 4 or 8 |width| by 4 or 8 |height|.
+template <int width>
+inline void DirectionalZone3_WxH(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int base_left_y,
+    const int ystep, const int upsample_shift) {
+  assert(width == 4 || width == 8);
+  assert(height == 4 || height == 8);
+  const int scale_bits = 6 - upsample_shift;
+
+  // Zone3 never runs out of left_column values.
+  assert((width + height - 1) << upsample_shift >  // max_base_y
+         ((ystep * width) >> scale_bits) +
+             (/* base_step */ 1 << upsample_shift) *
+                 (height - 1));  // left_base_y
+
+  // Limited improvement for 8x8. ~20% faster for 64x64.
+  const uint8x8_t all = vcreate_u8(0x0706050403020100);
+  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+  const uint8x8_t base_step = upsample_shift ? even : all;
+  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+  uint8_t* dst = dest;
+  uint8x8_t left_v[8], right_v[8], value_v[8];
+  const uint8_t* const left = left_column;
+
+  const int index_0 = base_left_y;
+  LoadStepwise(left + (index_0 >> scale_bits), base_step, right_step,
+               &left_v[0], &right_v[0]);
+  value_v[0] = WeightedBlend(left_v[0], right_v[0],
+                             ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_1 = base_left_y + ystep;
+  LoadStepwise(left + (index_1 >> scale_bits), base_step, right_step,
+               &left_v[1], &right_v[1]);
+  value_v[1] = WeightedBlend(left_v[1], right_v[1],
+                             ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_2 = base_left_y + ystep * 2;
+  LoadStepwise(left + (index_2 >> scale_bits), base_step, right_step,
+               &left_v[2], &right_v[2]);
+  value_v[2] = WeightedBlend(left_v[2], right_v[2],
+                             ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_3 = base_left_y + ystep * 3;
+  LoadStepwise(left + (index_3 >> scale_bits), base_step, right_step,
+               &left_v[3], &right_v[3]);
+  value_v[3] = WeightedBlend(left_v[3], right_v[3],
+                             ((index_3 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_4 = base_left_y + ystep * 4;
+  LoadStepwise(left + (index_4 >> scale_bits), base_step, right_step,
+               &left_v[4], &right_v[4]);
+  value_v[4] = WeightedBlend(left_v[4], right_v[4],
+                             ((index_4 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_5 = base_left_y + ystep * 5;
+  LoadStepwise(left + (index_5 >> scale_bits), base_step, right_step,
+               &left_v[5], &right_v[5]);
+  value_v[5] = WeightedBlend(left_v[5], right_v[5],
+                             ((index_5 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_6 = base_left_y + ystep * 6;
+  LoadStepwise(left + (index_6 >> scale_bits), base_step, right_step,
+               &left_v[6], &right_v[6]);
+  value_v[6] = WeightedBlend(left_v[6], right_v[6],
+                             ((index_6 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_7 = base_left_y + ystep * 7;
+  LoadStepwise(left + (index_7 >> scale_bits), base_step, right_step,
+               &left_v[7], &right_v[7]);
+  value_v[7] = WeightedBlend(left_v[7], right_v[7],
+                             ((index_7 << upsample_shift) & 0x3F) >> 1);
+
+  // 8x8 transpose.
+  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(value_v[0], value_v[4]),
+                                   vcombine_u8(value_v[1], value_v[5]));
+  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(value_v[2], value_v[6]),
+                                   vcombine_u8(value_v[3], value_v[7]));
+
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+
+  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c1.val[0]));
+  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c1.val[1]));
+
+  if (width == 4) {
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+    if (height == 4) return;
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+  } else {
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+    if (height == 4) return;
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+  }
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
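+
+// Worked example, ours: with ystep = 128 the per-row offsets derived below
+// in |offset_y| are {0, -2, -4, ..., -14}; adding kPositiveIndexOffset
+// slides them to the valid vtbl indices {15, 13, ..., 1}, taken relative to
+// |left_column - kPositiveIndexOffset|.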
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone2FromLeftCol_WxH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
+    const int upsample_shift) {
+  assert(width == 4 || width == 8);
+
+  // The shift argument must be a constant.
+  int16x8_t offset_y, shift_upsampled = left_y;
+  if (upsample_shift) {
+    offset_y = vshrq_n_s16(left_y, 5);
+    shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
+  } else {
+    offset_y = vshrq_n_s16(left_y, 6);
+  }
+
+  // Select values to the left of the starting point.
+  // The 15th element (and 16th) will be all the way at the end, to the right.
+  // With a negative ystep everything else will be "left" of them.
+  // This supports cumulative steps up to 15. We could support up to 16 by doing
+  // separate loads for |left_values| and |right_values|. vtbl supports 2 Q
+  // registers as input which would allow for cumulative offsets of 32.
+  const int16x8_t sampler =
+      vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffset));
+  const uint8x8_t left_values = vqmovun_s16(sampler);
+  const uint8x8_t right_values = vadd_u8(left_values, vdup_n_u8(1));
+
+  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
+  const uint8x8_t shift_mul = vreinterpret_u8_s8(vshrn_n_s16(shift_masked, 1));
+  const uint8x8_t inv_shift_mul = vsub_u8(vdup_n_u8(32), shift_mul);
+
+  int y = 0;
+  do {
+    uint8x8_t src_left, src_right;
+    LoadStepwise(left_column - kPositiveIndexOffset + (y << upsample_shift),
+                 left_values, right_values, &src_left, &src_right);
+    const uint8x8_t val =
+        WeightedBlend(src_left, src_right, inv_shift_mul, shift_mul);
+
+    if (width == 4) {
+      StoreLo4(dst, val);
+    } else {
+      vst1_u8(dst, val);
+    }
+    dst += stride;
+  } while (++y < height);
+}
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1Blend_WxH(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+    const int xstep, const int upsample_shift) {
+  assert(width == 4 || width == 8);
+
+  const int scale_bits_x = 6 - upsample_shift;
+
+  const uint8x8_t all = vcreate_u8(0x0706050403020100);
+  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+  const uint8x8_t base_step = upsample_shift ? even : all;
+  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+  int y = 0;
+  do {
+    const uint8_t* const src = top_row + (top_x >> scale_bits_x);
+    uint8x8_t left, right;
+    LoadStepwise(src, base_step, right_step, &left, &right);
+
+    const uint8_t shift = ((top_x << upsample_shift) & 0x3f) >> 1;
+    const uint8x8_t val = WeightedBlend(left, right, shift);
+
+    uint8x8_t dst_blend = vld1_u8(dest);
+    // |zone_bounds| values can be negative.
+    uint8x8_t blend =
+        vcge_s8(vreinterpret_s8_u8(all), vdup_n_s8((zone_bounds >> 6)));
+    uint8x8_t output = vbsl_u8(blend, val, dst_blend);
+
+    if (width == 4) {
+      StoreLo4(dest, output);
+    } else {
+      vst1_u8(dest, output);
+    }
+    dest += stride;
+    zone_bounds += xstep;
+    top_x -= xstep;
+  } while (++y < height);
+}
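+
+// Blend-boundary note, ours: |zone_bounds| >> 6 is the x index of the first
+// column owned by Zone 1 on a given row; lanes at or beyond it (the vcge
+// against {0, ..., 7}) take the value blended from |top_row|, while earlier
+// lanes keep the left-derived pixels already present in |dest|.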
+
+//  7.11.2.4 (8) 90 < angle < 180
+//  The strategy for these functions (4xH and 8+xH) is to know how many blocks
+//  can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+//  then handle only blocks that take from |left_ptr|. Additionally, a fast
+//  index-shuffle approach is used for pred values from |left_column| in
+//  sections that permit it.
+inline void DirectionalZone2_4xH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+    const uint8_t* LIBGAV1_RESTRICT const top_row,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep, const bool upsampled_top,
+    const bool upsampled_left) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Helper vector.
+  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop incrementers for moving by block (4xN). Vertical still steps by 8. If
+  // the height is only 4, the loop finishes in its first iteration.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+
+  const int min_height = (height == 4) ? 4 : 8;
+
+  // All columns from |min_top_only_x| to the right will only need |top_row| to
+  // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+  // at least 3.
+  assert(xstep >= 3);
+  const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);
+
+  // Offsets the original zone bound value to simplify
+  // x < (y + 1) * xstep / 64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. The following values need the full ystep as a relative offset.
+  const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+  const int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
+
+  // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  if (min_top_only_x > 0) {
+    // Round down to the nearest multiple of 8 (or 4, if height is 4).
+    const int max_top_only_y =
+        std::min((1 << 6) / xstep, height) & ~(min_height - 1);
+    DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
+                            upsampled_top);
+
+    if (max_top_only_y == height) return;
+
+    int y = max_top_only_y;
+    dst += stride * y;
+    const int xstep_y = xstep * y;
+
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute.
+    const int min_left_only_y = std::min((4 << 6) / xstep, height);
+    int xstep_bounds = xstep_bounds_base + xstep_y;
+    int top_x = -xstep - xstep_y;
+
+    // The +8 increment is OK because if the height is 4 this loop only runs
+    // once.
+    for (; y < min_left_only_y;
+         y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone2FromLeftCol_WxH<4>(
+          dst, stride, min_height,
+          left_column + ((y - left_base_increment) << upsample_left_shift),
+          left_y, upsample_left_shift);
+
+      DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
+                                   xstep_bounds, top_x, xstep,
+                                   upsample_top_shift);
+    }
+
+    // Loop over y for left_only rows.
+    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+    for (; y < height; y += 8, dst += stride8) {
+      DirectionalZone3_WxH<4>(
+          dst, stride, min_height,
+          left_column + ((y - left_base_increment) << upsample_left_shift),
+          base_left_y, -ystep, upsample_left_shift);
+    }
+  } else {
+    DirectionalZone1_WxH<4>(dst, stride, height, top_row, -xstep,
+                            upsampled_top);
+  }
+}
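+
+// Worked example, ours: with xstep = 32 and height = 16, max_top_only_y is
+// min(64 / 32, 16) rounded down to a multiple of 8, i.e. 0, so no rows are
+// top-only; rows 0..7 mix |top_row| and |left_column| sources, and rows
+// 8..15 (min_left_only_y = (4 << 6) / 32 = 8) use |left_column| alone.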
+
+template <bool shuffle_left_column>
+inline void DirectionalZone2_8xH(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint8_t* LIBGAV1_RESTRICT const top_row,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep, const int x, const int left_offset,
+    const int xstep_bounds_base, const int16x8_t left_y,
+    const bool upsampled_top, const bool upsampled_left) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Loop incrementers for moving by block (8x8). This function also handles
+  // blocks of height 4; those are computed in one pass, so these variables
+  // go unused in that case.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+
+  // Cover 8x4 case.
+  const int min_height = (height == 4) ? 4 : 8;
+
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  uint8_t* dst_x = dst + x;
+  // Round down to the nearest multiple of 8 (or 4, if height is 4).
+  const int max_top_only_y =
+      std::min((1 << 6) / xstep, height) & ~(min_height - 1);
+  DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
+                          top_row + (x << upsample_top_shift), -xstep,
+                          upsampled_top);
+
+  if (max_top_only_y == height) return;
+
+  int y = max_top_only_y;
+  dst_x += stride * y;
+  const int xstep_y = xstep * y;
+
+  // All rows from |min_left_only_y| down for this set of columns only need
+  // |left_column| to compute. Round up to the nearest 8.
+  const int min_left_only_y =
+      Align(std::min(((x + 8) << 6) / xstep, height), 8);
+  int xstep_bounds = xstep_bounds_base + xstep_y;
+  int top_x = -xstep - xstep_y;
+
+  const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+  for (; y < min_left_only_y;
+       y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+    if (shuffle_left_column) {
+      DirectionalZone2FromLeftCol_WxH<8>(
+          dst_x, stride, min_height,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y,
+          upsample_left_shift);
+    } else {
+      DirectionalZone3_WxH<8>(
+          dst_x, stride, min_height,
+          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+          -ystep, upsample_left_shift);
+    }
+
+    DirectionalZone1Blend_WxH<8>(
+        dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+        xstep_bounds, top_x, xstep, upsample_top_shift);
+  }
+
+  // Loop over y for left_only rows.
+  for (; y < height; y += 8, dst_x += stride8) {
+    DirectionalZone3_WxH<8>(
+        dst_x, stride, min_height,
+        left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+        -ystep, upsample_left_shift);
+  }
+}
+
+// Process a multiple of 8 |width|.
+inline void DirectionalZone2_WxH(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint8_t* LIBGAV1_RESTRICT const top_row,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
+  const int ystep8 = ystep << 3;
+
+  // Offsets the original zone bound value to simplify
+  // x < (y + 1) * xstep / 64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  const int left_base_increment8 = ystep8 >> 6;
+  const int ystep_remainder8 = ystep8 & 0x3F;
+  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. Following values need the full ystep as a relative offset.
+  const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+  int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
+
+  // For ystep > 90, at least two sets of 8 columns can be fully computed from
+  // top_row only.
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+  // Analysis finds that, for most angles (ystep < 132), all segments that use
+  // both top_row and left_column can be computed from left_column using byte
+  // shuffles of a single vector. For steeper angles, the shuffle is also
+  // fully reliable when x >= 32.
+  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
+
+  // This loop treats each set of 8 columns in 3 stages with y-value boundaries.
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  int x = 0;
+  for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+           xstep_bounds_base -= (8 << 6),
+           left_y = vsubq_s16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height,
+                                xstep, ystep, x, left_offset, xstep_bounds_base,
+                                left_y, upsampled_top, upsampled_left);
+  }
+  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
+           xstep_bounds_base -= (8 << 6),
+           left_y = vsubq_s16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep,
+                               ystep, x, left_offset, xstep_bounds_base, left_y,
+                               upsampled_top, upsampled_left);
+  }
+  if (x < width) {
+    const int upsample_top_shift = static_cast<int>(upsampled_top);
+    DirectionalZone1_WxH(dst + x, stride, width - x, height,
+                         top_row + (x << upsample_top_shift), -xstep,
+                         upsampled_top);
+  }
+}
+
+void DirectionalIntraPredictorZone2_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
+  // Increasing the negative buffer for this function allows more rows to be
+  // processed at a time without branching in an inner loop to check the base.
+  uint8_t top_buffer[288];
+  uint8_t left_buffer[288];
+#if LIBGAV1_MSAN
+  memset(top_buffer, 0, sizeof(top_buffer));
+  memset(left_buffer, 0, sizeof(left_buffer));
+#endif  // LIBGAV1_MSAN
+
+  memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+  memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+  const uint8_t* top_ptr = top_buffer + 144;
+  const uint8_t* left_ptr = left_buffer + 144;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  if (width == 4) {
+    DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
+                         upsampled_top, upsampled_left);
+  } else {
+    DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep,
+                         ystep, upsampled_top, upsampled_left);
+  }
+}
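+
+// Sizing note, ours: each 288-byte buffer holds the copied 160-byte edge
+// window at offset 128, with the nominal edge pointers at offset 144, so
+// stepwise loads that index below the edge pointers (up to
+// kPositiveIndexOffset plus accumulated column offsets) remain inside the
+// allocation.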
+
+void DirectionalIntraPredictorZone3_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int ystep, const bool upsampled_left) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+
+  assert(ystep > 0);
+
+  const int upsample_shift = static_cast<int>(upsampled_left);
+  const int scale_bits = 6 - upsample_shift;
+  const int base_step = 1 << upsample_shift;
+
+  if (width == 4 || height == 4) {
+    // This block can handle all sizes but the specializations for other sizes
+    // are faster.
+    const uint8x8_t all = vcreate_u8(0x0706050403020100);
+    const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+    const uint8x8_t base_step_v = upsampled_left ? even : all;
+    const uint8x8_t right_step = vadd_u8(base_step_v, vdup_n_u8(1));
+
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        auto* dst = static_cast<uint8_t*>(dest);
+        dst += y * stride + x;
+        uint8x8_t left_v[4], right_v[4], value_v[4];
+        const int ystep_base = ystep * x;
+        const int offset = y * base_step;
+
+        const int index_0 = ystep_base + ystep * 1;
+        LoadStepwise(left + offset + (index_0 >> scale_bits), base_step_v,
+                     right_step, &left_v[0], &right_v[0]);
+        value_v[0] = WeightedBlend(left_v[0], right_v[0],
+                                   ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+        const int index_1 = ystep_base + ystep * 2;
+        LoadStepwise(left + offset + (index_1 >> scale_bits), base_step_v,
+                     right_step, &left_v[1], &right_v[1]);
+        value_v[1] = WeightedBlend(left_v[1], right_v[1],
+                                   ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+        const int index_2 = ystep_base + ystep * 3;
+        LoadStepwise(left + offset + (index_2 >> scale_bits), base_step_v,
+                     right_step, &left_v[2], &right_v[2]);
+        value_v[2] = WeightedBlend(left_v[2], right_v[2],
+                                   ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+        const int index_3 = ystep_base + ystep * 4;
+        LoadStepwise(left + offset + (index_3 >> scale_bits), base_step_v,
+                     right_step, &left_v[3], &right_v[3]);
+        value_v[3] = WeightedBlend(left_v[3], right_v[3],
+                                   ((index_3 << upsample_shift) & 0x3F) >> 1);
+
+        // 8x4 transpose.
+        const uint8x8x2_t b0 = vtrn_u8(value_v[0], value_v[1]);
+        const uint8x8x2_t b1 = vtrn_u8(value_v[2], value_v[3]);
+
+        const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u8(b0.val[0]),
+                                         vreinterpret_u16_u8(b1.val[0]));
+        const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u8(b0.val[1]),
+                                         vreinterpret_u16_u8(b1.val[1]));
+
+        StoreLo4(dst, vreinterpret_u8_u16(c0.val[0]));
+        dst += stride;
+        StoreLo4(dst, vreinterpret_u8_u16(c1.val[0]));
+        dst += stride;
+        StoreLo4(dst, vreinterpret_u8_u16(c0.val[1]));
+        dst += stride;
+        StoreLo4(dst, vreinterpret_u8_u16(c1.val[1]));
+
+        if (height > 4) {
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c0.val[0]));
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c1.val[0]));
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c0.val[1]));
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c1.val[1]));
+        }
+        x += 4;
+      } while (x < width);
+      y += 8;
+    } while (y < height);
+  } else {  // 8x8 at a time.
+    // Limited improvement for 8x8. ~20% faster for 64x64.
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        auto* dst = static_cast<uint8_t*>(dest);
+        dst += y * stride + x;
+        const int ystep_base = ystep * (x + 1);
+
+        DirectionalZone3_WxH<8>(dst, stride, 8, left + (y << upsample_shift),
+                                ystep_base, ystep, upsample_shift);
+        x += 8;
+      } while (x < width);
+      y += 8;
+    } while (y < height);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+  dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
+  dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+                                const int a_weight, const int b_weight) {
+  const uint16x4_t a_product = vmul_n_u16(a, a_weight);
+  const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight);
+
+  return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+                                const uint16_t a_weight,
+                                const uint16_t b_weight) {
+  const uint16x8_t a_product = vmulq_n_u16(a, a_weight);
+  const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight);
+
+  return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+                                const uint16x8_t a_weight,
+                                const uint16x8_t b_weight) {
+  const uint16x8_t a_product = vmulq_u16(a, a_weight);
+  const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
+
+  return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x4x2_t* dest,
+                         const uint16_t* LIBGAV1_RESTRICT const source,
+                         const bool upsampled) {
+  if (upsampled) {
+    *dest = vld2_u16(source);
+  } else {
+    dest->val[0] = vld1_u16(source);
+    dest->val[1] = vld1_u16(source + 1);
+  }
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x8x2_t* dest,
+                         const uint16_t* LIBGAV1_RESTRICT const source,
+                         const bool upsampled) {
+  if (upsampled) {
+    *dest = vld2q_u16(source);
+  } else {
+    dest->val[0] = vld1q_u16(source);
+    dest->val[1] = vld1q_u16(source + 1);
+  }
+}
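+
+// Worked example, ours: for an upsampled edge {e0, e1, e2, ...}, vld2 above
+// de-interleaves into val[0] = {e0, e2, e4, ...} and val[1] = {e1, e3, ...};
+// without upsampling the two overlapping loads give val[0] = {e0, e1, ...}
+// and val[1] = {e1, e2, ...}. Either way, each lane of val[0]/val[1] holds
+// the left/right sample pair for the weighted blend.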
+
+// For Wx4 blocks, load the source for 2 columns. The source for the second
+// column is held in the high half of each vector.
+inline void LoadEdgeVals2x4(uint16x8x2_t* dest,
+                            const uint16_t* LIBGAV1_RESTRICT const source_low,
+                            const uint16_t* LIBGAV1_RESTRICT const source_high,
+                            const bool upsampled) {
+  if (upsampled) {
+    const uint16x4x2_t low = vld2_u16(source_low);
+    const uint16x4x2_t high = vld2_u16(source_high);
+    dest->val[0] = vcombine_u16(low.val[0], high.val[0]);
+    dest->val[1] = vcombine_u16(low.val[1], high.val[1]);
+  } else {
+    dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high));
+    dest->val[1] =
+        vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1));
+  }
+}
+
+template <bool upsampled>
+inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const top,
+                                 const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  const int max_base_x = (4 + height - 1) << upsample_shift;
+  const int16x4_t max_base = vdup_n_s16(max_base_x);
+  const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]);
+  const int16x4_t index_offset = {0, 1, 2, 3};
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  int top_x = xstep;
+  int y = 0;
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    const int top_base_x = top_x >> index_scale_bits;
+
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    // Use signed values to compare |top_base_x| to |max_base_x|.
+    const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset);
+    const uint16x4_t max_base_mask = vclt_s16(base_x, max_base);
+
+    uint16x4x2_t sampled_top_row;
+    LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+    const uint16x4_t combined = WeightedBlend(
+        sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+    // If |upsampled| is true then extract every other value for output.
+    const uint16x4_t masked_result =
+        vbsl_u16(max_base_mask, combined, final_top_val);
+
+    vst1_u16(dst, masked_result);
+  }
+  for (; y < height; ++y) {
+    Memset(dst, top[max_base_x], 4 /* width */);
+    dst += stride;
+  }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+template <bool upsampled>
+inline void DirectionalZone1_WxH(uint16_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int width,
+                                 const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const top,
+                                 const int xstep) {
+  assert(width % 8 == 0);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  const int max_base_index = (width + height - 1) << upsample_shift;
+  const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+  const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+  const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+  const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+  int top_x = xstep;
+  int y = 0;
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    // Use signed values to compare |top_base_x| to |max_base_x|.
+    int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+    int x = 0;
+    do {
+      const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+      uint16x8x2_t sampled_top_row;
+      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+      const uint16x8_t combined = WeightedBlend(
+          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+      const uint16x8_t masked_result =
+          vbslq_u16(max_base_mask, combined, final_top_val);
+      vst1q_u16(dst + x, masked_result);
+
+      base_x = vaddq_s16(base_x, block_step);
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+  for (int i = y; i < height; ++i) {
+    Memset(dst, top[max_base_index], width);
+    dst += stride;
+  }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_Large(uint16_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t stride, const int width,
+                                   const int height,
+                                   const uint16_t* LIBGAV1_RESTRICT const top,
+                                   const int xstep, const bool upsampled) {
+  assert(width % 8 == 0);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  const int max_base_index = (width + height - 1) << upsample_shift;
+  const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+  const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+  const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+  const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  const int max_no_corner_y = std::min(
+      ((max_base_index - (base_step * width)) << index_scale_bits) / xstep,
+      height);
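+  // That is, |max_no_corner_y| bounds the rows whose final 8-lane vector
+  // still reads entirely valid |top| values.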
+  // No need to check for exceeding |max_base_x| in the first loop.
+  int y = 0;
+  int top_x = xstep;
+  for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    int x = 0;
+    do {
+      uint16x8x2_t sampled_top_row;
+      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+      const uint16x8_t combined = WeightedBlend(
+          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+      vst1q_u16(dst + x, combined);
+
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    // Use signed values to compare |top_base_x| to |max_base_x|.
+    int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+    int x = 0;
+    const int min_corner_only_x =
+        std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) &
+        ~7;
+    for (; x < min_corner_only_x; x += 8, top_base_x += base_step8,
+                                  base_x = vaddq_s16(base_x, block_step)) {
+      const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+      uint16x8x2_t sampled_top_row;
+      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+      const uint16x8_t combined = WeightedBlend(
+          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+      const uint16x8_t masked_result =
+          vbslq_u16(max_base_mask, combined, final_top_val);
+      vst1q_u16(dst + x, masked_result);
+    }
+    // Corner-only section of the row.
+    Memset(dst + x, top[max_base_index], width - x);
+  }
+  for (; y < height; ++y) {
+    Memset(dst, top[max_base_index], width);
+    dst += stride;
+  }
+}
+
+void DirectionalIntraPredictorZone1_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const int width,
+    const int height, const int xstep, const bool upsampled_top) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  auto* dst = static_cast<uint16_t*>(dest);
+  stride /= sizeof(top[0]);
+
+  assert(xstep > 0);
+
+  if (xstep == 64) {
+    assert(!upsampled_top);
+    const uint16_t* top_ptr = top + 1;
+    const int width_bytes = width * sizeof(top[0]);
+    int y = height;
+    do {
+      memcpy(dst, top_ptr, width_bytes);
+      memcpy(dst + stride, top_ptr + 1, width_bytes);
+      memcpy(dst + 2 * stride, top_ptr + 2, width_bytes);
+      memcpy(dst + 3 * stride, top_ptr + 3, width_bytes);
+      dst += 4 * stride;
+      top_ptr += 4;
+      y -= 4;
+    } while (y != 0);
+  } else {
+    if (width == 4) {
+      if (upsampled_top) {
+        DirectionalZone1_4xH<true>(dst, stride, height, top, xstep);
+      } else {
+        DirectionalZone1_4xH<false>(dst, stride, height, top, xstep);
+      }
+    } else if (width >= 32) {
+      if (upsampled_top) {
+        DirectionalZone1_Large(dst, stride, width, height, top, xstep, true);
+      } else {
+        DirectionalZone1_Large(dst, stride, width, height, top, xstep, false);
+      }
+    } else if (upsampled_top) {
+      DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep);
+    } else {
+      DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep);
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Zone 3
+// This can be considered "the transpose of Zone 1." In Zone 1, the fractional
+// step applies when moving vertically in the destination block, connected to
+// the change in |y|, whereas in this mode, the step applies when moving
+// horizontally, connected to the change in |x|. This makes vectorization very
+// complicated in row-order, because a given vector may need source pixels that
+// span 16 or 32 pixels in steep angles, requiring multiple expensive table
+// lookups and checked loads. Rather than work in row order, it is simpler to
+// compute |dest| in column order, and then store the transposed results.
+
+// Compute 4x4 sub-blocks.
+// Example of computed sub-blocks of a 4x8 block before and after transpose:
+// 00 10 20 30             00 01 02 03
+// 01 11 21 31             10 11 12 13
+// 02 12 22 32             20 21 22 23
+// 03 13 23 33             30 31 32 33
+// -----------     -->     -----------
+// 40 50 60 70             40 41 42 43
+// 41 51 61 71             50 51 52 53
+// 42 52 62 72             60 61 62 63
+// 43 53 63 73             70 71 72 73
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep, const int base_left_y = 0) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  // Compute one column at a time, then transpose for storage.
+  uint16x4_t result[4];
+
+  int left_y = base_left_y + ystep;
+  int left_offset = left_y >> index_scale_bits;
+  int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  int shift_1 = 32 - shift_0;
+  uint16x4x2_t sampled_left_col;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  Transpose4x4(result);
+  Store4(dst, result[0]);
+  dst += stride;
+  Store4(dst, result[1]);
+  dst += stride;
+  Store4(dst, result[2]);
+  dst += stride;
+  Store4(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep, const int base_left_y = 0) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+  const uint16x8_t inverter = vdupq_n_u16(32);
+
+  uint16x8x2_t sampled_left_col;
+  // Compute two columns at a time, then transpose for storage.
+  uint16x8_t result[4];
+
+  // The low half of pre-transpose vectors contains columns 0 through 3.
+  int left_y_low = base_left_y + ystep;
+  int left_offset_low = left_y_low >> index_scale_bits;
+  int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+  // The high half of pre-transpose vectors contains columns 4 through 7.
+  int left_y_high = left_y_low + (ystep << 2);
+  int left_offset_high = left_y_high >> index_scale_bits;
+  int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+  uint16x8_t weights_0 =
+      vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+  uint16x8_t weights_1 = vsubq_u16(inverter, weights_0);
+  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+                  &left[left_offset_high], upsampled);
+  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            weights_1, weights_0);
+
+  left_y_low += ystep;
+  left_offset_low = left_y_low >> index_scale_bits;
+  shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+  left_y_high += ystep;
+  left_offset_high = left_y_high >> index_scale_bits;
+  shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+  weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+  weights_1 = vsubq_u16(inverter, weights_0);
+  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+                  &left[left_offset_high], upsampled);
+  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            weights_1, weights_0);
+
+  left_y_high += ystep;
+  left_y_low += ystep;
+  left_offset_low = left_y_low >> index_scale_bits;
+  shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+  left_offset_high = left_y_high >> index_scale_bits;
+  shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+  weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+  weights_1 = vsubq_u16(inverter, weights_0);
+  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+                  &left[left_offset_high], upsampled);
+  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            weights_1, weights_0);
+
+  left_y_low += ystep;
+  left_offset_low = left_y_low >> index_scale_bits;
+  shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+  left_y_high += ystep;
+  left_offset_high = left_y_high >> index_scale_bits;
+  shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+  weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+  weights_1 = vsubq_u16(inverter, weights_0);
+  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+                  &left[left_offset_high], upsampled);
+  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            weights_1, weights_0);
+
+  Transpose4x8(result);
+  Store8(dst, result[0]);
+  dst += stride;
+  Store8(dst, result[1]);
+  dst += stride;
+  Store8(dst, result[2]);
+  dst += stride;
+  Store8(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep, const int base_left_y = 0) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  // Compute one column at a time, then transpose for storage.
+  uint16x8_t result[4];
+
+  int left_y = base_left_y + ystep;
+  int left_offset = left_y >> index_scale_bits;
+  int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  int shift_1 = 32 - shift_0;
+  uint16x8x2_t sampled_left_col;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  Transpose4x8(result);
+  Store4(dst, vget_low_u16(result[0]));
+  dst += stride;
+  Store4(dst, vget_low_u16(result[1]));
+  dst += stride;
+  Store4(dst, vget_low_u16(result[2]));
+  dst += stride;
+  Store4(dst, vget_low_u16(result[3]));
+  dst += stride;
+  Store4(dst, vget_high_u16(result[0]));
+  dst += stride;
+  Store4(dst, vget_high_u16(result[1]));
+  dst += stride;
+  Store4(dst, vget_high_u16(result[2]));
+  dst += stride;
+  Store4(dst, vget_high_u16(result[3]));
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride, const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep) {
+  assert(height == 8 || height == 16);
+  const int upsample_shift = static_cast<int>(upsampled);
+  DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep);
+  if (height == 16) {
+    dest += stride << 3;
+    DirectionalZone3_4x8<upsampled>(dest, stride, left + (8 << upsample_shift),
+                                    ystep);
+  }
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride, const int width,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep) {
+  assert(width <= 16);
+  if (width == 4) {
+    DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep);
+    return;
+  }
+  DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep);
+  if (width == 16) {
+    const int base_left_y = ystep << 3;
+    DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left,
+                                    ystep, base_left_y);
+  }
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_8x8(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep, const int base_left_y = 0) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  // Compute one column at a time, then transpose for storage.
+  uint16x8_t result[8];
+
+  int left_y = base_left_y + ystep;
+  uint16x8x2_t sampled_left_col;
+  int left_offset = left_y >> index_scale_bits;
+  int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  int shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  Transpose8x8(result);
+  Store8(dest, result[0]);
+  dest += stride;
+  Store8(dest, result[1]);
+  dest += stride;
+  Store8(dest, result[2]);
+  dest += stride;
+  Store8(dest, result[3]);
+  dest += stride;
+  Store8(dest, result[4]);
+  dest += stride;
+  Store8(dest, result[5]);
+  dest += stride;
+  Store8(dest, result[6]);
+  dest += stride;
+  Store8(dest, result[7]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_WxH(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride, const int width,
+                                 const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  // Zone3 never runs out of left_column values.
+  assert((width + height - 1) << upsample_shift >  // max_base_y
+         ((ystep * width) >> (6 - upsample_shift)) +
+             (/* base_step */ 1 << upsample_shift) *
+                 (height - 1));  // left_base_y
+  int y = 0;
+  do {
+    int x = 0;
+    uint8_t* dst_x = dest + y * stride;
+    do {
+      const int base_left_y = ystep * x;
+      DirectionalZone3_8x8<upsampled>(
+          dst_x, stride, left + (y << upsample_shift), ystep, base_left_y);
+      dst_x += 8 * sizeof(uint16_t);
+      x += 8;
+    } while (x < width);
+    y += 8;
+  } while (y < height);
+}
+
+void DirectionalIntraPredictorZone3_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int ystep, const bool upsampled_left) {
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  if (ystep == 64) {
+    assert(!upsampled_left);
+    const int width_bytes = width * sizeof(left[0]);
+    const uint16_t* left_ptr = left + 1;
+    int y = height;
+    do {
+      memcpy(dst, left_ptr, width_bytes);
+      memcpy(dst + stride, left_ptr + 1, width_bytes);
+      memcpy(dst + 2 * stride, left_ptr + 2, width_bytes);
+      memcpy(dst + 3 * stride, left_ptr + 3, width_bytes);
+      dst += 4 * stride;
+      left_ptr += 4;
+      y -= 4;
+    } while (y != 0);
+    return;
+  }
+  if (height == 4) {
+    if (upsampled_left) {
+      DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+    } else {
+      DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+    }
+  } else if (width == 4) {
+    if (upsampled_left) {
+      DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+    } else {
+      DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+    }
+  } else {
+    if (upsampled_left) {
+      // |upsampled_left| can only be true if |width| + |height| <= 16,
+      // therefore this is 8x8.
+      DirectionalZone3_8x8<true>(dst, stride, left, ystep);
+    } else {
+      DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep);
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Zone2
+// This function deals with cases not found in zone 1 or zone 3. The extreme
+// angles are 93, which makes for sharp ascents along |left_column| with each
+// successive dest row element until reaching |top_row|, and 177, with a shallow
+// ascent up |left_column| until reaching large jumps along |top_row|. In the
+// extremely steep cases, source vectors can only be loaded one lane at a time.
+
+// Fill |left| and |right| with the appropriate values for a given |base_step|.
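+// The two vectors in |mixed| form a 32-byte table covering 16 pixels; each
+// lane of |left_step| and |right_step| selects a single byte, so every 16-bit
+// pixel is assembled from a pair of adjacent byte lookups.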
+inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
+                         const uint8x8_t left_step, const uint8x8_t right_step,
+                         uint16x4_t* left, uint16x4_t* right) {
+  const uint8x16x2_t mixed = {
+      vld1q_u8(static_cast<const uint8_t*>(source)),
+      vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
+  *left = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step));
+  *right = vreinterpret_u16_u8(VQTbl2U8(mixed, right_step));
+}
+
+inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
+                         const uint8x8_t left_step_0,
+                         const uint8x8_t right_step_0,
+                         const uint8x8_t left_step_1,
+                         const uint8x8_t right_step_1, uint16x8_t* left,
+                         uint16x8_t* right) {
+  const uint8x16x2_t mixed = {
+      vld1q_u8(static_cast<const uint8_t*>(source)),
+      vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
+  const uint16x4_t left_low = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_0));
+  const uint16x4_t left_high =
+      vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_1));
+  *left = vcombine_u16(left_low, left_high);
+  const uint16x4_t right_low =
+      vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_0));
+  const uint16x4_t right_high =
+      vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_1));
+  *right = vcombine_u16(right_low, right_high);
+}
+
+// Blend two values based on weight pairs that each sum to 32.
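+// Concretely, the result is (a * a_weight + b * b_weight + 16) >> 5, a
+// rounded fixed-point blend in which the weights act as 5-bit fractions.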
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+                                const uint16x4_t a_weight,
+                                const uint16x4_t b_weight) {
+  const uint16x4_t a_product = vmul_u16(a, a_weight);
+  const uint16x4_t sum = vmla_u16(a_product, b, b_weight);
+
+  return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative in the functions below.
+// This is accommodated by making sure the relative indices are within [-15, 0]
+// when the function is called, and sliding them into the inclusive range
+// [0, 15], relative to a lower base address. 15 is the pixel offset, so 30 is
+// the byte offset for table lookups.
+
+constexpr int kPositiveIndexOffsetPixels = 15;
+constexpr int kPositiveIndexOffsetBytes = 30;
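+
+// For example, a relative index of -3 pixels is a byte offset of -6; adding
+// kPositiveIndexOffsetBytes turns it into byte indices 24 and 25 of the
+// 32-byte table loaded kPositiveIndexOffsetPixels before the base address.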
+
+inline void DirectionalZone2FromLeftCol_4xH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x4_t left_y,
+    const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+
+  const int index_scale_bits = 6;
+  // The values in |offset_y| are negative, except for the first element, which
+  // is zero.
+  int16x4_t offset_y;
+  int16x4_t shift_upsampled = left_y;
+  // The shift argument to these intrinsics must be a compile-time constant,
+  // so branch on |upsampled| rather than shifting by |upsample_shift|
+  // directly.
+  if (upsampled) {
+    offset_y = vshr_n_s16(left_y, index_scale_bits - 1 /*upsample_shift*/);
+    shift_upsampled = vshl_n_s16(shift_upsampled, 1);
+  } else {
+    offset_y = vshr_n_s16(left_y, index_scale_bits);
+  }
+  offset_y = vshl_n_s16(offset_y, 1);
+
+  // Select values to the left of the starting point.
+  // The 15th element (and 16th) will be all the way at the end, to the
+  // right. With a negative ystep everything else will be "left" of them.
+  // This supports cumulative steps up to 15. We could support up to 16 by
+  // doing separate loads for |left_values| and |right_values|. vtbl
+  // supports 2 Q registers as input which would allow for cumulative
+  // offsets of 32.
+  // |sampler_0| indexes the first byte of each 16-bit value.
+  const int16x4_t sampler_0 =
+      vadd_s16(offset_y, vdup_n_s16(kPositiveIndexOffsetBytes));
+  // |sampler_1| indexes the second byte of each 16-bit value.
+  const int16x4_t sampler_1 = vadd_s16(sampler_0, vdup_n_s16(1));
+  const int16x4x2_t sampler = vzip_s16(sampler_0, sampler_1);
+  const uint8x8_t left_indices =
+      vqmovun_s16(vcombine_s16(sampler.val[0], sampler.val[1]));
+  const uint8x8_t right_indices =
+      vadd_u8(left_indices, vdup_n_u8(sizeof(uint16_t)));
+
+  const int16x4_t shift_masked = vand_s16(shift_upsampled, vdup_n_s16(0x3f));
+  const uint16x4_t shift_0 = vreinterpret_u16_s16(vshr_n_s16(shift_masked, 1));
+  const uint16x4_t shift_1 = vsub_u16(vdup_n_u16(32), shift_0);
+
+  int y = 0;
+  do {
+    uint16x4_t src_left, src_right;
+    LoadStepwise(
+        left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
+        left_indices, right_indices, &src_left, &src_right);
+    const uint16x4_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);
+
+    Store4(dst, val);
+    dst += stride;
+  } while (++y < height);
+}
+
+inline void DirectionalZone2FromLeftCol_8x8(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
+    const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+
+  const int index_scale_bits = 6;
+  // The values in |offset_y| are negative, except for the first element, which
+  // is zero.
+  int16x8_t offset_y;
+  int16x8_t shift_upsampled = left_y;
+  // The shift argument to these intrinsics must be a compile-time constant,
+  // so branch on |upsampled| rather than shifting by |upsample_shift|
+  // directly.
+  if (upsampled) {
+    offset_y = vshrq_n_s16(left_y, index_scale_bits - 1);
+    shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
+  } else {
+    offset_y = vshrq_n_s16(left_y, index_scale_bits);
+  }
+  offset_y = vshlq_n_s16(offset_y, 1);
+
+  // Select values to the left of the starting point.
+  // The 15th element (and 16th) will be all the way at the end, to the right.
+  // With a negative ystep everything else will be "left" of them.
+  // This supports cumulative steps up to 15. We could support up to 16 by doing
+  // separate loads for |left_values| and |right_values|. vtbl supports 2 Q
+  // registers as input which would allow for cumulative offsets of 32.
+  // |sampler_0| indexes the first byte of each 16-bit value.
+  const int16x8_t sampler_0 =
+      vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffsetBytes));
+  // |sampler_1| indexes the second byte of each 16-bit value.
+  const int16x8_t sampler_1 = vaddq_s16(sampler_0, vdupq_n_s16(1));
+  const int16x8x2_t sampler = vzipq_s16(sampler_0, sampler_1);
+  const uint8x8_t left_values_0 = vqmovun_s16(sampler.val[0]);
+  const uint8x8_t left_values_1 = vqmovun_s16(sampler.val[1]);
+  const uint8x8_t right_values_0 =
+      vadd_u8(left_values_0, vdup_n_u8(sizeof(uint16_t)));
+  const uint8x8_t right_values_1 =
+      vadd_u8(left_values_1, vdup_n_u8(sizeof(uint16_t)));
+
+  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
+  const uint16x8_t shift_0 =
+      vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
+  const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);
+
+  for (int y = 0; y < 8; ++y) {
+    uint16x8_t src_left, src_right;
+    LoadStepwise(
+        left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
+        left_values_0, right_values_0, left_values_1, right_values_1, &src_left,
+        &src_right);
+    const uint16x8_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);
+
+    Store8(dst, val);
+    dst += stride;
+  }
+}
+
+template <bool upsampled>
+inline void DirectionalZone1Blend_4xH(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+    const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits_x = 6 - upsample_shift;
+
+  // Representing positions along the row, which |zone_bounds| will target for
+  // the blending boundary.
+  const int16x4_t indices = {0, 1, 2, 3};
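+  // For example, when |zone_bounds| >> 6 == 2, lanes 2 and 3 take the
+  // top-derived value computed below, while lanes 0 and 1 keep the
+  // left-derived value already present in |dest|.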
+
+  uint16x4x2_t top_vals;
+  int y = height;
+  do {
+    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
+    LoadEdgeVals(&top_vals, src, upsampled);
+
+    const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    const uint16x4_t val =
+        WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);
+
+    const uint16x4_t dst_blend = Load4U16(dest);
+    // |zone_bounds| values can be negative.
+    const uint16x4_t blend = vcge_s16(indices, vdup_n_s16(zone_bounds >> 6));
+    const uint16x4_t output = vbsl_u16(blend, val, dst_blend);
+
+    Store4(dest, output);
+    dest += stride;
+    zone_bounds += xstep;
+    top_x -= xstep;
+  } while (--y != 0);
+}
+
+template <bool upsampled>
+inline void DirectionalZone1Blend_8x8(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+    const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits_x = 6 - upsample_shift;
+
+  // Representing positions along the row, which |zone_bounds| will target for
+  // the blending boundary.
+  const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  uint16x8x2_t top_vals;
+  for (int y = 0; y < 8; ++y) {
+    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
+    LoadEdgeVals(&top_vals, src, upsampled);
+
+    const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    const uint16x8_t val =
+        WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);
+
+    const uint16x8_t dst_blend = Load8U16(dest);
+    // |zone_bounds| values can be negative.
+    const uint16x8_t blend = vcgeq_s16(indices, vdupq_n_s16(zone_bounds >> 6));
+    const uint16x8_t output = vbslq_u16(blend, val, dst_blend);
+
+    Store8(dest, output);
+    dest += stride;
+    zone_bounds += xstep;
+    top_x -= xstep;
+  }
+}
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in sections
+// that permit it.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_4xH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+
+  // Helper vector for index computation.
+  const int16x4_t zero_to_three = {0, 1, 2, 3};
+
+  // Loop increments for moving by block (4xN). The vertical step is still 8;
+  // if |height| is only 4, the work finishes in the first iteration.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+
+  const int min_height = (height == 4) ? 4 : 8;
+
+  // With |width| fixed at 4 there is no |min_top_only_x| split as in the wide
+  // version; all stage boundaries below are y-values. This assumes |xstep| is
+  // at least 3.
+  assert(xstep >= 3);
+
+  // Offsets the original zone bound value to simplify the comparison
+  // x < (y + 1) * xstep / 64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. The following values need the full ystep as a relative offset.
+  const int16x4_t left_y =
+      vmla_n_s16(vdup_n_s16(-ystep_remainder), zero_to_three, -ystep);
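+  // For example, ystep == 90 gives ystep_remainder == 26 and
+  // left_y == {-26, -116, -206, -296}: each lane reaches a further -90/64ths
+  // of a pixel up the left column.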
+
+  // This loop treats the 4 columns in 3 stages with y-value boundaries.
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  // Round down to the nearest multiple of 8 (or 4, if height is 4).
+  const int max_top_only_y =
+      std::min((1 << 6) / xstep, height) & ~(min_height - 1);
+  DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
+                                      stride >> 1, max_top_only_y, top_row,
+                                      -xstep);
+
+  if (max_top_only_y == height) return;
+
+  int y = max_top_only_y;
+  dst += stride * y;
+  const int xstep_y = xstep * y;
+
+  // All rows from |min_left_only_y| down for this set of columns only need
+  // |left_column| to compute.
+  const int min_left_only_y = std::min((4 /*width*/ << 6) / xstep, height);
+  int xstep_bounds = xstep_bounds_base + xstep_y;
+  int top_x = -xstep - xstep_y;
+
+  // +8 increment is OK because if height is 4 this only runs once.
+  for (; y < min_left_only_y;
+       y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+    DirectionalZone2FromLeftCol_4xH(
+        dst, stride, min_height,
+        left_column + ((y - left_base_increment) << upsample_left_shift),
+        left_y, upsampled_left);
+
+    DirectionalZone1Blend_4xH<upsampled_top>(dst, stride, min_height, top_row,
+                                             xstep_bounds, top_x, xstep);
+  }
+
+  // Left-only section. |height| - |y| == 4 can only occur when y == 0 and
+  // height == 4.
+  if (height - y == 4) {
+    DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep);
+    return;
+  }
+  if (y < height) {
+    DirectionalZone3_4xH<upsampled_left>(
+        dst, stride, height - y, left_column + (y << upsample_left_shift),
+        -ystep);
+  }
+}
+
+// Process 8x4 and 16x4 blocks. This avoids a lot of overhead and simplifies
+// reasoning about address safety.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_Wx4(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
+    const int xstep, const int ystep) {
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  // Offsets the original zone bound value to simplify the comparison
+  // x < (y + 1) * xstep / 64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int min_top_only_x = std::min((4 * xstep) >> 6, width);
+  int x = 0;
+  for (; x < min_top_only_x; x += 4, xstep_bounds_base -= (4 << 6)) {
+    uint8_t* dst_x = dst + x * sizeof(uint16_t);
+
+    // Round down to the nearest multiple of 4.
+    const int max_top_only_y = (((x + 1) << 6) / xstep) & ~3;
+    if (max_top_only_y != 0) {
+      DirectionalZone1_4xH<upsampled_top>(
+          reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 4,
+          top_row + (x << upsample_top_shift), -xstep);
+      continue;
+    }
+
+    DirectionalZone3_4x4<upsampled_left>(dst_x, stride, left_column, -ystep,
+                                         -ystep * x);
+
+    const int min_left_only_y = ((x + 4) << 6) / xstep;
+    if (min_left_only_y != 0) {
+      const int top_x = -xstep;
+      DirectionalZone1Blend_4xH<upsampled_top>(
+          dst_x, stride, 4, top_row + (x << upsample_top_shift),
+          xstep_bounds_base, top_x, xstep);
+    }
+  }
+  // Reached |min_top_only_x|.
+  for (; x < width; x += 4) {
+    DirectionalZone1_4xH<upsampled_top>(
+        reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, 4,
+        top_row + (x << upsample_top_shift), -xstep);
+  }
+}
+
+template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_8xH(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep, const int x, const int left_offset,
+    const int xstep_bounds_base, const int16x8_t left_y) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Loop increments for moving by block (8x8). This function also handles
+  // blocks of height 4; those are computed in a single pass, so these
+  // variables go unused in that case.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  uint8_t* dst_x = dst + x * sizeof(uint16_t);
+  // Round down to the nearest multiple of 8.
+  const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+  DirectionalZone1_WxH<upsampled_top>(
+      reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
+      top_row + (x << upsample_top_shift), -xstep);
+
+  if (max_top_only_y == height) return;
+
+  int y = max_top_only_y;
+  dst_x += stride * y;
+  const int xstep_y = xstep * y;
+
+  // All rows from |min_left_only_y| down for this set of columns only need
+  // |left_column| to compute. Round up to the nearest 8.
+  const int min_left_only_y =
+      Align(std::min(((x + 8) << 6) / xstep, height), 8);
+  int xstep_bounds = xstep_bounds_base + xstep_y;
+  int top_x = -xstep - xstep_y;
+
+  for (; y < min_left_only_y;
+       y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+    if (shuffle_left_column) {
+      DirectionalZone2FromLeftCol_8x8(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y,
+          upsampled_left);
+    } else {
+      DirectionalZone3_8x8<upsampled_left>(
+          dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+          -ystep * x);
+    }
+
+    DirectionalZone1Blend_8x8<upsampled_top>(
+        dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x,
+        xstep);
+  }
+
+  // Loop over y for left_only rows.
+  for (; y < height; y += 8, dst_x += stride8) {
+    DirectionalZone3_8x8<upsampled_left>(
+        dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+        -ystep * x);
+  }
+}
+
+// Process a multiple of 8 |width|.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_NEON(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep) {
+  if (height == 4) {
+    DirectionalZone2_Wx4<upsampled_top, upsampled_left>(
+        dst, stride, top_row, left_column, width, xstep, ystep);
+    return;
+  }
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Helper vector.
+  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  const int ystep8 = ystep << 3;
+
+  // All columns from |min_top_only_x| to the right will only need |top_row| to
+  // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+  // at least 3.
+  assert(xstep >= 3);
+  const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8);
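+  // For example, height == 16 and xstep == 90 give (height * xstep) >> 6 ==
+  // 22, so with width == 32, min_top_only_x == Align(22, 8) == 24 and columns
+  // 24 and up never reference |left_column|.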
+  // Analysis finds that, for most angles (ystep < 132), all segments that use
+  // both top_row and left_column can compute from left_column using byte
+  // shuffles from a single vector. For steeper angles, the shuffle is also
+  // fully reliable when x >= 32.
+  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
+
+  // Offsets the original zone bound value to simplify the comparison
+  // x < (y + 1) * xstep / 64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  const int left_base_increment8 = ystep8 >> 6;
+  const int ystep_remainder8 = ystep8 & 0x3F;
+  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. The following values need the full ystep as a relative offset.
+  int16x8_t left_y =
+      vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);
+
+  int x = 0;
+  for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+           xstep_bounds_base -= (8 << 6),
+           left_y = vsubq_s16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    DirectionalZone2_8xH<false, upsampled_top, upsampled_left>(
+        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+        xstep_bounds_base, left_y);
+  }
+  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
+           xstep_bounds_base -= (8 << 6),
+           left_y = vsubq_s16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    DirectionalZone2_8xH<true, upsampled_top, upsampled_left>(
+        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+        xstep_bounds_base, left_y);
+  }
+  // Reached |min_top_only_x|.
+  if (x < width) {
+    DirectionalZone1_WxH<upsampled_top>(
+        reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, width - x, height,
+        top_row + (x << upsample_top_shift), -xstep);
+  }
+}
+
+// At this angle, neither edge is upsampled.
+// |min_width| is either 4 or 8.
+template <int min_width>
+void DirectionalAngle135(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+                         const uint16_t* LIBGAV1_RESTRICT const top,
+                         const uint16_t* LIBGAV1_RESTRICT const left,
+                         const int width, const int height) {
+  // The y = 0 row is a straight copy beginning at top[-1].
+  memcpy(dst, top - 1, width * sizeof(top[0]));
+  dst += stride;
+
+  // If |height| > |width|, then there is a row below which |top_row| no
+  // longer contributes to the prediction.
+  const int min_left_only_y = std::min(width, height);
+
+  int y = 1;
+  do {
+    // Example: If y is 4 (min_width), the dest row starts with left[3],
+    // left[2], left[1], left[0], because the angle points up. Therefore, load
+    // starts at left[0] and is then reversed. If y is 2, the load starts at
+    // left[-2], and is reversed to store left[1], left[0], with negative values
+    // overwritten from |top_row|.
+    const uint16_t* const load_left = left + y - min_width;
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+
+    // Some values will be overwritten when |y| is not a multiple of
+    // |min_width|.
+    if (min_width == 4) {
+      const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left));
+      vst1_u16(dst16, left_toward_corner);
+    } else {
+      int x = 0;
+      do {
+        const uint16x8_t left_toward_corner =
+            vrev64q_u16(vld1q_u16(load_left - x));
+        vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
+        vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
+        x += 8;
+      } while (x < y);
+    }
+    // Entering |top|.
+    memcpy(dst16 + y, top - 1, (width - y) * sizeof(top[0]));
+    dst += stride;
+  } while (++y < min_left_only_y);
+
+  // Left only.
+  for (; y < height; ++y, dst += stride) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16_t* const load_left = left + y - min_width;
+
+    int x = 0;
+    if (min_width == 4) {
+      const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left - x));
+      vst1_u16(dst16 + x, left_toward_corner);
+    } else {
+      do {
+        const uint16x8_t left_toward_corner =
+            vrev64q_u16(vld1q_u16(load_left - x));
+        vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
+        vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
+        x += 8;
+      } while (x < width);
+    }
+  }
+}
+
+void DirectionalIntraPredictorZone2_NEON(
+    void* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
+  // Increasing the negative buffer for this function allows more rows to be
+  // processed at a time without branching in an inner loop to check the base.
+  uint16_t top_buffer[288];
+  uint16_t left_buffer[288];
+#if LIBGAV1_MSAN
+  memset(top_buffer, 0, sizeof(top_buffer));
+  memset(left_buffer, 0, sizeof(left_buffer));
+#endif  // LIBGAV1_MSAN
+  memcpy(top_buffer + 128, static_cast<const uint16_t*>(top_row) - 16, 160);
+  memcpy(left_buffer + 128, static_cast<const uint16_t*>(left_column) - 16,
+         160);
+  const uint16_t* top_ptr = top_buffer + 144;
+  const uint16_t* left_ptr = left_buffer + 144;
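+  // |top_ptr| and |left_ptr| now mirror the source edges for indices
+  // [-16, 63] (the 160-byte copies above), while indices down to -144 remain
+  // in-bounds scratch for speculative negative-index loads.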
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  if (width == 4) {
+    if (xstep == 64) {
+      assert(ystep == 64);
+      DirectionalAngle135<4>(dst, stride, top_ptr, left_ptr, width, height);
+      return;
+    }
+    if (upsampled_top) {
+      if (upsampled_left) {
+        DirectionalZone2_4xH<true, true>(dst, stride, top_ptr, left_ptr, height,
+                                         xstep, ystep);
+      } else {
+        DirectionalZone2_4xH<true, false>(dst, stride, top_ptr, left_ptr,
+                                          height, xstep, ystep);
+      }
+    } else if (upsampled_left) {
+      DirectionalZone2_4xH<false, true>(dst, stride, top_ptr, left_ptr, height,
+                                        xstep, ystep);
+    } else {
+      DirectionalZone2_4xH<false, false>(dst, stride, top_ptr, left_ptr, height,
+                                         xstep, ystep);
+    }
+    return;
+  }
+
+  if (xstep == 64) {
+    assert(ystep == 64);
+    DirectionalAngle135<8>(dst, stride, top_ptr, left_ptr, width, height);
+    return;
+  }
+  if (upsampled_top) {
+    if (upsampled_left) {
+      DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width,
+                                        height, xstep, ystep);
+    } else {
+      DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width,
+                                         height, xstep, ystep);
+    }
+  } else if (upsampled_left) {
+    DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width,
+                                       height, xstep, ystep);
+  } else {
+    DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width,
+                                        height, xstep, ystep);
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+  dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
+  dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_directional_neon.h b/src/dsp/arm/intrapred_directional_neon.h
new file mode 100644 (file)
index 0000000..310d90b
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for
+// specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
diff --git a/src/dsp/arm/intrapred_filter_neon.cc b/src/dsp/arm/intrapred_filter_neon.cc
new file mode 100644 (file)
index 0000000..70bd62b
--- /dev/null
@@ -0,0 +1,306 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+// Transpose kFilterIntraTaps and convert the first row to unsigned values.
+//
+// With the previous orientation we were able to multiply all the input values
+// by a single tap. This required that all the input values be in one vector,
+// which called for expensive setup operations (shifts, vext, vtbl). All the
+// elements of the result needed to be summed (easy on A64 - vaddvq_s16), but
+// then the shifting, rounding, and clamping was done in GP registers.
+//
+// Switching to unsigned values allows multiplying the 8 bit inputs directly.
+// When one value was negative we needed to vmovl_u8 first so that the results
+// maintained the proper sign.
+//
+// We take this into account when summing the values by subtracting the product
+// of the first row.
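+//
+// Each output pixel is therefore
+//   Clip((taps[1..4] * top pixels + taps[5..6] * left pixels
+//         - taps[0] * top_left + 8) >> 4),
+// where the +8 rounding comes from vrshrq_n_s16(..., 4).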
+alignas(8) constexpr uint8_t kTransposedTaps[kNumFilterIntraPredictors][7][8] =
+    {{{6, 5, 3, 3, 4, 3, 3, 3},  // Original values are negative.
+      {10, 2, 1, 1, 6, 2, 2, 1},
+      {0, 10, 1, 1, 0, 6, 2, 2},
+      {0, 0, 10, 2, 0, 0, 6, 2},
+      {0, 0, 0, 10, 0, 0, 0, 6},
+      {12, 9, 7, 5, 2, 2, 2, 3},
+      {0, 0, 0, 0, 12, 9, 7, 5}},
+     {{10, 6, 4, 2, 10, 6, 4, 2},  // Original values are negative.
+      {16, 0, 0, 0, 16, 0, 0, 0},
+      {0, 16, 0, 0, 0, 16, 0, 0},
+      {0, 0, 16, 0, 0, 0, 16, 0},
+      {0, 0, 0, 16, 0, 0, 0, 16},
+      {10, 6, 4, 2, 0, 0, 0, 0},
+      {0, 0, 0, 0, 10, 6, 4, 2}},
+     {{8, 8, 8, 8, 4, 4, 4, 4},  // Original values are negative.
+      {8, 0, 0, 0, 4, 0, 0, 0},
+      {0, 8, 0, 0, 0, 4, 0, 0},
+      {0, 0, 8, 0, 0, 0, 4, 0},
+      {0, 0, 0, 8, 0, 0, 0, 4},
+      {16, 16, 16, 16, 0, 0, 0, 0},
+      {0, 0, 0, 0, 16, 16, 16, 16}},
+     {{2, 1, 1, 0, 1, 1, 1, 1},  // Original values are negative.
+      {8, 3, 2, 1, 4, 3, 2, 2},
+      {0, 8, 3, 2, 0, 4, 3, 2},
+      {0, 0, 8, 3, 0, 0, 4, 3},
+      {0, 0, 0, 8, 0, 0, 0, 4},
+      {10, 6, 4, 2, 3, 4, 4, 3},
+      {0, 0, 0, 0, 10, 6, 4, 3}},
+     {{12, 10, 9, 8, 10, 9, 8, 7},  // Original values are negative.
+      {14, 0, 0, 0, 12, 1, 0, 0},
+      {0, 14, 0, 0, 0, 12, 0, 0},
+      {0, 0, 14, 0, 0, 0, 12, 1},
+      {0, 0, 0, 14, 0, 0, 0, 12},
+      {14, 12, 11, 10, 0, 0, 1, 1},
+      {0, 0, 0, 0, 14, 12, 11, 9}}};
+
+void FilterIntraPredictor_NEON(void* LIBGAV1_RESTRICT const dest,
+                               ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column,
+                               FilterIntraPredictor pred, int width,
+                               int height) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+
+  assert(width <= 32 && height <= 32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  uint8x8_t transposed_taps[7];
+  for (int i = 0; i < 7; ++i) {
+    transposed_taps[i] = vld1_u8(kTransposedTaps[pred][i]);
+  }
+
+  uint8_t relative_top_left = top[-1];
+  const uint8_t* relative_top = top;
+  uint8_t relative_left[2] = {left[0], left[1]};
+
+  int y = 0;
+  do {
+    uint8_t* row_dst = dst;
+    int x = 0;
+    do {
+      uint16x8_t sum = vdupq_n_u16(0);
+      const uint16x8_t subtrahend =
+          vmull_u8(transposed_taps[0], vdup_n_u8(relative_top_left));
+      for (int i = 1; i < 5; ++i) {
+        sum = vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_top[i - 1]));
+      }
+      for (int i = 5; i < 7; ++i) {
+        sum =
+            vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_left[i - 5]));
+      }
+
+      const int16x8_t sum_signed =
+          vreinterpretq_s16_u16(vsubq_u16(sum, subtrahend));
+      const int16x8_t sum_shifted = vrshrq_n_s16(sum_signed, 4);
+
+      uint8x8_t sum_saturated = vqmovun_s16(sum_shifted);
+
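+      // Lanes 0-3 hold the first row of the 4x2 block and lanes 4-7 the
+      // second.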
+      StoreLo4(row_dst, sum_saturated);
+      StoreHi4(row_dst + stride, sum_saturated);
+
+      // Progress across: reuse the rightmost pixels just written as the next
+      // block's left neighbors.
+      relative_top_left = relative_top[3];
+      relative_top += 4;
+      relative_left[0] = row_dst[3];
+      relative_left[1] = row_dst[3 + stride];
+      row_dst += 4;
+      x += 4;
+    } while (x < width);
+
+    // Progress down.
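+    // The bottom row of the 4x2 blocks just written becomes the next top
+    // row, and the left column supplies the new top-left and left values.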
+    relative_top_left = left[y + 1];
+    relative_top = dst + stride;
+    relative_left[0] = left[y + 2];
+    relative_left[1] = left[y + 3];
+
+    dst += 2 * stride;
+    y += 2;
+  } while (y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->filter_intra_predictor = FilterIntraPredictor_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+alignas(kMaxAlignment) constexpr int16_t
+    kTransposedTaps[kNumFilterIntraPredictors][7][8] = {
+        {{-6, -5, -3, -3, -4, -3, -3, -3},
+         {10, 2, 1, 1, 6, 2, 2, 1},
+         {0, 10, 1, 1, 0, 6, 2, 2},
+         {0, 0, 10, 2, 0, 0, 6, 2},
+         {0, 0, 0, 10, 0, 0, 0, 6},
+         {12, 9, 7, 5, 2, 2, 2, 3},
+         {0, 0, 0, 0, 12, 9, 7, 5}},
+        {{-10, -6, -4, -2, -10, -6, -4, -2},
+         {16, 0, 0, 0, 16, 0, 0, 0},
+         {0, 16, 0, 0, 0, 16, 0, 0},
+         {0, 0, 16, 0, 0, 0, 16, 0},
+         {0, 0, 0, 16, 0, 0, 0, 16},
+         {10, 6, 4, 2, 0, 0, 0, 0},
+         {0, 0, 0, 0, 10, 6, 4, 2}},
+        {{-8, -8, -8, -8, -4, -4, -4, -4},
+         {8, 0, 0, 0, 4, 0, 0, 0},
+         {0, 8, 0, 0, 0, 4, 0, 0},
+         {0, 0, 8, 0, 0, 0, 4, 0},
+         {0, 0, 0, 8, 0, 0, 0, 4},
+         {16, 16, 16, 16, 0, 0, 0, 0},
+         {0, 0, 0, 0, 16, 16, 16, 16}},
+        {{-2, -1, -1, -0, -1, -1, -1, -1},
+         {8, 3, 2, 1, 4, 3, 2, 2},
+         {0, 8, 3, 2, 0, 4, 3, 2},
+         {0, 0, 8, 3, 0, 0, 4, 3},
+         {0, 0, 0, 8, 0, 0, 0, 4},
+         {10, 6, 4, 2, 3, 4, 4, 3},
+         {0, 0, 0, 0, 10, 6, 4, 3}},
+        {{-12, -10, -9, -8, -10, -9, -8, -7},
+         {14, 0, 0, 0, 12, 1, 0, 0},
+         {0, 14, 0, 0, 0, 12, 0, 0},
+         {0, 0, 14, 0, 0, 0, 12, 1},
+         {0, 0, 0, 14, 0, 0, 0, 12},
+         {14, 12, 11, 10, 0, 0, 1, 1},
+         {0, 0, 0, 0, 14, 12, 11, 9}}};
+
+void FilterIntraPredictor_NEON(void* LIBGAV1_RESTRICT const dest,
+                               ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column,
+                               FilterIntraPredictor pred, int width,
+                               int height) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+
+  assert(width <= 32 && height <= 32);
+
+  auto* dst = static_cast<uint16_t*>(dest);
+
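+  // |stride| is given in bytes; convert it to uint16_t units.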
+  stride >>= 1;
+
+  int16x8_t transposed_taps[7];
+  for (int i = 0; i < 7; ++i) {
+    transposed_taps[i] = vld1q_s16(kTransposedTaps[pred][i]);
+  }
+
+  uint16_t relative_top_left = top[-1];
+  const uint16_t* relative_top = top;
+  uint16_t relative_left[2] = {left[0], left[1]};
+
+  int y = 0;
+  do {
+    uint16_t* row_dst = dst;
+    int x = 0;
+    do {
+      int16x8_t sum =
+          vmulq_s16(transposed_taps[0],
+                    vreinterpretq_s16_u16(vdupq_n_u16(relative_top_left)));
+      for (int i = 1; i < 5; ++i) {
+        sum =
+            vmlaq_s16(sum, transposed_taps[i],
+                      vreinterpretq_s16_u16(vdupq_n_u16(relative_top[i - 1])));
+      }
+      for (int i = 5; i < 7; ++i) {
+        sum =
+            vmlaq_s16(sum, transposed_taps[i],
+                      vreinterpretq_s16_u16(vdupq_n_u16(relative_left[i - 5])));
+      }
+
+      const int16x8_t sum_shifted = vrshrq_n_s16(sum, 4);
+      const uint16x8_t sum_saturated = vminq_u16(
+          vreinterpretq_u16_s16(vmaxq_s16(sum_shifted, vdupq_n_s16(0))),
+          vdupq_n_u16((1 << kBitdepth10) - 1));
+
+      vst1_u16(row_dst, vget_low_u16(sum_saturated));
+      vst1_u16(row_dst + stride, vget_high_u16(sum_saturated));
+
+      // Progress across: reuse the rightmost pixels just written as the next
+      // block's left neighbors.
+      relative_top_left = relative_top[3];
+      relative_top += 4;
+      relative_left[0] = row_dst[3];
+      relative_left[1] = row_dst[3 + stride];
+      row_dst += 4;
+      x += 4;
+    } while (x < width);
+
+    // Progress down.
+    relative_top_left = left[y + 1];
+    relative_top = dst + stride;
+    relative_left[0] = left[y + 2];
+    relative_left[1] = left[y + 3];
+
+    dst += 2 * stride;
+    y += 2;
+  } while (y < height);
+}
+
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->filter_intra_predictor = FilterIntraPredictor_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredFilterInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredFilterInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_filter_neon.h b/src/dsp/arm/intrapred_filter_neon.h
new file mode 100644 (file)
index 0000000..d005f4c
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor, see the defines below for specifics.
+// These functions are not thread-safe.
+void IntraPredFilterInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
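+// Defining these to LIBGAV1_CPU_NEON signals to the dsp dispatch code that a
+// NEON implementation is available, so the C fallback need not be registered.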
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
new file mode 100644 (file)
index 0000000..d1adbdf
--- /dev/null
@@ -0,0 +1,1622 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_NEON
+
+using DcSumFunc = uint32x2_t (*)(const void* ref_0, const int ref_0_size_log2,
+                                 const bool use_ref_1, const void* ref_1,
+                                 const int ref_1_size_log2);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const uint32x2_t dc);
+
+// DC intra-predictors for square and rectangular blocks.
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+          DcStoreFunc storefn>
+struct DcPredFuncs_NEON {
+  DcPredFuncs_NEON() = delete;
+
+  static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+                    const void* left_column);
+  static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+  static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+                 const void* left_column);
+};
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+          DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
+    DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+          const void* LIBGAV1_RESTRICT const top_row,
+          const void* /*left_column*/) {
+  const uint32x2_t sum = sumfn(top_row, block_width_log2, false, nullptr, 0);
+  const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2);
+  storefn(dest, stride, dc);
+}
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+          DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
+    DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+           const void* /*top_row*/,
+           const void* LIBGAV1_RESTRICT const left_column) {
+  const uint32x2_t sum =
+      sumfn(left_column, block_height_log2, false, nullptr, 0);
+  const uint32x2_t dc = vrshr_n_u32(sum, block_height_log2);
+  storefn(dest, stride, dc);
+}
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+          DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::Dc(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const uint32x2_t sum =
+      sumfn(top_row, block_width_log2, true, left_column, block_height_log2);
+  if (block_width_log2 == block_height_log2) {
+    const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2 + 1);
+    storefn(dest, stride, dc);
+  } else {
+    // TODO(johannkoenig): Compare this to mul/shift in vectors.
+    const int divisor = (1 << block_width_log2) + (1 << block_height_log2);
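+    // E.g. a 4x8 block sums 12 edge pixels: dc = (sum + 6) / 12.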
+    uint32_t dc = vget_lane_u32(sum, 0);
+    dc += divisor >> 1;
+    dc /= divisor;
+    storefn(dest, stride, vdup_n_u32(dc));
+  }
+}
+
+// Sum all the elements in the vector into the low 32 bits.
+inline uint32x2_t Sum(const uint16x4_t val) {
+  const uint32x2_t sum = vpaddl_u16(val);
+  return vpadd_u32(sum, sum);
+}
+
+// Sum all the elements in the vector into the low 32 bits.
+inline uint32x2_t Sum(const uint16x8_t val) {
+  const uint32x4_t sum_0 = vpaddlq_u16(val);
+  const uint64x2_t sum_1 = vpaddlq_u32(sum_0);
+  return vadd_u32(vget_low_u32(vreinterpretq_u32_u64(sum_1)),
+                  vget_high_u32(vreinterpretq_u32_u64(sum_1)));
+}
+
+}  // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
+// Add and expand the elements in the |val_[01]| to uint16_t but do not sum the
+// entire vector.
+inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1) {
+  const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+  const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+  return vaddq_u16(sum_0, sum_1);
+}
+
+// Add and expand the elements in the |val_[0123]| to uint16_t but do not sum
+// the entire vector.
+inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1,
+                      const uint8x16_t val_2, const uint8x16_t val_3) {
+  const uint16x8_t sum_0 = Add(val_0, val_1);
+  const uint16x8_t sum_1 = Add(val_2, val_3);
+  return vaddq_u16(sum_0, sum_1);
+}
+
+// Load and combine 32 uint8_t values.
+inline uint16x8_t LoadAndAdd32(const uint8_t* buf) {
+  const uint8x16_t val_0 = vld1q_u8(buf);
+  const uint8x16_t val_1 = vld1q_u8(buf + 16);
+  return Add(val_0, val_1);
+}
+
+// Load and combine 64 uint8_t values.
+inline uint16x8_t LoadAndAdd64(const uint8_t* buf) {
+  const uint8x16_t val_0 = vld1q_u8(buf);
+  const uint8x16_t val_1 = vld1q_u8(buf + 16);
+  const uint8x16_t val_2 = vld1q_u8(buf + 32);
+  const uint8x16_t val_3 = vld1q_u8(buf + 48);
+  return Add(val_0, val_1, val_2, val_3);
+}
+
+// |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint8_t values.
+// If |use_ref_1| is false then only sum |ref_0|.
+// For |ref[01]_size_log2| == 2 this relies on |ref_[01]| being aligned to
+// uint32_t, since Load4 performs a single 32-bit load.
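+//
+// For example, DcDefs::_16x8::Dc (defined below) calls
+// DcSum_NEON(top_row, 4, true, left_column, 3).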
+inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
+                             const int ref_0_size_log2, const bool use_ref_1,
+                             const void* LIBGAV1_RESTRICT ref_1,
+                             const int ref_1_size_log2) {
+  const auto* const ref_0_u8 = static_cast<const uint8_t*>(ref_0);
+  const auto* const ref_1_u8 = static_cast<const uint8_t*>(ref_1);
+  if (ref_0_size_log2 == 2) {
+    uint8x8_t val = Load4(ref_0_u8);
+    if (use_ref_1) {
+      switch (ref_1_size_log2) {
+        case 2: {  // 4x4
+          val = Load4<1>(ref_1_u8, val);
+          return Sum(vpaddl_u8(val));
+        }
+        case 3: {  // 4x8
+          const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+          const uint16x4_t sum_0 = vpaddl_u8(val);
+          const uint16x4_t sum_1 = vpaddl_u8(val_1);
+          return Sum(vadd_u16(sum_0, sum_1));
+        }
+        case 4: {  // 4x16
+          const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+          return Sum(vaddw_u8(vpaddlq_u8(val_1), val));
+        }
+      }
+    }
+    // 4x1
+    const uint16x4_t sum = vpaddl_u8(val);
+    return vpaddl_u16(sum);
+  }
+  if (ref_0_size_log2 == 3) {
+    const uint8x8_t val_0 = vld1_u8(ref_0_u8);
+    if (use_ref_1) {
+      switch (ref_1_size_log2) {
+        case 2: {  // 8x4
+          const uint8x8_t val_1 = Load4(ref_1_u8);
+          const uint16x4_t sum_0 = vpaddl_u8(val_0);
+          const uint16x4_t sum_1 = vpaddl_u8(val_1);
+          return Sum(vadd_u16(sum_0, sum_1));
+        }
+        case 3: {  // 8x8
+          const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+          const uint16x4_t sum_0 = vpaddl_u8(val_0);
+          const uint16x4_t sum_1 = vpaddl_u8(val_1);
+          return Sum(vadd_u16(sum_0, sum_1));
+        }
+        case 4: {  // 8x16
+          const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+          return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0));
+        }
+        case 5: {  // 8x32
+          return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0));
+        }
+      }
+    }
+    // 8x1
+    return Sum(vpaddl_u8(val_0));
+  }
+  if (ref_0_size_log2 == 4) {
+    const uint8x16_t val_0 = vld1q_u8(ref_0_u8);
+    if (use_ref_1) {
+      switch (ref_1_size_log2) {
+        case 2: {  // 16x4
+          const uint8x8_t val_1 = Load4(ref_1_u8);
+          return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+        }
+        case 3: {  // 16x8
+          const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+          return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+        }
+        case 4: {  // 16x16
+          const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+          return Sum(Add(val_0, val_1));
+        }
+        case 5: {  // 16x32
+          const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+          const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+        case 6: {  // 16x64
+          const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+          const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+      }
+    }
+    // 16x1
+    return Sum(vpaddlq_u8(val_0));
+  }
+  if (ref_0_size_log2 == 5) {
+    const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u8);
+    if (use_ref_1) {
+      switch (ref_1_size_log2) {
+        case 3: {  // 32x8
+          const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+          return Sum(vaddw_u8(sum_0, val_1));
+        }
+        case 4: {  // 32x16
+          const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+          const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+        case 5: {  // 32x32
+          const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+        case 6: {  // 32x64
+          const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+      }
+    }
+    // 32x1
+    return Sum(sum_0);
+  }
+
+  assert(ref_0_size_log2 == 6);
+  const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u8);
+  if (use_ref_1) {
+    switch (ref_1_size_log2) {
+      case 4: {  // 64x16
+        const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+        const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      }
+      case 5: {  // 64x32
+        const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      }
+      case 6: {  // 64x64
+        const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      }
+    }
+  }
+  // 64x1
+  return Sum(sum_0);
+}
+
+template <int width, int height>
+inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
+                         const uint32x2_t dc) {
+  const uint8x16_t dc_dup = vdupq_lane_u8(vreinterpret_u8_u32(dc), 0);
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (width == 4) {
+    int i = height - 1;
+    do {
+      StoreLo4(dst, vget_low_u8(dc_dup));
+      dst += stride;
+    } while (--i != 0);
+    StoreLo4(dst, vget_low_u8(dc_dup));
+  } else if (width == 8) {
+    int i = height - 1;
+    do {
+      vst1_u8(dst, vget_low_u8(dc_dup));
+      dst += stride;
+    } while (--i != 0);
+    vst1_u8(dst, vget_low_u8(dc_dup));
+  } else if (width == 16) {
+    int i = height - 1;
+    do {
+      vst1q_u8(dst, dc_dup);
+      dst += stride;
+    } while (--i != 0);
+    vst1q_u8(dst, dc_dup);
+  } else if (width == 32) {
+    int i = height - 1;
+    do {
+      vst1q_u8(dst, dc_dup);
+      vst1q_u8(dst + 16, dc_dup);
+      dst += stride;
+    } while (--i != 0);
+    vst1q_u8(dst, dc_dup);
+    vst1q_u8(dst + 16, dc_dup);
+  } else {
+    assert(width == 64);
+    int i = height - 1;
+    do {
+      vst1q_u8(dst, dc_dup);
+      vst1q_u8(dst + 16, dc_dup);
+      vst1q_u8(dst + 32, dc_dup);
+      vst1q_u8(dst + 48, dc_dup);
+      dst += stride;
+    } while (--i != 0);
+    vst1q_u8(dst, dc_dup);
+    vst1q_u8(dst + 16, dc_dup);
+    vst1q_u8(dst + 32, dc_dup);
+    vst1q_u8(dst + 48, dc_dup);
+  }
+}
+
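+// The Paeth predictors below vectorize this per-pixel selection (a scalar
+// sketch; |base| itself is never materialized):
+//   base = top + left - top_left;
+//   left_dist = |base - left| = |top - top_left|;
+//   top_dist = |base - top| = |left - top_left|;
+//   top_left_dist = |base - top_left| = |top + left - 2 * top_left|;
+//   output whichever of left, top, top_left is closest to base, breaking
+//   ties in that order.
+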
+template <int width, int height>
+inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const top_row,
+                             const void* LIBGAV1_RESTRICT const left_column) {
+  auto* dest_u8 = static_cast<uint8_t*>(dest);
+  const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
+  const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
+
+  const uint8x8_t top_left = vdup_n_u8(top_row_u8[-1]);
+  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
+  uint8x8_t top;
+  if (width == 4) {
+    top = Load4(top_row_u8);
+  } else {  // width == 8
+    top = vld1_u8(top_row_u8);
+  }
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left = vdup_n_u8(left_col_u8[y]);
+
+    const uint8x8_t left_dist = vabd_u8(top, top_left);
+    const uint8x8_t top_dist = vabd_u8(left, top_left);
+    const uint16x8_t top_left_dist =
+        vabdq_u16(vaddl_u8(top, left), top_left_x2);
+
+    const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
+    const uint8x8_t left_le_top_left =
+        vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
+    const uint8x8_t top_le_top_left =
+        vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
+
+    // if (left_dist <= top_dist && left_dist <= top_left_dist)
+    const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
+    //   dest[x] = left_column[y];
+    // Fill all the unused spaces with 'top'. They will be overwritten when
+    // the positions for top_left are known.
+    uint8x8_t result = vbsl_u8(left_mask, left, top);
+    // else if (top_dist <= top_left_dist)
+    //   dest[x] = top_row[x];
+    // Add these values to the mask. They were already set.
+    const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
+    // else
+    //   dest[x] = top_left;
+    result = vbsl_u8(left_or_top_mask, result, top_left);
+
+    if (width == 4) {
+      StoreLo4(dest_u8, result);
+    } else {  // width == 8
+      vst1_u8(dest_u8, result);
+    }
+    dest_u8 += stride;
+  }
+}
+
+// Calculate X distance <= TopLeft distance and pack the resulting mask into
+// uint8x16_t. Narrowing |top_left_dist| with saturation is safe: |x_dist| is
+// at most 255, so clamping the 16-bit distance to 255 does not change the
+// comparison result.
+inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
+                             const uint16x8_t top_left_dist_low,
+                             const uint16x8_t top_left_dist_high) {
+  const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
+                                               vqmovn_u16(top_left_dist_high));
+  return vcleq_u8(x_dist, top_left_dist);
+}
+
+// Select the closest values and collect them.
+inline uint8x16_t SelectPaeth(const uint8x16_t top, const uint8x16_t left,
+                              const uint8x16_t top_left,
+                              const uint8x16_t left_le_top,
+                              const uint8x16_t left_le_top_left,
+                              const uint8x16_t top_le_top_left) {
+  // if (left_dist <= top_dist && left_dist <= top_left_dist)
+  const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
+  //   dest[x] = left_column[y];
+  // Fill all the unused spaces with 'top'. They will be overwritten when
+  // the positions for top_left are known.
+  uint8x16_t result = vbslq_u8(left_mask, left, top);
+  // else if (top_dist <= top_left_dist)
+  //   dest[x] = top_row[x];
+  // Add these values to the mask. They were already set.
+  const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
+  // else
+  //   dest[x] = top_left;
+  return vbslq_u8(left_or_top_mask, result, top_left);
+}
+
+// Generate numbered and high/low versions of top_left_dist.
+#define TOP_LEFT_DIST(num)                                              \
+  const uint16x8_t top_left_##num##_dist_low = vabdq_u16(               \
+      vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
+  const uint16x8_t top_left_##num##_dist_high = vabdq_u16(              \
+      vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
+
+// Generate numbered versions of XLeTopLeft with x = left.
+#define LEFT_LE_TOP_LEFT(num)                                  \
+  const uint8x16_t left_le_top_left_##num =                    \
+      XLeTopLeft(left_##num##_dist, top_left_##num##_dist_low, \
+                 top_left_##num##_dist_high)
+
+// Generate numbered versions of XLeTopLeft with x = top.
+#define TOP_LE_TOP_LEFT(num)                           \
+  const uint8x16_t top_le_top_left_##num = XLeTopLeft( \
+      top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
+
+template <int width, int height>
+inline void Paeth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest,
+                               ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  auto* dest_u8 = static_cast<uint8_t*>(dest);
+  const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
+  const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
+
+  const uint8x16_t top_left = vdupq_n_u8(top_row_u8[-1]);
+  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
+  uint8x16_t top[4];
+  top[0] = vld1q_u8(top_row_u8);
+  if (width > 16) {
+    top[1] = vld1q_u8(top_row_u8 + 16);
+    if (width == 64) {
+      top[2] = vld1q_u8(top_row_u8 + 32);
+      top[3] = vld1q_u8(top_row_u8 + 48);
+    }
+  }
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x16_t left = vdupq_n_u8(left_col_u8[y]);
+
+    const uint8x16_t top_dist = vabdq_u8(left, top_left);
+
+    const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
+    TOP_LEFT_DIST(0);
+    const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
+    LEFT_LE_TOP_LEFT(0);
+    TOP_LE_TOP_LEFT(0);
+
+    const uint8x16_t result_0 =
+        SelectPaeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
+                    top_le_top_left_0);
+    vst1q_u8(dest_u8, result_0);
+
+    if (width > 16) {
+      const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
+      TOP_LEFT_DIST(1);
+      const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
+      LEFT_LE_TOP_LEFT(1);
+      TOP_LE_TOP_LEFT(1);
+
+      const uint8x16_t result_1 =
+          SelectPaeth(top[1], left, top_left, left_1_le_top, left_le_top_left_1,
+                      top_le_top_left_1);
+      vst1q_u8(dest_u8 + 16, result_1);
+
+      if (width == 64) {
+        const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
+        TOP_LEFT_DIST(2);
+        const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
+        LEFT_LE_TOP_LEFT(2);
+        TOP_LE_TOP_LEFT(2);
+
+        const uint8x16_t result_2 =
+            SelectPaeth(top[2], left, top_left, left_2_le_top,
+                        left_le_top_left_2, top_le_top_left_2);
+        vst1q_u8(dest_u8 + 32, result_2);
+
+        const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
+        TOP_LEFT_DIST(3);
+        const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
+        LEFT_LE_TOP_LEFT(3);
+        TOP_LE_TOP_LEFT(3);
+
+        const uint8x16_t result_3 =
+            SelectPaeth(top[3], left, top_left, left_3_le_top,
+                        left_le_top_left_3, top_le_top_left_3);
+        vst1q_u8(dest_u8 + 48, result_3);
+      }
+    }
+
+    dest_u8 += stride;
+  }
+}
+
+struct DcDefs {
+  DcDefs() = delete;
+
+  using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
+  using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
+  using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
+  using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
+  using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
+  using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
+  using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
+  using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
+  using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
+  using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
+  using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
+  using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
+  using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
+  using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
+  using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
+  using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
+  using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
+  using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
+  using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
+};
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  // 4x4
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      DcDefs::_4x4::DcTop;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      DcDefs::_4x4::DcLeft;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+      DcDefs::_4x4::Dc;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<4, 4>;
+
+  // 4x8
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+      DcDefs::_4x8::DcTop;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+      DcDefs::_4x8::DcLeft;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+      DcDefs::_4x8::Dc;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<4, 8>;
+
+  // 4x16
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+      DcDefs::_4x16::DcTop;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+      DcDefs::_4x16::DcLeft;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+      DcDefs::_4x16::Dc;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<4, 16>;
+
+  // 8x4
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+      DcDefs::_8x4::DcTop;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+      DcDefs::_8x4::DcLeft;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+      DcDefs::_8x4::Dc;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<8, 4>;
+
+  // 8x8
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+      DcDefs::_8x8::DcTop;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+      DcDefs::_8x8::DcLeft;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+      DcDefs::_8x8::Dc;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<8, 8>;
+
+  // 8x16
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+      DcDefs::_8x16::DcTop;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+      DcDefs::_8x16::DcLeft;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+      DcDefs::_8x16::Dc;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<8, 16>;
+
+  // 8x32
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+      DcDefs::_8x32::DcTop;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+      DcDefs::_8x32::DcLeft;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+      DcDefs::_8x32::Dc;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<8, 32>;
+
+  // 16x4
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+      DcDefs::_16x4::DcTop;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+      DcDefs::_16x4::DcLeft;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+      DcDefs::_16x4::Dc;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<16, 4>;
+
+  // 16x8
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+      DcDefs::_16x8::DcTop;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+      DcDefs::_16x8::DcLeft;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+      DcDefs::_16x8::Dc;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<16, 8>;
+
+  // 16x16
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+      DcDefs::_16x16::DcTop;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+      DcDefs::_16x16::DcLeft;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+      DcDefs::_16x16::Dc;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<16, 16>;
+
+  // 16x32
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+      DcDefs::_16x32::DcTop;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+      DcDefs::_16x32::DcLeft;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+      DcDefs::_16x32::Dc;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<16, 32>;
+
+  // 16x64
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+      DcDefs::_16x64::DcTop;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+      DcDefs::_16x64::DcLeft;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+      DcDefs::_16x64::Dc;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<16, 64>;
+
+  // 32x8
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+      DcDefs::_32x8::DcTop;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+      DcDefs::_32x8::DcLeft;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+      DcDefs::_32x8::Dc;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<32, 8>;
+
+  // 32x16
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+      DcDefs::_32x16::DcTop;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+      DcDefs::_32x16::DcLeft;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+      DcDefs::_32x16::Dc;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<32, 16>;
+
+  // 32x32
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+      DcDefs::_32x32::DcTop;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+      DcDefs::_32x32::DcLeft;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+      DcDefs::_32x32::Dc;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<32, 32>;
+
+  // 32x64
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+      DcDefs::_32x64::DcTop;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+      DcDefs::_32x64::DcLeft;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+      DcDefs::_32x64::Dc;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<32, 64>;
+
+  // 64x16
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+      DcDefs::_64x16::DcTop;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+      DcDefs::_64x16::DcLeft;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+      DcDefs::_64x16::Dc;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<64, 16>;
+
+  // 64x32
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+      DcDefs::_64x32::DcTop;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+      DcDefs::_64x32::DcLeft;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+      DcDefs::_64x32::Dc;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<64, 32>;
+
+  // 64x64
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+      DcDefs::_64x64::DcTop;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+      DcDefs::_64x64::DcLeft;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+      DcDefs::_64x64::Dc;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<64, 64>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Add the elements in the given vectors together but do not sum the entire
+// vector.
+inline uint16x8_t Add(const uint16x8_t val_0, const uint16x8_t val_1,
+                      const uint16x8_t val_2, const uint16x8_t val_3) {
+  const uint16x8_t sum_0 = vaddq_u16(val_0, val_1);
+  const uint16x8_t sum_1 = vaddq_u16(val_2, val_3);
+  return vaddq_u16(sum_0, sum_1);
+}
+
+// Load and combine 16 uint16_t values.
+inline uint16x8_t LoadAndAdd16(const uint16_t* buf) {
+  const uint16x8_t val_0 = vld1q_u16(buf);
+  const uint16x8_t val_1 = vld1q_u16(buf + 8);
+  return vaddq_u16(val_0, val_1);
+}
+
+// Load and combine 32 uint16_t values.
+inline uint16x8_t LoadAndAdd32(const uint16_t* buf) {
+  const uint16x8_t val_0 = vld1q_u16(buf);
+  const uint16x8_t val_1 = vld1q_u16(buf + 8);
+  const uint16x8_t val_2 = vld1q_u16(buf + 16);
+  const uint16x8_t val_3 = vld1q_u16(buf + 24);
+  return Add(val_0, val_1, val_2, val_3);
+}
+
+// Load and combine 64 uint16_t values.
+inline uint16x8_t LoadAndAdd64(const uint16_t* buf) {
+  const uint16x8_t val_0 = vld1q_u16(buf);
+  const uint16x8_t val_1 = vld1q_u16(buf + 8);
+  const uint16x8_t val_2 = vld1q_u16(buf + 16);
+  const uint16x8_t val_3 = vld1q_u16(buf + 24);
+  const uint16x8_t val_4 = vld1q_u16(buf + 32);
+  const uint16x8_t val_5 = vld1q_u16(buf + 40);
+  const uint16x8_t val_6 = vld1q_u16(buf + 48);
+  const uint16x8_t val_7 = vld1q_u16(buf + 56);
+  const uint16x8_t sum_0 = Add(val_0, val_1, val_2, val_3);
+  const uint16x8_t sum_1 = Add(val_4, val_5, val_6, val_7);
+  return vaddq_u16(sum_0, sum_1);
+}
+
+// |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint16_t values.
+// If |use_ref_1| is false then only sum |ref_0|.
+inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
+                             const int ref_0_size_log2, const bool use_ref_1,
+                             const void* LIBGAV1_RESTRICT ref_1,
+                             const int ref_1_size_log2) {
+  const auto* ref_0_u16 = static_cast<const uint16_t*>(ref_0);
+  const auto* ref_1_u16 = static_cast<const uint16_t*>(ref_1);
+  if (ref_0_size_log2 == 2) {
+    const uint16x4_t val_0 = vld1_u16(ref_0_u16);
+    if (use_ref_1) {
+      switch (ref_1_size_log2) {
+        case 2: {  // 4x4
+          const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+          return Sum(vadd_u16(val_0, val_1));
+        }
+        case 3: {  // 4x8
+          const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
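+          // Zero-pad |val_0| to 8 lanes so both references can be combined
+          // with a single vaddq_u16.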
+          const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+          return Sum(vaddq_u16(sum_0, val_1));
+        }
+        case 4: {  // 4x16
+          const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+          const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+      }
+    }
+    // 4x1
+    return Sum(val_0);
+  }
+  if (ref_0_size_log2 == 3) {
+    const uint16x8_t val_0 = vld1q_u16(ref_0_u16);
+    if (use_ref_1) {
+      switch (ref_1_size_log2) {
+        case 2: {  // 8x4
+          const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+          const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+          return Sum(vaddq_u16(val_0, sum_1));
+        }
+        case 3: {  // 8x8
+          const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+          return Sum(vaddq_u16(val_0, val_1));
+        }
+        case 4: {  // 8x16
+          const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+          return Sum(vaddq_u16(val_0, sum_1));
+        }
+        case 5: {  // 8x32
+          const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+          return Sum(vaddq_u16(val_0, sum_1));
+        }
+      }
+    }
+    // 8x1
+    return Sum(val_0);
+  }
+  if (ref_0_size_log2 == 4) {
+    const uint16x8_t sum_0 = LoadAndAdd16(ref_0_u16);
+    if (use_ref_1) {
+      switch (ref_1_size_log2) {
+        case 2: {  // 16x4
+          const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+          const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+        case 3: {  // 16x8
+          const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+          return Sum(vaddq_u16(sum_0, val_1));
+        }
+        case 4: {  // 16x16
+          const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+        case 5: {  // 16x32
+          const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+        case 6: {  // 16x64
+          const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+      }
+    }
+    // 16x1
+    return Sum(sum_0);
+  }
+  if (ref_0_size_log2 == 5) {
+    const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u16);
+    if (use_ref_1) {
+      switch (ref_1_size_log2) {
+        case 3: {  // 32x8
+          const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+          return Sum(vaddq_u16(sum_0, val_1));
+        }
+        case 4: {  // 32x16
+          const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+        case 5: {  // 32x32
+          const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+        case 6: {  // 32x64
+          const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+          return Sum(vaddq_u16(sum_0, sum_1));
+        }
+      }
+    }
+    // 32x1
+    return Sum(sum_0);
+  }
+
+  assert(ref_0_size_log2 == 6);
+  const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u16);
+  if (use_ref_1) {
+    switch (ref_1_size_log2) {
+      case 4: {  // 64x16
+        const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      }
+      case 5: {  // 64x32
+        const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      }
+      case 6: {  // 64x64
+        const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      }
+    }
+  }
+  // 64x1
+  return Sum(sum_0);
+}
+
+template <int width, int height>
+inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
+                         const uint32x2_t dc) {
+  auto* dest_u16 = static_cast<uint16_t*>(dest);
+  ptrdiff_t stride_u16 = stride >> 1;
+  const uint16x8_t dc_dup = vdupq_lane_u16(vreinterpret_u16_u32(dc), 0);
+  if (width == 4) {
+    int i = height - 1;
+    do {
+      vst1_u16(dest_u16, vget_low_u16(dc_dup));
+      dest_u16 += stride_u16;
+    } while (--i != 0);
+    vst1_u16(dest_u16, vget_low_u16(dc_dup));
+  } else if (width == 8) {
+    int i = height - 1;
+    do {
+      vst1q_u16(dest_u16, dc_dup);
+      dest_u16 += stride_u16;
+    } while (--i != 0);
+    vst1q_u16(dest_u16, dc_dup);
+  } else if (width == 16) {
+    int i = height - 1;
+    do {
+      vst1q_u16(dest_u16, dc_dup);
+      vst1q_u16(dest_u16 + 8, dc_dup);
+      dest_u16 += stride_u16;
+    } while (--i != 0);
+    vst1q_u16(dest_u16, dc_dup);
+    vst1q_u16(dest_u16 + 8, dc_dup);
+  } else if (width == 32) {
+    int i = height - 1;
+    do {
+      vst1q_u16(dest_u16, dc_dup);
+      vst1q_u16(dest_u16 + 8, dc_dup);
+      vst1q_u16(dest_u16 + 16, dc_dup);
+      vst1q_u16(dest_u16 + 24, dc_dup);
+      dest_u16 += stride_u16;
+    } while (--i != 0);
+    vst1q_u16(dest_u16, dc_dup);
+    vst1q_u16(dest_u16 + 8, dc_dup);
+    vst1q_u16(dest_u16 + 16, dc_dup);
+    vst1q_u16(dest_u16 + 24, dc_dup);
+  } else {
+    assert(width == 64);
+    int i = height - 1;
+    do {
+      vst1q_u16(dest_u16, dc_dup);
+      vst1q_u16(dest_u16 + 8, dc_dup);
+      vst1q_u16(dest_u16 + 16, dc_dup);
+      vst1q_u16(dest_u16 + 24, dc_dup);
+      vst1q_u16(dest_u16 + 32, dc_dup);
+      vst1q_u16(dest_u16 + 40, dc_dup);
+      vst1q_u16(dest_u16 + 48, dc_dup);
+      vst1q_u16(dest_u16 + 56, dc_dup);
+      dest_u16 += stride_u16;
+    } while (--i != 0);
+    vst1q_u16(dest_u16, dc_dup);
+    vst1q_u16(dest_u16 + 8, dc_dup);
+    vst1q_u16(dest_u16 + 16, dc_dup);
+    vst1q_u16(dest_u16 + 24, dc_dup);
+    vst1q_u16(dest_u16 + 32, dc_dup);
+    vst1q_u16(dest_u16 + 40, dc_dup);
+    vst1q_u16(dest_u16 + 48, dc_dup);
+    vst1q_u16(dest_u16 + 56, dc_dup);
+  }
+}
+
+struct DcDefs {
+  DcDefs() = delete;
+
+  using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
+  using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
+  using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
+  using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
+  using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
+  using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
+  using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
+  using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
+  using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
+  using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
+  using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
+  using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
+  using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
+  using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
+  using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
+  using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
+  using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
+  using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
+  using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
+};
+
+// IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows
+
+template <int block_height>
+void Horizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                        const void* /*top_row*/,
+                        const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = 0;
+  do {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16x4_t row = vld1_dup_u16(left + y);
+    vst1_u16(dst16, row);
+    dst += stride;
+  } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                        const void* /*top_row*/,
+                        const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = 0;
+  do {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16x8_t row = vld1q_dup_u16(left + y);
+    vst1q_u16(dst16, row);
+    dst += stride;
+  } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                         const void* /*top_row*/,
+                         const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = 0;
+  do {
+    const uint16x8_t row0 = vld1q_dup_u16(left + y);
+    const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    vst1q_u16(dst16, row0);
+    vst1q_u16(dst16 + 8, row0);
+    dst += stride;
+    dst16 = reinterpret_cast<uint16_t*>(dst);
+    vst1q_u16(dst16, row1);
+    vst1q_u16(dst16 + 8, row1);
+    dst += stride;
+    y += 2;
+  } while (y < block_height);
+}
+
+template <int block_height>
+void Horizontal32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                         const void* /*top_row*/,
+                         const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = 0;
+  do {
+    const uint16x8_t row0 = vld1q_dup_u16(left + y);
+    const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    vst1q_u16(dst16, row0);
+    vst1q_u16(dst16 + 8, row0);
+    vst1q_u16(dst16 + 16, row0);
+    vst1q_u16(dst16 + 24, row0);
+    dst += stride;
+    dst16 = reinterpret_cast<uint16_t*>(dst);
+    vst1q_u16(dst16, row1);
+    vst1q_u16(dst16 + 8, row1);
+    vst1q_u16(dst16 + 16, row1);
+    vst1q_u16(dst16 + 24, row1);
+    dst += stride;
+    y += 2;
+  } while (y < block_height);
+}
+
+// IntraPredFuncs_NEON::Vertical -- copy top row to all rows
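+//
+// Vertical prediction is a straight copy, so the 10-bit rows can be handled
+// as raw bytes: e.g. Vertical4xH_NEON moves 4 uint16_t pixels with one
+// 8-byte vld1_u8/vst1_u8 pair.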
+
+template <int block_height>
+void Vertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* const /*left_column*/) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint8x8_t row = vld1_u8(top);
+  int y = block_height;
+  do {
+    vst1_u8(dst, row);
+    dst += stride;
+  } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* const /*left_column*/) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint8x16_t row = vld1q_u8(top);
+  int y = block_height;
+  do {
+    vst1q_u8(dst, row);
+    dst += stride;
+  } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* const /*left_column*/) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint8x16_t row0 = vld1q_u8(top);
+  const uint8x16_t row1 = vld1q_u8(top + 16);
+  int y = block_height;
+  do {
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    dst += stride;
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    dst += stride;
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int block_height>
+void Vertical32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* const /*left_column*/) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint8x16_t row0 = vld1q_u8(top);
+  const uint8x16_t row1 = vld1q_u8(top + 16);
+  const uint8x16_t row2 = vld1q_u8(top + 32);
+  const uint8x16_t row3 = vld1q_u8(top + 48);
+  int y = block_height;
+  do {
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    vst1q_u8(dst + 32, row2);
+    vst1q_u8(dst + 48, row3);
+    dst += stride;
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    vst1q_u8(dst + 32, row2);
+    vst1q_u8(dst + 48, row3);
+    dst += stride;
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int block_height>
+void Vertical64xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* const /*left_column*/) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint8x16_t row0 = vld1q_u8(top);
+  const uint8x16_t row1 = vld1q_u8(top + 16);
+  const uint8x16_t row2 = vld1q_u8(top + 32);
+  const uint8x16_t row3 = vld1q_u8(top + 48);
+  const uint8x16_t row4 = vld1q_u8(top + 64);
+  const uint8x16_t row5 = vld1q_u8(top + 80);
+  const uint8x16_t row6 = vld1q_u8(top + 96);
+  const uint8x16_t row7 = vld1q_u8(top + 112);
+  int y = block_height;
+  do {
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    vst1q_u8(dst + 32, row2);
+    vst1q_u8(dst + 48, row3);
+    vst1q_u8(dst + 64, row4);
+    vst1q_u8(dst + 80, row5);
+    vst1q_u8(dst + 96, row6);
+    vst1q_u8(dst + 112, row7);
+    dst += stride;
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    vst1q_u8(dst + 32, row2);
+    vst1q_u8(dst + 48, row3);
+    vst1q_u8(dst + 64, row4);
+    vst1q_u8(dst + 80, row5);
+    vst1q_u8(dst + 96, row6);
+    vst1q_u8(dst + 112, row7);
+    dst += stride;
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int height>
+inline void Paeth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                          const void* LIBGAV1_RESTRICT const top_ptr,
+                          const void* LIBGAV1_RESTRICT const left_ptr) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+  const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
+  const uint16x4_t top_left = vdup_n_u16(top_row[-1]);
+  const uint16x4_t top_left_x2 = vshl_n_u16(top_left, 1);
+  const uint16x4_t top = vld1_u16(top_row);
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16x4_t left = vdup_n_u16(left_col[y]);
+
+    const uint16x4_t left_dist = vabd_u16(top, top_left);
+    const uint16x4_t top_dist = vabd_u16(left, top_left);
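+    // 10-bit sums fit in uint16_t, so, unlike the 8bpp version, no widening
+    // (vaddl_u8) is needed before computing |top_left_dist|.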
+    const uint16x4_t top_left_dist = vabd_u16(vadd_u16(top, left), top_left_x2);
+
+    const uint16x4_t left_le_top = vcle_u16(left_dist, top_dist);
+    const uint16x4_t left_le_top_left = vcle_u16(left_dist, top_left_dist);
+    const uint16x4_t top_le_top_left = vcle_u16(top_dist, top_left_dist);
+
+    // if (left_dist <= top_dist && left_dist <= top_left_dist)
+    const uint16x4_t left_mask = vand_u16(left_le_top, left_le_top_left);
+    //   dest[x] = left_column[y];
+    // Fill all the unused spaces with 'top'. They will be overwritten when
+    // the positions for top_left are known.
+    uint16x4_t result = vbsl_u16(left_mask, left, top);
+    // else if (top_dist <= top_left_dist)
+    //   dest[x] = top_row[x];
+    // Add these values to the mask. They were already set.
+    const uint16x4_t left_or_top_mask = vorr_u16(left_mask, top_le_top_left);
+    // else
+    //   dest[x] = top_left;
+    result = vbsl_u16(left_or_top_mask, result, top_left);
+
+    vst1_u16(dst16, result);
+    dst += stride;
+  }
+}
+
+template <int height>
+inline void Paeth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                          const void* LIBGAV1_RESTRICT const top_ptr,
+                          const void* LIBGAV1_RESTRICT const left_ptr) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+  const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
+  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+  const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
+  const uint16x8_t top = vld1q_u16(top_row);
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16x8_t left = vdupq_n_u16(left_col[y]);
+
+    const uint16x8_t left_dist = vabdq_u16(top, top_left);
+    const uint16x8_t top_dist = vabdq_u16(left, top_left);
+    const uint16x8_t top_left_dist =
+        vabdq_u16(vaddq_u16(top, left), top_left_x2);
+
+    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+    // if (left_dist <= top_dist && left_dist <= top_left_dist)
+    const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+    //   dest[x] = left_column[y];
+    // Fill all the unused spaces with 'top'. They will be overwritten when
+    // the positions for top_left are known.
+    uint16x8_t result = vbslq_u16(left_mask, left, top);
+    // else if (top_dist <= top_left_dist)
+    //   dest[x] = top_row[x];
+    // Add these values to the mask. They were already set.
+    const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+    // else
+    //   dest[x] = top_left;
+    result = vbslq_u16(left_or_top_mask, result, top_left);
+
+    vst1q_u16(dst16, result);
+    dst += stride;
+  }
+}
+
+// For 16xH and above. The top row is preloaded into width / 8 vectors of
+// eight pixels each, since it is reused on every row.
+template <int width, int height>
+inline void PaethWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                          const void* LIBGAV1_RESTRICT const top_ptr,
+                          const void* LIBGAV1_RESTRICT const left_ptr) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+  const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
+  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+  const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
+
+  uint16x8_t top[width >> 3];
+  for (int i = 0; i < width >> 3; ++i) {
+    top[i] = vld1q_u16(top_row + (i << 3));
+  }
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+    const uint16x8_t left = vdupq_n_u16(left_col[y]);
+    const uint16x8_t top_dist = vabdq_u16(left, top_left);
+
+    for (int i = 0; i < (width >> 3); ++i) {
+      const uint16x8_t left_dist = vabdq_u16(top[i], top_left);
+      const uint16x8_t top_left_dist =
+          vabdq_u16(vaddq_u16(top[i], left), top_left_x2);
+
+      const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+      const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+      const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+      // if (left_dist <= top_dist && left_dist <= top_left_dist)
+      const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+      //   dest[x] = left_column[y];
+      // Fill all the unused spaces with 'top'. They will be overwritten when
+      // the positions for top_left are known.
+      uint16x8_t result = vbslq_u16(left_mask, left, top[i]);
+      // else if (top_dist <= top_left_dist)
+      //   dest[x] = top_row[x];
+      // Add these values to the mask. They were already set.
+      const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+      // else
+      //   dest[x] = top_left;
+      result = vbslq_u16(left_or_top_mask, result, top_left);
+
+      vst1q_u16(dst_x, result);
+      dst_x += 8;
+    }
+    dst += stride;
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      DcDefs::_4x4::DcTop;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      DcDefs::_4x4::DcLeft;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+      DcDefs::_4x4::Dc;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+      Vertical4xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+      Paeth4xH_NEON<4>;
+
+  // 4x8
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+      DcDefs::_4x8::DcTop;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+      DcDefs::_4x8::DcLeft;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+      DcDefs::_4x8::Dc;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+      Horizontal4xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+      Vertical4xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+      Paeth4xH_NEON<8>;
+
+  // 4x16
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+      DcDefs::_4x16::DcTop;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+      DcDefs::_4x16::DcLeft;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+      DcDefs::_4x16::Dc;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+      Horizontal4xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+      Vertical4xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+      Paeth4xH_NEON<16>;
+
+  // 8x4
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+      DcDefs::_8x4::DcTop;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+      DcDefs::_8x4::DcLeft;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+      DcDefs::_8x4::Dc;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+      Vertical8xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+      Paeth8xH_NEON<4>;
+
+  // 8x8
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+      DcDefs::_8x8::DcTop;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+      DcDefs::_8x8::DcLeft;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+      DcDefs::_8x8::Dc;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+      Horizontal8xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+      Vertical8xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+      Paeth8xH_NEON<8>;
+
+  // 8x16
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+      DcDefs::_8x16::DcTop;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+      DcDefs::_8x16::DcLeft;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+      DcDefs::_8x16::Dc;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+      Vertical8xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+      Paeth8xH_NEON<16>;
+
+  // 8x32
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+      DcDefs::_8x32::DcTop;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+      DcDefs::_8x32::DcLeft;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+      DcDefs::_8x32::Dc;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+      Horizontal8xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+      Vertical8xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+      Paeth8xH_NEON<32>;
+
+  // 16x4
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+      DcDefs::_16x4::DcTop;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+      DcDefs::_16x4::DcLeft;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+      DcDefs::_16x4::Dc;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+      Vertical16xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+      PaethWxH_NEON<16, 4>;
+
+  // 16x8
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+      DcDefs::_16x8::DcTop;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+      DcDefs::_16x8::DcLeft;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+      DcDefs::_16x8::Dc;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+      Horizontal16xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+      Vertical16xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+      PaethWxH_NEON<16, 8>;
+
+  // 16x16
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+      DcDefs::_16x16::DcTop;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+      DcDefs::_16x16::DcLeft;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+      DcDefs::_16x16::Dc;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+      Vertical16xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+      PaethWxH_NEON<16, 16>;
+
+  // 16x32
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+      DcDefs::_16x32::DcTop;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+      DcDefs::_16x32::DcLeft;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+      DcDefs::_16x32::Dc;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+      Vertical16xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+      PaethWxH_NEON<16, 32>;
+
+  // 16x64
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+      DcDefs::_16x64::DcTop;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+      DcDefs::_16x64::DcLeft;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+      DcDefs::_16x64::Dc;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+      Vertical16xH_NEON<64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+      PaethWxH_NEON<16, 64>;
+
+  // 32x8
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+      DcDefs::_32x8::DcTop;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+      DcDefs::_32x8::DcLeft;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+      DcDefs::_32x8::Dc;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+      Vertical32xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+      PaethWxH_NEON<32, 8>;
+
+  // 32x16
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+      DcDefs::_32x16::DcTop;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+      DcDefs::_32x16::DcLeft;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+      DcDefs::_32x16::Dc;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+      Vertical32xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+      PaethWxH_NEON<32, 16>;
+
+  // 32x32
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+      DcDefs::_32x32::DcTop;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+      DcDefs::_32x32::DcLeft;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+      DcDefs::_32x32::Dc;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+      Vertical32xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+      PaethWxH_NEON<32, 32>;
+
+  // 32x64
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+      DcDefs::_32x64::DcTop;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+      DcDefs::_32x64::DcLeft;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+      DcDefs::_32x64::Dc;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+      Horizontal32xH_NEON<64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+      Vertical32xH_NEON<64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+      PaethWxH_NEON<32, 64>;
+
+  // 64x16
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+      DcDefs::_64x16::DcTop;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+      DcDefs::_64x16::DcLeft;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+      DcDefs::_64x16::Dc;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+      Vertical64xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+      PaethWxH_NEON<64, 16>;
+
+  // 64x32
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+      DcDefs::_64x32::DcTop;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+      DcDefs::_64x32::DcLeft;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+      DcDefs::_64x32::Dc;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+      Vertical64xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+      PaethWxH_NEON<64, 32>;
+
+  // 64x64
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+      DcDefs::_64x64::DcTop;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+      DcDefs::_64x64::DcLeft;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+      DcDefs::_64x64::Dc;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+      Vertical64xH_NEON<64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+      PaethWxH_NEON<64, 64>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h
new file mode 100644 (file)
index 0000000..5a56924
--- /dev/null
@@ -0,0 +1,323 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors.
+// See the defines below for specifics. This function is not thread-safe.
+void IntraPredInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
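+// Each define flags the matching (bitdepth, transform size, predictor) entry
+// in the Dsp table as having a NEON implementation; the generic dsp code keys
+// off these defines to avoid installing the C fallback for that entry.
+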
+// 4x4
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x64
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x64
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x16
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x32
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x64
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 10 bit
+// 4x4
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x64
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x64
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x16
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x32
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x64
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc
new file mode 100644 (file)
index 0000000..d6c1450
--- /dev/null
@@ -0,0 +1,1139 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_smooth.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Note: these constants are duplicated from intrapred.cc so that the compiler
+// has visibility of the values. This helps reduce loads and aids the creation
+// of the inverse weights.
+constexpr uint8_t kSmoothWeights[] = {
+#include "src/dsp/smooth_weights.inc"
+};
+
+// 256 - v = vneg_s8(v)
+inline uint8x8_t NegateS8(const uint8x8_t v) {
+  return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
+}
+
+template <int height>
+void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* LIBGAV1_RESTRICT const left_column) {
+  constexpr int width = 4;
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t top_right = top[width - 1];
+  const uint8_t bottom_left = left[height - 1];
+  const uint8_t* const weights_y = kSmoothWeights + height - 4;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t top_v = Load4(top);
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+  const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4);
+  const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left_v = vdup_n_u8(left[y]);
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+    const uint16x8_t weighted_top_bl =
+        vmlal_u8(weighted_bl, weights_y_v, top_v);
+    const uint16x8_t weighted_left_tr =
+        vmlal_u8(weighted_tr, weights_x_v, left_v);
+    // Maximum value of each parameter: 0xFF00
+    const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+    const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale);
+
+    StoreLo4(dst, result);
+    dst += stride;
+  }
+}
+
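+// The smooth predictor blends four weighted edge terms:
+//   pred = (w_y * top + (256 - w_y) * bottom_left +
+//           w_x * left + (256 - w_x) * top_right + 256) >> 9
+// Each 16-bit partial sum below is at most 0xFF00, so adding them directly
+// could overflow. vhaddq_u16 halves while adding, and the rounding narrowing
+// shift by kSmoothWeightScale then matches the rounded shift by 9 exactly.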
+inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl,
+                               const uint16x8_t weighted_left_tr) {
+  // Maximum value of each parameter: 0xFF00
+  const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+  return vrshrn_n_u16(avg, kSmoothWeightScale);
+}
+
+inline uint8x8_t CalculateWeightsAndPred(
+    const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
+    const uint8x8_t bottom_left, const uint8x8_t weights_x,
+    const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
+  const uint16x8_t weighted_top = vmull_u8(weights_y, top);
+  const uint16x8_t weighted_top_bl =
+      vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
+  const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
+  return CalculatePred(weighted_top_bl, weighted_left_tr);
+}
+
+template <int height>
+void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* LIBGAV1_RESTRICT const left_column) {
+  constexpr int width = 8;
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t top_right = top[width - 1];
+  const uint8_t bottom_left = left[height - 1];
+  const uint8_t* const weights_y = kSmoothWeights + height - 4;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t top_v = vld1_u8(top);
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+  const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4);
+  const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left_v = vdup_n_u8(left[y]);
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+    const uint8x8_t result =
+        CalculateWeightsAndPred(top_v, left_v, weighted_tr, bottom_left_v,
+                                weights_x_v, scaled_weights_y, weights_y_v);
+
+    vst1_u8(dst, result);
+    dst += stride;
+  }
+}
+
+inline uint8x16_t CalculateWeightsAndPred(
+    const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
+    const uint8x8_t weights_y, const uint8x16_t weights_x,
+    const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
+  const uint16x8_t weighted_top_bl_low =
+      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+  const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+  const uint16x8_t weighted_left_tr_low =
+      vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+  const uint8x8_t result_low =
+      CalculatePred(weighted_top_bl_low, weighted_left_tr_low);
+
+  const uint16x8_t weighted_top_bl_high =
+      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+  const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+  const uint16x8_t weighted_left_tr_high =
+      vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+  const uint8x8_t result_high =
+      CalculatePred(weighted_top_bl_high, weighted_left_tr_high);
+
+  return vcombine_u8(result_low, result_high);
+}
+
+// Computes 256 - v with a byte-wise negation (quadword version of the
+// NegateS8 above).
+inline uint8x16_t NegateS8(const uint8x16_t v) {
+  return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
+}
+
+template <int width, int height>
+void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                         const void* LIBGAV1_RESTRICT const top_row,
+                         const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t top_right = top[width - 1];
+  const uint8_t bottom_left = left[height - 1];
+  const uint8_t* const weights_y = kSmoothWeights + height - 4;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  uint8x16_t top_v[4];
+  top_v[0] = vld1q_u8(top);
+  if (width > 16) {
+    top_v[1] = vld1q_u8(top + 16);
+    if (width == 64) {
+      top_v[2] = vld1q_u8(top + 32);
+      top_v[3] = vld1q_u8(top + 48);
+    }
+  }
+
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+  uint8x16_t weights_x_v[4];
+  weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4);
+  if (width > 16) {
+    weights_x_v[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
+    if (width == 64) {
+      weights_x_v[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
+      weights_x_v[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
+    }
+  }
+
+  uint8x16_t scaled_weights_x[4];
+  scaled_weights_x[0] = NegateS8(weights_x_v[0]);
+  if (width > 16) {
+    scaled_weights_x[1] = NegateS8(weights_x_v[1]);
+    if (width == 64) {
+      scaled_weights_x[2] = NegateS8(weights_x_v[2]);
+      scaled_weights_x[3] = NegateS8(weights_x_v[3]);
+    }
+  }
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left_v = vdup_n_u8(left[y]);
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+    vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v,
+                                          weights_y_v, weights_x_v[0],
+                                          scaled_weights_x[0], weighted_bl));
+
+    if (width > 16) {
+      vst1q_u8(dst + 16, CalculateWeightsAndPred(
+                             top_v[1], left_v, top_right_v, weights_y_v,
+                             weights_x_v[1], scaled_weights_x[1], weighted_bl));
+      if (width == 64) {
+        vst1q_u8(dst + 32,
+                 CalculateWeightsAndPred(top_v[2], left_v, top_right_v,
+                                         weights_y_v, weights_x_v[2],
+                                         scaled_weights_x[2], weighted_bl));
+        vst1q_u8(dst + 48,
+                 CalculateWeightsAndPred(top_v[3], left_v, top_right_v,
+                                         weights_y_v, weights_x_v[3],
+                                         scaled_weights_x[3], weighted_bl));
+      }
+    }
+
+    dst += stride;
+  }
+}
+
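+// The vertical variant keeps only the y-weighted terms:
+//   pred = (w_y * top + (256 - w_y) * bottom_left + 128) >> 8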
+template <int width, int height>
+void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
+                               ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t bottom_left = left[height - 1];
+  const uint8_t* const weights_y = kSmoothWeights + height - 4;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  uint8x8_t top_v;
+  if (width == 4) {
+    top_v = Load4(top);
+  } else {  // width == 8
+    top_v = vld1_u8(top);
+  }
+
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+
+    const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
+    const uint16x8_t weighted_top_bl =
+        vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);
+    const uint8x8_t pred = vrshrn_n_u16(weighted_top_bl, kSmoothWeightScale);
+
+    if (width == 4) {
+      StoreLo4(dst, pred);
+    } else {  // width == 8
+      vst1_u8(dst, pred);
+    }
+    dst += stride;
+  }
+}
+
+inline uint8x16_t CalculateVerticalWeightsAndPred(
+    const uint8x16_t top, const uint8x8_t weights_y,
+    const uint16x8_t weighted_bl) {
+  const uint16x8_t pred_low =
+      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+  const uint16x8_t pred_high =
+      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+  const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+  const uint8x8_t pred_scaled_high =
+      vrshrn_n_u16(pred_high, kSmoothWeightScale);
+  return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+template <int width, int height>
+void SmoothVertical16PlusxN_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t bottom_left = left[height - 1];
+  const uint8_t* const weights_y = kSmoothWeights + height - 4;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  uint8x16_t top_v[4];
+  top_v[0] = vld1q_u8(top);
+  if (width > 16) {
+    top_v[1] = vld1q_u8(top + 16);
+    if (width == 64) {
+      top_v[2] = vld1q_u8(top + 32);
+      top_v[3] = vld1q_u8(top + 48);
+    }
+  }
+
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+    const uint8x16_t pred_0 =
+        CalculateVerticalWeightsAndPred(top_v[0], weights_y_v, weighted_bl);
+    vst1q_u8(dst, pred_0);
+
+    if (width > 16) {
+      const uint8x16_t pred_1 =
+          CalculateVerticalWeightsAndPred(top_v[1], weights_y_v, weighted_bl);
+      vst1q_u8(dst + 16, pred_1);
+
+      if (width == 64) {
+        const uint8x16_t pred_2 =
+            CalculateVerticalWeightsAndPred(top_v[2], weights_y_v, weighted_bl);
+        vst1q_u8(dst + 32, pred_2);
+
+        const uint8x16_t pred_3 =
+            CalculateVerticalWeightsAndPred(top_v[3], weights_y_v, weighted_bl);
+        vst1q_u8(dst + 48, pred_3);
+      }
+    }
+
+    dst += stride;
+  }
+}
+
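+// The horizontal variant keeps only the x-weighted terms:
+//   pred = (w_x * left + (256 - w_x) * top_right + 128) >> 8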
+template <int width, int height>
+void SmoothHorizontal4Or8xN_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t top_right = top[width - 1];
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+  // Over-reads for 4xN but still within the array.
+  const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
+  const uint8x8_t scaled_weights_x = NegateS8(weights_x);
+  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left_v = vdup_n_u8(left[y]);
+    const uint16x8_t weighted_left_tr =
+        vmlal_u8(weighted_tr, weights_x, left_v);
+    const uint8x8_t pred = vrshrn_n_u16(weighted_left_tr, kSmoothWeightScale);
+
+    if (width == 4) {
+      StoreLo4(dst, pred);
+    } else {  // width == 8
+      vst1_u8(dst, pred);
+    }
+    dst += stride;
+  }
+}
+
+inline uint8x16_t CalculateHorizontalWeightsAndPred(
+    const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
+    const uint8x16_t scaled_weights_x) {
+  const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+  const uint16x8_t weighted_left_tr_low =
+      vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+  const uint8x8_t pred_scaled_low =
+      vrshrn_n_u16(weighted_left_tr_low, kSmoothWeightScale);
+
+  const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+  const uint16x8_t weighted_left_tr_high =
+      vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+  const uint8x8_t pred_scaled_high =
+      vrshrn_n_u16(weighted_left_tr_high, kSmoothWeightScale);
+
+  return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+template <int width, int height>
+void SmoothHorizontal16PlusxN_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t top_right = top[width - 1];
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+
+  uint8x16_t weights_x[4];
+  weights_x[0] = vld1q_u8(kSmoothWeights + width - 4);
+  if (width > 16) {
+    weights_x[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
+    if (width == 64) {
+      weights_x[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
+      weights_x[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
+    }
+  }
+
+  uint8x16_t scaled_weights_x[4];
+  scaled_weights_x[0] = NegateS8(weights_x[0]);
+  if (width > 16) {
+    scaled_weights_x[1] = NegateS8(weights_x[1]);
+    if (width == 64) {
+      scaled_weights_x[2] = NegateS8(weights_x[2]);
+      scaled_weights_x[3] = NegateS8(weights_x[3]);
+    }
+  }
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left_v = vdup_n_u8(left[y]);
+
+    const uint8x16_t pred_0 = CalculateHorizontalWeightsAndPred(
+        left_v, top_right_v, weights_x[0], scaled_weights_x[0]);
+    vst1q_u8(dst, pred_0);
+
+    if (width > 16) {
+      const uint8x16_t pred_1 = CalculateHorizontalWeightsAndPred(
+          left_v, top_right_v, weights_x[1], scaled_weights_x[1]);
+      vst1q_u8(dst + 16, pred_1);
+
+      if (width == 64) {
+        const uint8x16_t pred_2 = CalculateHorizontalWeightsAndPred(
+            left_v, top_right_v, weights_x[2], scaled_weights_x[2]);
+        vst1q_u8(dst + 32, pred_2);
+
+        const uint8x16_t pred_3 = CalculateHorizontalWeightsAndPred(
+            left_v, top_right_v, weights_x[3], scaled_weights_x[3]);
+        vst1q_u8(dst + 48, pred_3);
+      }
+    }
+    dst += stride;
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  // 4x4
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      Smooth4xN_NEON<4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<4, 4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<4, 4>;
+
+  // 4x8
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      Smooth4xN_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<4, 8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<4, 8>;
+
+  // 4x16
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      Smooth4xN_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<4, 16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<4, 16>;
+
+  // 8x4
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      Smooth8xN_NEON<4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<8, 4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<8, 4>;
+
+  // 8x8
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      Smooth8xN_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<8, 8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<8, 8>;
+
+  // 8x16
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      Smooth8xN_NEON<16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<8, 16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<8, 16>;
+
+  // 8x32
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      Smooth8xN_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<8, 32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<8, 32>;
+
+  // 16x4
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<16, 4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<16, 4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<16, 4>;
+
+  // 16x8
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<16, 8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<16, 8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<16, 8>;
+
+  // 16x16
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<16, 16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<16, 16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<16, 16>;
+
+  // 16x32
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<16, 32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<16, 32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<16, 32>;
+
+  // 16x64
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<16, 64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<16, 64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<16, 64>;
+
+  // 32x8
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<32, 8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<32, 8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<32, 8>;
+
+  // 32x16
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<32, 16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<32, 16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<32, 16>;
+
+  // 32x32
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<32, 32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<32, 32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<32, 32>;
+
+  // 32x64
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<32, 64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<32, 64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<32, 64>;
+
+  // 64x16
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<64, 16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<64, 16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<64, 16>;
+
+  // 64x32
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<64, 32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<64, 32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<64, 32>;
+
+  // 64x64
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<64, 64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<64, 64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<64, 64>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Note: these constants are duplicated from intrapred.cc so that the compiler
+// has visibility of the values. This helps reduce loads and aids the creation
+// of the inverse weights.
+constexpr uint16_t kSmoothWeights[] = {
+#include "src/dsp/smooth_weights.inc"
+};
+
+// Computes 256 - v with a byte-wise negation. The weights fit in the low byte
+// of each u16 lane (the high byte is zero), so negating as s8 yields 256 - v
+// in every lane.
+inline uint16x4_t NegateS8(const uint16x4_t v) {
+  return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
+}
+
+template <int height>
+void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[3];
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t top_v = vld1_u16(top);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights);
+  const uint16x4_t scaled_weights_x = NegateS8(weights_x_v);
+  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+
+  for (int y = 0; y < height; ++y) {
+    // Each variable in the running summation is named for the last item to be
+    // accumulated.
+    const uint32x4_t weighted_top =
+        vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
+    const uint32x4_t weighted_left =
+        vmlal_n_u16(weighted_top, weights_x_v, left[y]);
+    const uint32x4_t weighted_bl =
+        vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);
+
+    const uint16x4_t pred = vrshrn_n_u32(weighted_bl, kSmoothWeightScale + 1);
+    vst1_u16(reinterpret_cast<uint16_t*>(dst), pred);
+    dst += stride;
+  }
+}
+
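+// In the 10bpp path each product fits comfortably in 32 bits (at most
+// 255 * 1023), so the four weighted terms are accumulated in uint32 lanes and
+// a single rounding shift by kSmoothWeightScale + 1 divides by the total
+// weight of 512.
+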
+// Common code between 8xH and [16|32|64]xH.
+inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
+                           const uint32x4_t weighted_corners_low,
+                           const uint32x4_t weighted_corners_high,
+                           const uint16x4x2_t top_vals,
+                           const uint16x4x2_t weights_x, const uint16_t left_y,
+                           const uint16_t weight_y) {
+  // Each variable in the running summation is named for the last item to be
+  // accumulated.
+  const uint32x4_t weighted_top_low =
+      vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
+  const uint32x4_t weighted_edges_low =
+      vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
+
+  const uint16x4_t pred_low =
+      vrshrn_n_u32(weighted_edges_low, kSmoothWeightScale + 1);
+  vst1_u16(dst, pred_low);
+
+  const uint32x4_t weighted_top_high =
+      vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
+  const uint32x4_t weighted_edges_high =
+      vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
+
+  const uint16x4_t pred_high =
+      vrshrn_n_u32(weighted_edges_high, kSmoothWeightScale + 1);
+  vst1_u16(dst + 4, pred_high);
+}
+
+template <int height>
+void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[7];
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4x2_t top_vals = {vld1_u16(top), vld1_u16(top + 4)};
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
+                                  vld1_u16(kSmoothWeights + 8)};
+  const uint32x4_t weighted_tr_low =
+      vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
+  const uint32x4_t weighted_tr_high =
+      vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
+
+  for (int y = 0; y < height; ++y) {
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+    const uint32x4_t weighted_corners_low =
+        vaddq_u32(weighted_bl, weighted_tr_low);
+    const uint32x4_t weighted_corners_high =
+        vaddq_u32(weighted_bl, weighted_tr_high);
+    CalculatePred8(reinterpret_cast<uint16_t*>(dst), weighted_corners_low,
+                   weighted_corners_high, top_vals, weights_x, left[y],
+                   weights_y[y]);
+    dst += stride;
+  }
+}
+
+// For width 16 and above.
+template <int width, int height>
+void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[width - 1];
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  // Precompute weighted values that don't vary with |y|.
+  uint32x4_t weighted_tr_low[width >> 3];
+  uint32x4_t weighted_tr_high[width >> 3];
+  for (int i = 0; i < width >> 3; ++i) {
+    const int x = i << 3;
+    const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x);
+    weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low), top_right);
+    const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x);
+    weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high), top_right);
+  }
+
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  for (int y = 0; y < height; ++y) {
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+    auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+    for (int i = 0; i < width >> 3; ++i) {
+      const int x = i << 3;
+      const uint16x4x2_t top_vals = {vld1_u16(top + x), vld1_u16(top + x + 4)};
+      const uint32x4_t weighted_corners_low =
+          vaddq_u32(weighted_bl, weighted_tr_low[i]);
+      const uint32x4_t weighted_corners_high =
+          vaddq_u32(weighted_bl, weighted_tr_high[i]);
+      // Accumulate weighted edge values and store.
+      const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + width - 4 + x),
+                                      vld1_u16(kSmoothWeights + width + x)};
+      CalculatePred8(dst_x, weighted_corners_low, weighted_corners_high,
+                     top_vals, weights_x, left[y], weights_y[y]);
+      dst_x += 8;
+    }
+    dst += stride;
+  }
+}
+
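+// SmoothVertical interpolates along |y| only:
+//   pred(x, y) = RightShiftWithRounding(
+//       weights_y[y] * top[x] + (256 - weights_y[y]) * bottom_left,
+//       kSmoothWeightScale)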
+template <int height>
+void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                            const void* LIBGAV1_RESTRICT const top_row,
+                            const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t top_v = vld1_u16(top);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+    const uint32x4_t weighted_top =
+        vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_top, kSmoothWeightScale));
+
+    dst += stride;
+  }
+}
+
+template <int height>
+void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t stride,
+                            const void* LIBGAV1_RESTRICT const top_row,
+                            const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t top_low = vld1_u16(top);
+  const uint16x4_t top_high = vld1_u16(top + 4);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+
+    const uint32x4_t weighted_top_low =
+        vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale));
+
+    const uint32x4_t weighted_top_high =
+        vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
+    vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale));
+    dst += stride;
+  }
+}
+
+// For width 16 and above.
+template <int width, int height>
+void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t stride,
+                            const void* LIBGAV1_RESTRICT const top_row,
+                            const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  uint16x4x2_t top_vals[width >> 3];
+  for (int i = 0; i < width >> 3; ++i) {
+    const int x = i << 3;
+    top_vals[i] = {vld1_u16(top + x), vld1_u16(top + x + 4)};
+  }
+
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  for (int y = 0; y < height; ++y) {
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+
+    auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+    for (int i = 0; i < width >> 3; ++i) {
+      const uint32x4_t weighted_top_low =
+          vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]);
+      vst1_u16(dst_x, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale));
+
+      const uint32x4_t weighted_top_high =
+          vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]);
+      vst1_u16(dst_x + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale));
+      dst_x += 8;
+    }
+    dst += stride;
+  }
+}
+
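+// SmoothHorizontal interpolates along |x| only:
+//   pred(x, y) = RightShiftWithRounding(
+//       weights_x[x] * left[y] + (256 - weights_x[x]) * top_right,
+//       kSmoothWeightScale)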
+template <int height>
+void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[3];
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t weights_x = vld1_u16(kSmoothWeights);
+  const uint16x4_t scaled_weights_x = NegateS8(weights_x);
+
+  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint32x4_t weighted_left =
+        vmlal_n_u16(weighted_tr, weights_x, left[y]);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_left, kSmoothWeightScale));
+    dst += stride;
+  }
+}
+
+template <int height>
+void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[7];
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
+                                  vld1_u16(kSmoothWeights + 8)};
+
+  const uint32x4_t weighted_tr_low =
+      vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
+  const uint32x4_t weighted_tr_high =
+      vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16_t left_y = left[y];
+    const uint32x4_t weighted_left_low =
+        vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale));
+
+    const uint32x4_t weighted_left_high =
+        vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
+    vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_left_high, kSmoothWeightScale));
+    dst += stride;
+  }
+}
+
+// For width 16 and above.
+template <int width, int height>
+void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[width - 1];
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  uint16x4_t weights_x_low[width >> 3];
+  uint16x4_t weights_x_high[width >> 3];
+  uint32x4_t weighted_tr_low[width >> 3];
+  uint32x4_t weighted_tr_high[width >> 3];
+  for (int i = 0; i < width >> 3; ++i) {
+    const int x = i << 3;
+    weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x);
+    weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low[i]), top_right);
+    weights_x_high[i] = vld1_u16(kSmoothWeights + width + x);
+    weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high[i]), top_right);
+  }
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+    const uint16_t left_y = left[y];
+    for (int i = 0; i < width >> 3; ++i) {
+      const uint32x4_t weighted_left_low =
+          vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);
+      vst1_u16(dst_x, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale));
+
+      const uint32x4_t weighted_left_high =
+          vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);
+      vst1_u16(dst_x + 4, vrshrn_n_u32(weighted_left_high, kSmoothWeightScale));
+      dst_x += 8;
+    }
+    dst += stride;
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  // 4x4
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      Smooth4xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical4xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4xH_NEON<4>;
+
+  // 4x8
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      Smooth4xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical4xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4xH_NEON<8>;
+
+  // 4x16
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      Smooth4xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical4xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4xH_NEON<16>;
+
+  // 8x4
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<4>;
+
+  // 8x8
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<8>;
+
+  // 8x16
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<16>;
+
+  // 8x32
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<32>;
+
+  // 16x4
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 4>;
+
+  // 16x8
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 8>;
+
+  // 16x16
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 16>;
+
+  // 16x32
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 32>;
+
+  // 16x64
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 64>;
+
+  // 32x8
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 8>;
+
+  // 32x16
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 16>;
+
+  // 32x32
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 32>;
+
+  // 32x64
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 64>;
+
+  // 64x16
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<64, 16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<64, 16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<64, 16>;
+
+  // 64x32
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<64, 32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<64, 32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<64, 32>;
+
+  // 64x64
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<64, 64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<64, 64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<64, 64>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredSmoothInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredSmoothInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_smooth_neon.h b/src/dsp/arm/intrapred_smooth_neon.h
new file mode 100644 (file)
index 0000000..28b5bd5
--- /dev/null
@@ -0,0 +1,274 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
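+// Each macro below marks the corresponding (transform size, smooth
+// predictor) entry as having a NEON implementation, so the DSP table
+// initialization does not install the portable C version for it.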
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+// 10bpp
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc
new file mode 100644 (file)
index 0000000..e6f0d9d
--- /dev/null
@@ -0,0 +1,2797 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
+                                        int32x4_t out[4]) {
+  // in:
+  // 00 01 02 03
+  // 10 11 12 13
+  // 20 21 22 23
+  // 30 31 32 33
+
+  // 00 10 02 12   a.val[0]
+  // 01 11 03 13   a.val[1]
+  // 20 30 22 32   b.val[0]
+  // 21 31 23 33   b.val[1]
+  const int32x4x2_t a = vtrnq_s32(in[0], in[1]);
+  const int32x4x2_t b = vtrnq_s32(in[2], in[3]);
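+  // vextq_s32(v, v, 2) swaps the two 64-bit halves of |v|. Each output row
+  // concatenates a pair from |a| with the matching pair from |b|, e.g.
+  // out[0] = {00, 10, 20, 30}.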
+  out[0] = vextq_s32(vextq_s32(a.val[0], a.val[0], 2), b.val[0], 2);
+  out[1] = vextq_s32(vextq_s32(a.val[1], a.val[1], 2), b.val[1], 2);
+  out[2] = vextq_s32(a.val[0], vextq_s32(b.val[0], b.val[0], 2), 2);
+  out[3] = vextq_s32(a.val[1], vextq_s32(b.val[1], b.val[1], 2), 2);
+  // out:
+  // 00 10 20 30
+  // 01 11 21 31
+  // 02 12 22 32
+  // 03 13 23 33
+}
+
+//------------------------------------------------------------------------------
+template <int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* LIBGAV1_RESTRICT dst,
+                                    int32_t stride, int32_t idx,
+                                    const int32x4_t* const s) {
+  assert(store_count % 4 == 0);
+  for (int i = 0; i < store_count; i += 4) {
+    vst1q_s32(&dst[i * stride + idx], s[i]);
+    vst1q_s32(&dst[(i + 1) * stride + idx], s[i + 1]);
+    vst1q_s32(&dst[(i + 2) * stride + idx], s[i + 2]);
+    vst1q_s32(&dst[(i + 3) * stride + idx], s[i + 3]);
+  }
+}
+
+template <int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* LIBGAV1_RESTRICT src,
+                                   int32_t stride, int32_t idx, int32x4_t* x) {
+  assert(load_count % 4 == 0);
+  for (int i = 0; i < load_count; i += 4) {
+    x[i] = vld1q_s32(&src[i * stride + idx]);
+    x[i + 1] = vld1q_s32(&src[(i + 1) * stride + idx]);
+    x[i + 2] = vld1q_s32(&src[(i + 2) * stride + idx]);
+    x[i + 3] = vld1q_s32(&src[(i + 3) * stride + idx]);
+  }
+}
+
+// Butterfly rotate 4 values: x = a * cos(angle) - b * sin(angle) and
+// y = a * sin(angle) + b * cos(angle), with the trig values in Q12 fixed
+// point and |angle| in units of pi/128 (see Cos128()/Sin128()).
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
+                                               const int angle,
+                                               const bool flip) {
+  const int32_t cos128 = Cos128(angle);
+  const int32_t sin128 = Sin128(angle);
+  const int32x4_t acc_x = vmulq_n_s32(*a, cos128);
+  const int32x4_t acc_y = vmulq_n_s32(*a, sin128);
+  // The max range for the input is 18 bits and the cos128/sin128 values use
+  // 13 bits, which leaves 1 bit of headroom for the add/subtract. For 10bpp,
+  // x/y will fit in a 32-bit lane.
+  const int32x4_t x0 = vmlsq_n_s32(acc_x, *b, sin128);
+  const int32x4_t y0 = vmlaq_n_s32(acc_y, *b, cos128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
+                                                         int32x4_t* b,
+                                                         const int angle,
+                                                         const bool flip) {
+  const int32_t cos128 = Cos128(angle);
+  const int32_t sin128 = Sin128(angle);
+  assert(sin128 <= 0xfff);
+  const int32x4_t x0 = vmulq_n_s32(*b, -sin128);
+  const int32x4_t y0 = vmulq_n_s32(*b, cos128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
+                                                          int32x4_t* b,
+                                                          const int angle,
+                                                          const bool flip) {
+  const int32_t cos128 = Cos128(angle);
+  const int32_t sin128 = Sin128(angle);
+  const int32x4_t x0 = vmulq_n_s32(*a, cos128);
+  const int32x4_t y0 = vmulq_n_s32(*a, sin128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
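+// Hadamard butterfly: a' = a + b, b' = a - b (or a' = b - a, b' = b + a when
+// |flip| is set), using saturating adds. The second overload additionally
+// clamps the results to [min, max] for the intermediate stages.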
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+                                            bool flip) {
+  int32x4_t x, y;
+  if (flip) {
+    y = vqaddq_s32(*b, *a);
+    x = vqsubq_s32(*b, *a);
+  } else {
+    x = vqaddq_s32(*a, *b);
+    y = vqsubq_s32(*a, *b);
+  }
+  *a = x;
+  *b = y;
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+                                            bool flip, const int32x4_t min,
+                                            const int32x4_t max) {
+  int32x4_t x, y;
+  if (flip) {
+    y = vqaddq_s32(*b, *a);
+    x = vqsubq_s32(*b, *a);
+  } else {
+    x = vqaddq_s32(*a, *b);
+    y = vqsubq_s32(*a, *b);
+  }
+  *a = vmaxq_s32(vminq_s32(x, max), min);
+  *b = vmaxq_s32(vminq_s32(y, max), min);
+}
+
+using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
+                                       bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
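+// DC-only fast paths: when |adjusted_tx_height| is 1, only the DC
+// coefficient is nonzero and the transform reduces to scaling by
+// cos(pi/4) (Cos128(32) in Q12) and broadcasting the result.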
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+                                     bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
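+  // vqrdmulhq_n_s32 computes the saturating value of
+  // RightShiftWithRounding(2 * a * b, 32), so pre-shifting the Q12 multiplier
+  // up to Q31 yields RightShiftWithRounding(a * kTransformRowMultiplier, 12).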
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+  const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
+  const int32_t cos128 = Cos128(32);
+  const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
+  // vqrshlq_s32 will shift right if shift value is negative.
+  const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
+  // Clamp result to signed 16 bits.
+  const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
+  if (width == 4) {
+    vst1q_s32(dst, result);
+  } else {
+    for (int i = 0; i < width; i += 4) {
+      vst1q_s32(dst, result);
+      dst += 4;
+    }
+  }
+  return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+                                           int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32_t cos128 = Cos128(32);
+
+  // Calculate the DC values for the first row.
+  if (width == 4) {
+    const int32x4_t v_src = vld1q_s32(dst);
+    const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+    vst1q_s32(dst, xy);
+  } else {
+    int i = 0;
+    do {
+      const int32x4_t v_src = vld1q_s32(&dst[i]);
+      const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+      vst1q_s32(&dst[i], xy);
+      i += 4;
+    } while (i < width);
+  }
+
+  // Copy first row to the rest of the block.
+  for (int y = 1; y < height; ++y) {
+    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+  }
+  return true;
+}
+
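+// |is_fast_butterfly| is set when one input of each rotation pair is known
+// to be zero (e.g. in Dct64_NEON, where only the first 32 coefficients are
+// loaded), so the cheaper single-input rotations can be used.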
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t min,
+                                      const int32x4_t max,
+                                      const bool is_last_stage) {
+  // stage 12.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+  } else {
+    butterfly_rotation(&s[0], &s[1], 32, true);
+    butterfly_rotation(&s[2], &s[3], 48, false);
+  }
+
+  // stage 17.
+  if (is_last_stage) {
+    HadamardRotation(&s[0], &s[3], false);
+    HadamardRotation(&s[1], &s[2], false);
+  } else {
+    HadamardRotation(&s[0], &s[3], false, min, max);
+    HadamardRotation(&s[1], &s[2], false, min, max);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
+                                     int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  // When |is_row| is true, clamp to the row range; otherwise clamp to the
+  // column range.
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[4], x[4];
+
+  if (is_row) {
+    assert(step == 4);
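+    // vld4q_s32 de-interleaves the 4x4 block, which performs the transpose
+    // as part of the load (and vst4q_s32 below transposes it back).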
+    int32x4x4_t y = vld4q_s32(dst);
+    for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+  } else {
+    LoadSrc<4>(dst, step, 0, x);
+  }
+
+  // stage 1.
+  // kBitReverseLookup 0, 2, 1, 3
+  s[0] = x[0];
+  s[1] = x[2];
+  s[2] = x[1];
+  s[3] = x[3];
+
+  Dct4Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/true);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (auto& i : s) {
+      i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
+    }
+    int32x4x4_t y;
+    for (int i = 0; i < 4; ++i) y.val[i] = s[i];
+    vst4q_s32(dst, y);
+  } else {
+    StoreDst<4>(dst, step, 0, s);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t min,
+                                      const int32x4_t max,
+                                      const bool is_last_stage) {
+  // stage 8.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+  } else {
+    butterfly_rotation(&s[4], &s[7], 56, false);
+    butterfly_rotation(&s[5], &s[6], 24, false);
+  }
+
+  // stage 13.
+  HadamardRotation(&s[4], &s[5], false, min, max);
+  HadamardRotation(&s[6], &s[7], true, min, max);
+
+  // stage 18.
+  butterfly_rotation(&s[6], &s[5], 32, true);
+
+  // stage 22.
+  if (is_last_stage) {
+    HadamardRotation(&s[0], &s[7], false);
+    HadamardRotation(&s[1], &s[6], false);
+    HadamardRotation(&s[2], &s[5], false);
+    HadamardRotation(&s[3], &s[4], false);
+  } else {
+    HadamardRotation(&s[0], &s[7], false, min, max);
+    HadamardRotation(&s[1], &s[6], false, min, max);
+    HadamardRotation(&s[2], &s[5], false, min, max);
+    HadamardRotation(&s[3], &s[4], false, min, max);
+  }
+}
+
+// Process dct8 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
+                                     int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[8], x[8];
+
+  if (is_row) {
+    LoadSrc<4>(dst, step, 0, &x[0]);
+    LoadSrc<4>(dst, step, 4, &x[4]);
+    Transpose4x4(&x[0], &x[0]);
+    Transpose4x4(&x[4], &x[4]);
+  } else {
+    LoadSrc<8>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1.
+  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+  s[0] = x[0];
+  s[1] = x[4];
+  s[2] = x[2];
+  s[3] = x[6];
+  s[4] = x[1];
+  s[5] = x[5];
+  s[6] = x[3];
+  s[7] = x[7];
+
+  Dct4Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/false);
+  Dct8Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/true);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (auto& i : s) {
+      i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
+    }
+    Transpose4x4(&s[0], &s[0]);
+    Transpose4x4(&s[4], &s[4]);
+    StoreDst<4>(dst, step, 0, &s[0]);
+    StoreDst<4>(dst, step, 4, &s[4]);
+  } else {
+    StoreDst<8>(dst, step, 0, &s[0]);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t min,
+                                       const int32x4_t max,
+                                       const bool is_last_stage) {
+  // stage 5.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+  } else {
+    butterfly_rotation(&s[8], &s[15], 60, false);
+    butterfly_rotation(&s[9], &s[14], 28, false);
+    butterfly_rotation(&s[10], &s[13], 44, false);
+    butterfly_rotation(&s[11], &s[12], 12, false);
+  }
+
+  // stage 9.
+  HadamardRotation(&s[8], &s[9], false, min, max);
+  HadamardRotation(&s[10], &s[11], true, min, max);
+  HadamardRotation(&s[12], &s[13], false, min, max);
+  HadamardRotation(&s[14], &s[15], true, min, max);
+
+  // stage 14.
+  butterfly_rotation(&s[14], &s[9], 48, true);
+  butterfly_rotation(&s[13], &s[10], 112, true);
+
+  // stage 19.
+  HadamardRotation(&s[8], &s[11], false, min, max);
+  HadamardRotation(&s[9], &s[10], false, min, max);
+  HadamardRotation(&s[12], &s[15], true, min, max);
+  HadamardRotation(&s[13], &s[14], true, min, max);
+
+  // stage 23.
+  butterfly_rotation(&s[13], &s[10], 32, true);
+  butterfly_rotation(&s[12], &s[11], 32, true);
+
+  // stage 26.
+  if (is_last_stage) {
+    HadamardRotation(&s[0], &s[15], false);
+    HadamardRotation(&s[1], &s[14], false);
+    HadamardRotation(&s[2], &s[13], false);
+    HadamardRotation(&s[3], &s[12], false);
+    HadamardRotation(&s[4], &s[11], false);
+    HadamardRotation(&s[5], &s[10], false);
+    HadamardRotation(&s[6], &s[9], false);
+    HadamardRotation(&s[7], &s[8], false);
+  } else {
+    HadamardRotation(&s[0], &s[15], false, min, max);
+    HadamardRotation(&s[1], &s[14], false, min, max);
+    HadamardRotation(&s[2], &s[13], false, min, max);
+    HadamardRotation(&s[3], &s[12], false, min, max);
+    HadamardRotation(&s[4], &s[11], false, min, max);
+    HadamardRotation(&s[5], &s[10], false, min, max);
+    HadamardRotation(&s[6], &s[9], false, min, max);
+    HadamardRotation(&s[7], &s[8], false, min, max);
+  }
+}
+
+// Process dct16 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+                                      int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[16], x[16];
+
+  if (is_row) {
+    for (int idx = 0; idx < 16; idx += 8) {
+      LoadSrc<4>(dst, step, idx, &x[idx]);
+      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+      Transpose4x4(&x[idx], &x[idx]);
+      Transpose4x4(&x[idx + 4], &x[idx + 4]);
+    }
+  } else {
+    LoadSrc<16>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1
+  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+  s[0] = x[0];
+  s[1] = x[8];
+  s[2] = x[4];
+  s[3] = x[12];
+  s[4] = x[2];
+  s[5] = x[10];
+  s[6] = x[6];
+  s[7] = x[14];
+  s[8] = x[1];
+  s[9] = x[9];
+  s[10] = x[5];
+  s[11] = x[13];
+  s[12] = x[3];
+  s[13] = x[11];
+  s[14] = x[7];
+  s[15] = x[15];
+
+  Dct4Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/false);
+  Dct8Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/false);
+  Dct16Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/true);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (auto& i : s) {
+      i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
+    }
+    for (int idx = 0; idx < 16; idx += 8) {
+      Transpose4x4(&s[idx], &s[idx]);
+      Transpose4x4(&s[idx + 4], &s[idx + 4]);
+      StoreDst<4>(dst, step, idx, &s[idx]);
+      StoreDst<4>(dst, step, idx + 4, &s[idx + 4]);
+    }
+  } else {
+    StoreDst<16>(dst, step, 0, &s[0]);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t min,
+                                       const int32x4_t max,
+                                       const bool is_last_stage) {
+  // stage 3
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+  } else {
+    butterfly_rotation(&s[16], &s[31], 62, false);
+    butterfly_rotation(&s[17], &s[30], 30, false);
+    butterfly_rotation(&s[18], &s[29], 46, false);
+    butterfly_rotation(&s[19], &s[28], 14, false);
+    butterfly_rotation(&s[20], &s[27], 54, false);
+    butterfly_rotation(&s[21], &s[26], 22, false);
+    butterfly_rotation(&s[22], &s[25], 38, false);
+    butterfly_rotation(&s[23], &s[24], 6, false);
+  }
+
+  // stage 6.
+  HadamardRotation(&s[16], &s[17], false, min, max);
+  HadamardRotation(&s[18], &s[19], true, min, max);
+  HadamardRotation(&s[20], &s[21], false, min, max);
+  HadamardRotation(&s[22], &s[23], true, min, max);
+  HadamardRotation(&s[24], &s[25], false, min, max);
+  HadamardRotation(&s[26], &s[27], true, min, max);
+  HadamardRotation(&s[28], &s[29], false, min, max);
+  HadamardRotation(&s[30], &s[31], true, min, max);
+
+  // stage 10.
+  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+  butterfly_rotation(&s[26], &s[21], 24, true);
+  butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+  // stage 15.
+  HadamardRotation(&s[16], &s[19], false, min, max);
+  HadamardRotation(&s[17], &s[18], false, min, max);
+  HadamardRotation(&s[20], &s[23], true, min, max);
+  HadamardRotation(&s[21], &s[22], true, min, max);
+  HadamardRotation(&s[24], &s[27], false, min, max);
+  HadamardRotation(&s[25], &s[26], false, min, max);
+  HadamardRotation(&s[28], &s[31], true, min, max);
+  HadamardRotation(&s[29], &s[30], true, min, max);
+
+  // stage 20.
+  butterfly_rotation(&s[29], &s[18], 48, true);
+  butterfly_rotation(&s[28], &s[19], 48, true);
+  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+  butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+  // stage 24.
+  HadamardRotation(&s[16], &s[23], false, min, max);
+  HadamardRotation(&s[17], &s[22], false, min, max);
+  HadamardRotation(&s[18], &s[21], false, min, max);
+  HadamardRotation(&s[19], &s[20], false, min, max);
+  HadamardRotation(&s[24], &s[31], true, min, max);
+  HadamardRotation(&s[25], &s[30], true, min, max);
+  HadamardRotation(&s[26], &s[29], true, min, max);
+  HadamardRotation(&s[27], &s[28], true, min, max);
+
+  // stage 27.
+  butterfly_rotation(&s[27], &s[20], 32, true);
+  butterfly_rotation(&s[26], &s[21], 32, true);
+  butterfly_rotation(&s[25], &s[22], 32, true);
+  butterfly_rotation(&s[24], &s[23], 32, true);
+
+  // stage 29.
+  if (is_last_stage) {
+    HadamardRotation(&s[0], &s[31], false);
+    HadamardRotation(&s[1], &s[30], false);
+    HadamardRotation(&s[2], &s[29], false);
+    HadamardRotation(&s[3], &s[28], false);
+    HadamardRotation(&s[4], &s[27], false);
+    HadamardRotation(&s[5], &s[26], false);
+    HadamardRotation(&s[6], &s[25], false);
+    HadamardRotation(&s[7], &s[24], false);
+    HadamardRotation(&s[8], &s[23], false);
+    HadamardRotation(&s[9], &s[22], false);
+    HadamardRotation(&s[10], &s[21], false);
+    HadamardRotation(&s[11], &s[20], false);
+    HadamardRotation(&s[12], &s[19], false);
+    HadamardRotation(&s[13], &s[18], false);
+    HadamardRotation(&s[14], &s[17], false);
+    HadamardRotation(&s[15], &s[16], false);
+  } else {
+    HadamardRotation(&s[0], &s[31], false, min, max);
+    HadamardRotation(&s[1], &s[30], false, min, max);
+    HadamardRotation(&s[2], &s[29], false, min, max);
+    HadamardRotation(&s[3], &s[28], false, min, max);
+    HadamardRotation(&s[4], &s[27], false, min, max);
+    HadamardRotation(&s[5], &s[26], false, min, max);
+    HadamardRotation(&s[6], &s[25], false, min, max);
+    HadamardRotation(&s[7], &s[24], false, min, max);
+    HadamardRotation(&s[8], &s[23], false, min, max);
+    HadamardRotation(&s[9], &s[22], false, min, max);
+    HadamardRotation(&s[10], &s[21], false, min, max);
+    HadamardRotation(&s[11], &s[20], false, min, max);
+    HadamardRotation(&s[12], &s[19], false, min, max);
+    HadamardRotation(&s[13], &s[18], false, min, max);
+    HadamardRotation(&s[14], &s[17], false, min, max);
+    HadamardRotation(&s[15], &s[16], false, min, max);
+  }
+}
+
+// Process dct32 rows or columns, depending on the |is_row| flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+                                      const bool is_row, int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[32], x[32];
+
+  if (is_row) {
+    for (int idx = 0; idx < 32; idx += 8) {
+      LoadSrc<4>(dst, step, idx, &x[idx]);
+      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+      Transpose4x4(&x[idx], &x[idx]);
+      Transpose4x4(&x[idx + 4], &x[idx + 4]);
+    }
+  } else {
+    LoadSrc<32>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+  s[0] = x[0];
+  s[1] = x[16];
+  s[2] = x[8];
+  s[3] = x[24];
+  s[4] = x[4];
+  s[5] = x[20];
+  s[6] = x[12];
+  s[7] = x[28];
+  s[8] = x[2];
+  s[9] = x[18];
+  s[10] = x[10];
+  s[11] = x[26];
+  s[12] = x[6];
+  s[13] = x[22];
+  s[14] = x[14];
+  s[15] = x[30];
+
+  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+  s[16] = x[1];
+  s[17] = x[17];
+  s[18] = x[9];
+  s[19] = x[25];
+  s[20] = x[5];
+  s[21] = x[21];
+  s[22] = x[13];
+  s[23] = x[29];
+  s[24] = x[3];
+  s[25] = x[19];
+  s[26] = x[11];
+  s[27] = x[27];
+  s[28] = x[7];
+  s[29] = x[23];
+  s[30] = x[15];
+  s[31] = x[31];
+
+  Dct4Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/false);
+  Dct8Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/false);
+  Dct16Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/false);
+  Dct32Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/true);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (int idx = 0; idx < 32; idx += 8) {
+      int32x4_t output[8];
+      Transpose4x4(&s[idx], &output[0]);
+      Transpose4x4(&s[idx + 4], &output[4]);
+      for (auto& o : output) {
+        o = vmovl_s16(vqmovn_s32(vqrshlq_s32(o, v_row_shift)));
+      }
+      StoreDst<4>(dst, step, idx, &output[0]);
+      StoreDst<4>(dst, step, idx + 4, &output[4]);
+    }
+  } else {
+    StoreDst<32>(dst, step, 0, &s[0]);
+  }
+}
+
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[64], x[32];
+
+  if (is_row) {
+    // The last 32 values of every row are always zero if the |tx_width| is
+    // 64.
+    for (int idx = 0; idx < 32; idx += 8) {
+      LoadSrc<4>(dst, step, idx, &x[idx]);
+      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+      Transpose4x4(&x[idx], &x[idx]);
+      Transpose4x4(&x[idx + 4], &x[idx + 4]);
+    }
+  } else {
+    // The last 32 values of every column are always zero if the |tx_height| is
+    // 64.
+    LoadSrc<32>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+  s[0] = x[0];
+  s[2] = x[16];
+  s[4] = x[8];
+  s[6] = x[24];
+  s[8] = x[4];
+  s[10] = x[20];
+  s[12] = x[12];
+  s[14] = x[28];
+
+  // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+  s[16] = x[2];
+  s[18] = x[18];
+  s[20] = x[10];
+  s[22] = x[26];
+  s[24] = x[6];
+  s[26] = x[22];
+  s[28] = x[14];
+  s[30] = x[30];
+
+  // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+  s[32] = x[1];
+  s[34] = x[17];
+  s[36] = x[9];
+  s[38] = x[25];
+  s[40] = x[5];
+  s[42] = x[21];
+  s[44] = x[13];
+  s[46] = x[29];
+
+  // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+  s[48] = x[3];
+  s[50] = x[19];
+  s[52] = x[11];
+  s[54] = x[27];
+  s[56] = x[7];
+  s[58] = x[23];
+  s[60] = x[15];
+  s[62] = x[31];
+
+  Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+      s, min, max, /*is_last_stage=*/false);
+  Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+      s, min, max, /*is_last_stage=*/false);
+  Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+      s, min, max, /*is_last_stage=*/false);
+  Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+      s, min, max, /*is_last_stage=*/false);
+
+  //-- start dct 64 stages
+  // stage 2.
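+  // x[32]..x[63] are always zero here (only 32 values were loaded above), so
+  // the rotations can use the cheaper FirstIsZero/SecondIsZero forms, which
+  // treat the corresponding unloaded s[] inputs as zero.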
+  ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+  ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+  ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+  ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+  ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+  ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+  ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+  ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+  ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+  ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+  ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+  ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+  ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+  ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+  ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+  ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+  // stage 4.
+  HadamardRotation(&s[32], &s[33], false, min, max);
+  HadamardRotation(&s[34], &s[35], true, min, max);
+  HadamardRotation(&s[36], &s[37], false, min, max);
+  HadamardRotation(&s[38], &s[39], true, min, max);
+  HadamardRotation(&s[40], &s[41], false, min, max);
+  HadamardRotation(&s[42], &s[43], true, min, max);
+  HadamardRotation(&s[44], &s[45], false, min, max);
+  HadamardRotation(&s[46], &s[47], true, min, max);
+  HadamardRotation(&s[48], &s[49], false, min, max);
+  HadamardRotation(&s[50], &s[51], true, min, max);
+  HadamardRotation(&s[52], &s[53], false, min, max);
+  HadamardRotation(&s[54], &s[55], true, min, max);
+  HadamardRotation(&s[56], &s[57], false, min, max);
+  HadamardRotation(&s[58], &s[59], true, min, max);
+  HadamardRotation(&s[60], &s[61], false, min, max);
+  HadamardRotation(&s[62], &s[63], true, min, max);
+
+  // stage 7.
+  ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
+  ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true);
+  ButterflyRotation_4(&s[58], &s[37], 60 - 32, true);
+  ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true);
+  ButterflyRotation_4(&s[54], &s[41], 60 - 16, true);
+  ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true);
+  ButterflyRotation_4(&s[50], &s[45], 60 - 48, true);
+  ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);
+
+  // stage 11.
+  HadamardRotation(&s[32], &s[35], false, min, max);
+  HadamardRotation(&s[33], &s[34], false, min, max);
+  HadamardRotation(&s[36], &s[39], true, min, max);
+  HadamardRotation(&s[37], &s[38], true, min, max);
+  HadamardRotation(&s[40], &s[43], false, min, max);
+  HadamardRotation(&s[41], &s[42], false, min, max);
+  HadamardRotation(&s[44], &s[47], true, min, max);
+  HadamardRotation(&s[45], &s[46], true, min, max);
+  HadamardRotation(&s[48], &s[51], false, min, max);
+  HadamardRotation(&s[49], &s[50], false, min, max);
+  HadamardRotation(&s[52], &s[55], true, min, max);
+  HadamardRotation(&s[53], &s[54], true, min, max);
+  HadamardRotation(&s[56], &s[59], false, min, max);
+  HadamardRotation(&s[57], &s[58], false, min, max);
+  HadamardRotation(&s[60], &s[63], true, min, max);
+  HadamardRotation(&s[61], &s[62], true, min, max);
+
+  // stage 16.
+  ButterflyRotation_4(&s[61], &s[34], 56, true);
+  ButterflyRotation_4(&s[60], &s[35], 56, true);
+  ButterflyRotation_4(&s[59], &s[36], 56 + 64, true);
+  ButterflyRotation_4(&s[58], &s[37], 56 + 64, true);
+  ButterflyRotation_4(&s[53], &s[42], 56 - 32, true);
+  ButterflyRotation_4(&s[52], &s[43], 56 - 32, true);
+  ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true);
+  ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);
+
+  // stage 21.
+  HadamardRotation(&s[32], &s[39], false, min, max);
+  HadamardRotation(&s[33], &s[38], false, min, max);
+  HadamardRotation(&s[34], &s[37], false, min, max);
+  HadamardRotation(&s[35], &s[36], false, min, max);
+  HadamardRotation(&s[40], &s[47], true, min, max);
+  HadamardRotation(&s[41], &s[46], true, min, max);
+  HadamardRotation(&s[42], &s[45], true, min, max);
+  HadamardRotation(&s[43], &s[44], true, min, max);
+  HadamardRotation(&s[48], &s[55], false, min, max);
+  HadamardRotation(&s[49], &s[54], false, min, max);
+  HadamardRotation(&s[50], &s[53], false, min, max);
+  HadamardRotation(&s[51], &s[52], false, min, max);
+  HadamardRotation(&s[56], &s[63], true, min, max);
+  HadamardRotation(&s[57], &s[62], true, min, max);
+  HadamardRotation(&s[58], &s[61], true, min, max);
+  HadamardRotation(&s[59], &s[60], true, min, max);
+
+  // stage 25.
+  ButterflyRotation_4(&s[59], &s[36], 48, true);
+  ButterflyRotation_4(&s[58], &s[37], 48, true);
+  ButterflyRotation_4(&s[57], &s[38], 48, true);
+  ButterflyRotation_4(&s[56], &s[39], 48, true);
+  ButterflyRotation_4(&s[55], &s[40], 112, true);
+  ButterflyRotation_4(&s[54], &s[41], 112, true);
+  ButterflyRotation_4(&s[53], &s[42], 112, true);
+  ButterflyRotation_4(&s[52], &s[43], 112, true);
+
+  // stage 28.
+  HadamardRotation(&s[32], &s[47], false, min, max);
+  HadamardRotation(&s[33], &s[46], false, min, max);
+  HadamardRotation(&s[34], &s[45], false, min, max);
+  HadamardRotation(&s[35], &s[44], false, min, max);
+  HadamardRotation(&s[36], &s[43], false, min, max);
+  HadamardRotation(&s[37], &s[42], false, min, max);
+  HadamardRotation(&s[38], &s[41], false, min, max);
+  HadamardRotation(&s[39], &s[40], false, min, max);
+  HadamardRotation(&s[48], &s[63], true, min, max);
+  HadamardRotation(&s[49], &s[62], true, min, max);
+  HadamardRotation(&s[50], &s[61], true, min, max);
+  HadamardRotation(&s[51], &s[60], true, min, max);
+  HadamardRotation(&s[52], &s[59], true, min, max);
+  HadamardRotation(&s[53], &s[58], true, min, max);
+  HadamardRotation(&s[54], &s[57], true, min, max);
+  HadamardRotation(&s[55], &s[56], true, min, max);
+
+  // stage 30.
+  ButterflyRotation_4(&s[55], &s[40], 32, true);
+  ButterflyRotation_4(&s[54], &s[41], 32, true);
+  ButterflyRotation_4(&s[53], &s[42], 32, true);
+  ButterflyRotation_4(&s[52], &s[43], 32, true);
+  ButterflyRotation_4(&s[51], &s[44], 32, true);
+  ButterflyRotation_4(&s[50], &s[45], 32, true);
+  ButterflyRotation_4(&s[49], &s[46], 32, true);
+  ButterflyRotation_4(&s[48], &s[47], 32, true);
+
+  // stage 31.
+  for (int i = 0; i < 32; i += 4) {
+    HadamardRotation(&s[i], &s[63 - i], false, min, max);
+    HadamardRotation(&s[i + 1], &s[63 - i - 1], false, min, max);
+    HadamardRotation(&s[i + 2], &s[63 - i - 2], false, min, max);
+    HadamardRotation(&s[i + 3], &s[63 - i - 3], false, min, max);
+  }
+  //-- end dct 64 stages
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (int idx = 0; idx < 64; idx += 8) {
+      int32x4_t output[8];
+      Transpose4x4(&s[idx], &output[0]);
+      Transpose4x4(&s[idx + 4], &output[4]);
+      for (auto& o : output) {
+        o = vmovl_s16(vqmovn_s32(vqrshlq_s32(o, v_row_shift)));
+      }
+      StoreDst<4>(dst, step, idx, &output[0]);
+      StoreDst<4>(dst, step, idx + 4, &output[4]);
+    }
+  } else {
+    StoreDst<64>(dst, step, 0, &s[0]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
+                                      int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  int32x4_t s[8];
+  int32x4_t x[4];
+
+  if (is_row) {
+    assert(step == 4);
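+    // vld4q_s32 de-interleaves on load, so the 4x4 block arrives already
+    // transposed; no separate Transpose4x4 is needed for the row pass.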
+    int32x4x4_t y = vld4q_s32(dst);
+    for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+  } else {
+    LoadSrc<4>(dst, step, 0, x);
+  }
+
+  // stage 1.
+  s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]);
+  s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]);
+
+  // stage 2.
+  const int32x4_t a7 = vsubq_s32(x[0], x[2]);
+  const int32x4_t b7 = vaddq_s32(a7, x[3]);
+
+  // stage 3.
+  s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]);
+  s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]);
+  // s[0] = s[0] + s[3]
+  s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]);
+  // s[1] = s[1] - s[4]
+  s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]);
+
+  s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]);
+  s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+  // stage 4.
+  s[0] = vaddq_s32(s[0], s[5]);
+  s[1] = vsubq_s32(s[1], s[6]);
+
+  // stages 5 and 6.
+  const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+  const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+  const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+  const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+  x[0] = vrshrq_n_s32(x0, 12);
+  x[1] = vrshrq_n_s32(x1, 12);
+  x[2] = vrshrq_n_s32(s[2], 12);
+  x[3] = vrshrq_n_s32(x3, 12);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift)));
+    x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
+    x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
+    x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
+    int32x4x4_t y;
+    for (int i = 0; i < 4; ++i) y.val[i] = x[i];
+    vst4q_s32(dst, y);
+  } else {
+    StoreDst<4>(dst, step, 0, x);
+  }
+}
+
+alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+                                                           2482};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+                                       bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int32x4_t s[2];
+
+  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
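+  // Moving the Q12 constant up to Q31 lets vqrdmulhq_n_s32 compute
+  // RightShiftWithRounding(v_src0 * kTransformRowMultiplier, 12) in a single
+  // instruction.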
+  const int32x4_t v_src0_round =
+      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+
+  const int32x4_t v_src = vbslq_s32(v_mask, v_src0_round, v_src0);
+  const int32x4_t kAdst4DcOnlyMultipliers = vld1q_s32(kAdst4DcOnlyMultiplier);
+  s[1] = vdupq_n_s32(0);
+
+  // s0*k0 s0*k1 s0*k2 s0*k1
+  s[0] = vmulq_s32(kAdst4DcOnlyMultipliers, v_src);
+  // 0     0     0     s0*k0
+  s[1] = vextq_s32(s[1], s[0], 1);
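+  // In the add below, lane 3 becomes s0 * (2482 + 1321) = s0 * 3803; the
+  // vextq_s32 above shifts the s0*k0 term into position so one vector add
+  // finishes all four outputs.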
+
+  const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+  const int32x4_t dst_0 = vrshrq_n_s32(x3, 12);
+
+  // vqrshlq_s32 will shift right if shift value is negative.
+  vst1q_s32(dst,
+            vmovl_s16(vqmovn_s32(vqrshlq_s32(dst_0, vdupq_n_s32(-row_shift)))));
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+                                             int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int32x4_t s[4];
+
+  int i = 0;
+  do {
+    const int32x4_t v_src = vld1q_s32(&dst[i]);
+
+    s[0] = vmulq_n_s32(v_src, kAdst4Multiplier[0]);
+    s[1] = vmulq_n_s32(v_src, kAdst4Multiplier[1]);
+    s[2] = vmulq_n_s32(v_src, kAdst4Multiplier[2]);
+
+    const int32x4_t x0 = s[0];
+    const int32x4_t x1 = s[1];
+    const int32x4_t x2 = s[2];
+    const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+    const int32x4_t dst_0 = vrshrq_n_s32(x0, 12);
+    const int32x4_t dst_1 = vrshrq_n_s32(x1, 12);
+    const int32x4_t dst_2 = vrshrq_n_s32(x2, 12);
+    const int32x4_t dst_3 = vrshrq_n_s32(x3, 12);
+
+    vst1q_s32(&dst[i], dst_0);
+    vst1q_s32(&dst[i + width * 1], dst_1);
+    vst1q_s32(&dst[i + width * 2], dst_2);
+    vst1q_s32(&dst[i + width * 3], dst_3);
+
+    i += 4;
+  } while (i < width);
+
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row,
+                                      int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[8], x[8];
+
+  if (is_row) {
+    LoadSrc<4>(dst, step, 0, &x[0]);
+    LoadSrc<4>(dst, step, 4, &x[4]);
+    Transpose4x4(&x[0], &x[0]);
+    Transpose4x4(&x[4], &x[4]);
+  } else {
+    LoadSrc<8>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1.
+  s[0] = x[7];
+  s[1] = x[0];
+  s[2] = x[5];
+  s[3] = x[2];
+  s[4] = x[3];
+  s[5] = x[4];
+  s[6] = x[1];
+  s[7] = x[6];
+
+  // stage 2.
+  butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+  butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+  butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+  // stage 3.
+  HadamardRotation(&s[0], &s[4], false, min, max);
+  HadamardRotation(&s[1], &s[5], false, min, max);
+  HadamardRotation(&s[2], &s[6], false, min, max);
+  HadamardRotation(&s[3], &s[7], false, min, max);
+
+  // stage 4.
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+  // stage 5.
+  HadamardRotation(&s[0], &s[2], false, min, max);
+  HadamardRotation(&s[4], &s[6], false, min, max);
+  HadamardRotation(&s[1], &s[3], false, min, max);
+  HadamardRotation(&s[5], &s[7], false, min, max);
+
+  // stage 6.
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+
+  // stage 7.
+  x[0] = s[0];
+  x[1] = vqnegq_s32(s[4]);
+  x[2] = s[6];
+  x[3] = vqnegq_s32(s[2]);
+  x[4] = s[3];
+  x[5] = vqnegq_s32(s[7]);
+  x[6] = s[5];
+  x[7] = vqnegq_s32(s[1]);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (auto& i : x) {
+      i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
+    }
+    Transpose4x4(&x[0], &x[0]);
+    Transpose4x4(&x[4], &x[4]);
+    StoreDst<4>(dst, step, 0, &x[0]);
+    StoreDst<4>(dst, step, 4, &x[4]);
+  } else {
+    StoreDst<8>(dst, step, 0, &x[0]);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+                                       bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int32x4_t s[8];
+
+  const int32x4_t v_src = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+  // stage 1.
+  s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+  // stage 2.
+  ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+  // stage 3.
+  s[4] = s[0];
+  s[5] = s[1];
+
+  // stage 4.
+  ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+  // stage 5.
+  s[2] = s[0];
+  s[3] = s[1];
+  s[6] = s[4];
+  s[7] = s[5];
+
+  // stage 6.
+  ButterflyRotation_4(&s[2], &s[3], 32, true);
+  ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+  // stage 7.
+  int32x4_t x[8];
+  x[0] = s[0];
+  x[1] = vqnegq_s32(s[4]);
+  x[2] = s[6];
+  x[3] = vqnegq_s32(s[2]);
+  x[4] = s[3];
+  x[5] = vqnegq_s32(s[7]);
+  x[6] = s[5];
+  x[7] = vqnegq_s32(s[1]);
+
+  for (int i = 0; i < 8; ++i) {
+    // vqrshlq_s32 will shift right if shift value is negative.
+    x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+    vst1q_lane_s32(&dst[i], x[i], 0);
+  }
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+                                             int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int32x4_t s[8];
+
+  int i = 0;
+  do {
+    const int32x4_t v_src = vld1q_s32(dst);
+    // stage 1.
+    s[1] = v_src;
+
+    // stage 2.
+    ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+    // stage 3.
+    s[4] = s[0];
+    s[5] = s[1];
+
+    // stage 4.
+    ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+    // stage 5.
+    s[2] = s[0];
+    s[3] = s[1];
+    s[6] = s[4];
+    s[7] = s[5];
+
+    // stage 6.
+    ButterflyRotation_4(&s[2], &s[3], 32, true);
+    ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+    // stage 7.
+    int32x4_t x[8];
+    x[0] = s[0];
+    x[1] = vqnegq_s32(s[4]);
+    x[2] = s[6];
+    x[3] = vqnegq_s32(s[2]);
+    x[4] = s[3];
+    x[5] = vqnegq_s32(s[7]);
+    x[6] = s[5];
+    x[7] = vqnegq_s32(s[1]);
+
+    for (int j = 0; j < 8; ++j) {
+      vst1q_s32(&dst[j * width], x[j]);
+    }
+    i += 4;
+    dst += 4;
+  } while (i < width);
+
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+                                       int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[16], x[16];
+
+  if (is_row) {
+    for (int idx = 0; idx < 16; idx += 8) {
+      LoadSrc<4>(dst, step, idx, &x[idx]);
+      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+      Transpose4x4(&x[idx], &x[idx]);
+      Transpose4x4(&x[idx + 4], &x[idx + 4]);
+    }
+  } else {
+    LoadSrc<16>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1.
+  s[0] = x[15];
+  s[1] = x[0];
+  s[2] = x[13];
+  s[3] = x[2];
+  s[4] = x[11];
+  s[5] = x[4];
+  s[6] = x[9];
+  s[7] = x[6];
+  s[8] = x[7];
+  s[9] = x[8];
+  s[10] = x[5];
+  s[11] = x[10];
+  s[12] = x[3];
+  s[13] = x[12];
+  s[14] = x[1];
+  s[15] = x[14];
+
+  // stage 2.
+  butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+  butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+  butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+  butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+  butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+  butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+  butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+  // stage 3.
+  HadamardRotation(&s[0], &s[8], false, min, max);
+  HadamardRotation(&s[1], &s[9], false, min, max);
+  HadamardRotation(&s[2], &s[10], false, min, max);
+  HadamardRotation(&s[3], &s[11], false, min, max);
+  HadamardRotation(&s[4], &s[12], false, min, max);
+  HadamardRotation(&s[5], &s[13], false, min, max);
+  HadamardRotation(&s[6], &s[14], false, min, max);
+  HadamardRotation(&s[7], &s[15], false, min, max);
+
+  // stage 4.
+  butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+  butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+  butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+  // stage 5.
+  HadamardRotation(&s[0], &s[4], false, min, max);
+  HadamardRotation(&s[8], &s[12], false, min, max);
+  HadamardRotation(&s[1], &s[5], false, min, max);
+  HadamardRotation(&s[9], &s[13], false, min, max);
+  HadamardRotation(&s[2], &s[6], false, min, max);
+  HadamardRotation(&s[10], &s[14], false, min, max);
+  HadamardRotation(&s[3], &s[7], false, min, max);
+  HadamardRotation(&s[11], &s[15], false, min, max);
+
+  // stage 6.
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+  // stage 7.
+  HadamardRotation(&s[0], &s[2], false, min, max);
+  HadamardRotation(&s[4], &s[6], false, min, max);
+  HadamardRotation(&s[8], &s[10], false, min, max);
+  HadamardRotation(&s[12], &s[14], false, min, max);
+  HadamardRotation(&s[1], &s[3], false, min, max);
+  HadamardRotation(&s[5], &s[7], false, min, max);
+  HadamardRotation(&s[9], &s[11], false, min, max);
+  HadamardRotation(&s[13], &s[15], false, min, max);
+
+  // stage 8.
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+  butterfly_rotation(&s[10], &s[11], 32, true);
+  butterfly_rotation(&s[14], &s[15], 32, true);
+
+  // stage 9.
+  x[0] = s[0];
+  x[1] = vqnegq_s32(s[8]);
+  x[2] = s[12];
+  x[3] = vqnegq_s32(s[4]);
+  x[4] = s[6];
+  x[5] = vqnegq_s32(s[14]);
+  x[6] = s[10];
+  x[7] = vqnegq_s32(s[2]);
+  x[8] = s[3];
+  x[9] = vqnegq_s32(s[11]);
+  x[10] = s[15];
+  x[11] = vqnegq_s32(s[7]);
+  x[12] = s[5];
+  x[13] = vqnegq_s32(s[13]);
+  x[14] = s[9];
+  x[15] = vqnegq_s32(s[1]);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (auto& i : x) {
+      i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
+    }
+    for (int idx = 0; idx < 16; idx += 8) {
+      Transpose4x4(&x[idx], &x[idx]);
+      Transpose4x4(&x[idx + 4], &x[idx + 4]);
+      StoreDst<4>(dst, step, idx, &x[idx]);
+      StoreDst<4>(dst, step, idx + 4, &x[idx + 4]);
+    }
+  } else {
+    StoreDst<16>(dst, step, 0, &x[0]);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) {
+  // stage 2.
+  ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+  // stage 3.
+  s[8] = s[0];
+  s[9] = s[1];
+
+  // stage 4.
+  ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+  // stage 5.
+  s[4] = s[0];
+  s[12] = s[8];
+  s[5] = s[1];
+  s[13] = s[9];
+
+  // stage 6.
+  ButterflyRotation_4(&s[4], &s[5], 48, true);
+  ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+  // stage 7.
+  s[2] = s[0];
+  s[6] = s[4];
+  s[10] = s[8];
+  s[14] = s[12];
+  s[3] = s[1];
+  s[7] = s[5];
+  s[11] = s[9];
+  s[15] = s[13];
+
+  // stage 8.
+  ButterflyRotation_4(&s[2], &s[3], 32, true);
+  ButterflyRotation_4(&s[6], &s[7], 32, true);
+  ButterflyRotation_4(&s[10], &s[11], 32, true);
+  ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+  // stage 9.
+  x[0] = s[0];
+  x[1] = vqnegq_s32(s[8]);
+  x[2] = s[12];
+  x[3] = vqnegq_s32(s[4]);
+  x[4] = s[6];
+  x[5] = vqnegq_s32(s[14]);
+  x[6] = s[10];
+  x[7] = vqnegq_s32(s[2]);
+  x[8] = s[3];
+  x[9] = vqnegq_s32(s[11]);
+  x[10] = s[15];
+  x[11] = vqnegq_s32(s[7]);
+  x[12] = s[5];
+  x[13] = vqnegq_s32(s[13]);
+  x[14] = s[9];
+  x[15] = vqnegq_s32(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+                                        bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int32x4_t s[16];
+  int32x4_t x[16];
+  const int32x4_t v_src = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+  // stage 1.
+  s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+  Adst16DcOnlyInternal(s, x);
+
+  for (int i = 0; i < 16; ++i) {
+    // vqrshlq_s32 will shift right if shift value is negative.
+    x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+    vst1q_lane_s32(&dst[i], x[i], 0);
+  }
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+                                              int adjusted_tx_height,
+                                              int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int i = 0;
+  do {
+    int32x4_t s[16];
+    int32x4_t x[16];
+    const int32x4_t v_src = vld1q_s32(dst);
+    // stage 1.
+    s[1] = v_src;
+
+    Adst16DcOnlyInternal(s, x);
+
+    for (int j = 0; j < 16; ++j) {
+      vst1q_s32(&dst[j * width], x[j]);
+    }
+    i += 4;
+    dst += 4;
+  } while (i < width);
+
+  return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
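+  // (1 << 11) is the rounding term for the >> 12 of the multiply and
+  // (shift << 12) pre-adds the rounding term for the extra >> shift, hence
+  // the "dual" round.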
+  const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+  for (int i = 0; i < 4; ++i) {
+    const int32x4_t v_src = vld1q_s32(&dst[i * step]);
+    const int32x4_t v_src_mult_lo =
+        vmlaq_s32(v_dual_round, v_src, v_multiplier);
+    const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+    vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(shift_lo)));
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+                                           bool should_round, int tx_height) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+  const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+  const int shift = tx_height < 16 ? 0 : 1;
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+  const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier);
+  const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+  return true;
+}
+
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+    Array2DView<uint16_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int32_t* LIBGAV1_RESTRICT source) {
+  static_assert(identity_size == 4 || identity_size == 8 ||
+                    identity_size == 16 || identity_size == 32,
+                "Invalid identity_size.");
+  const int stride = frame.columns();
+  uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+  if (identity_size < 32) {
+    if (tx_width == 4) {
+      int i = 0;
+      do {
+        int32x4x2_t v_src, v_dst_i, a, b;
+        v_src.val[0] = vld1q_s32(&source[i * 4]);
+        v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
+        if (identity_size == 4) {
+          v_dst_i.val[0] =
+              vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+          v_dst_i.val[1] =
+              vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+          a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+          a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+        } else if (identity_size == 8) {
+          v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+          v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+          a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+          a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+        } else {  // identity_size == 16
+          v_dst_i.val[0] =
+              vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+          v_dst_i.val[1] =
+              vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+          a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+          a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+        }
+        uint16x4x2_t frame_data;
+        frame_data.val[0] = vld1_u16(dst);
+        frame_data.val[1] = vld1_u16(dst + stride);
+        b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+        b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+        vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+        vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+        dst += stride << 1;
+        i += 2;
+      } while (i < tx_height);
+    } else {
+      int i = 0;
+      do {
+        const int row = i * tx_width;
+        int j = 0;
+        do {
+          int32x4x2_t v_src, v_dst_i, a, b;
+          v_src.val[0] = vld1q_s32(&source[row + j]);
+          v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+          if (identity_size == 4) {
+            v_dst_i.val[0] =
+                vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+            v_dst_i.val[1] =
+                vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+            a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+            a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+          } else if (identity_size == 8) {
+            v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+            v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+            a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+            a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+          } else {  // identity_size == 16
+            v_dst_i.val[0] =
+                vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+            v_dst_i.val[1] =
+                vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+            a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+            a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+          }
+          uint16x4x2_t frame_data;
+          frame_data.val[0] = vld1_u16(dst + j);
+          frame_data.val[1] = vld1_u16(dst + j + 4);
+          b.val[0] =
+              vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+          b.val[1] =
+              vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+          vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+          vst1_u16(dst + j + 4,
+                   vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+          j += 8;
+        } while (j < tx_width);
+        dst += stride;
+      } while (++i < tx_height);
+    }
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const int32x4_t v_dst_i = vld1q_s32(&source[row + j]);
+        const uint16x4_t frame_data = vld1_u16(dst + j);
+        const int32x4_t a = vrshrq_n_s32(v_dst_i, 2);
+        const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
+        const uint16x4_t d = vmin_u16(vqmovun_s32(b), v_max_bitdepth);
+        vst1_u16(dst + j, d);
+        j += 4;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+    Array2DView<uint16_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int32_t* LIBGAV1_RESTRICT source) {
+  const int stride = frame.columns();
+  uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+  const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+  if (tx_width == 4) {
+    int i = 0;
+    do {
+      const int32x4_t v_src = vld1q_s32(&source[i * 4]);
+      const int32x4_t v_dst_row =
+          vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12);
+      const int32x4_t v_dst_col =
+          vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier);
+      const uint16x4_t frame_data = vld1_u16(dst);
+      const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12);
+      const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
+      vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+      dst += stride;
+    } while (++i < tx_height);
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b;
+        v_src.val[0] = vld1q_s32(&source[row + j]);
+        v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+        v_src_round.val[0] = vshrq_n_s32(
+            vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12);
+        v_src_round.val[1] = vshrq_n_s32(
+            vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12);
+        v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]);
+        v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]);
+        v_dst_col.val[0] =
+            vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier);
+        v_dst_col.val[1] =
+            vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier);
+        uint16x4x2_t frame_data;
+        frame_data.val[0] = vld1_u16(dst + j);
+        frame_data.val[1] = vld1_u16(dst + j + 4);
+        a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12);
+        a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12);
+        b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+        b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+        vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+        vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+  auto* const dst = static_cast<int32_t*>(dest);
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height equal to 32 can be simplified from
+  // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
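+  // For example, A = 5: ((5 * 2) + 2) >> 2 == 3 == (5 + 1) >> 1.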
+  for (int i = 0; i < 4; ++i) {
+    const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+    const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+    const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1);
+    const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1);
+    vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo)));
+    vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi)));
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+  auto* const dst = static_cast<int32_t*>(dest);
+
+  for (int i = 0; i < 4; ++i) {
+    const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+    const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+    const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo);
+    const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi);
+    vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo)));
+    vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi)));
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+                                           bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+  const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+  const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src);
+  const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+                                              int shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      int32x4x2_t v_src;
+      v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]);
+      v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]);
+      const int32x4_t v_src_mult_lo =
+          vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+      const int32x4_t v_src_mult_hi =
+          vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+      const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+      const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+      vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo)));
+      vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi)));
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+                                            bool should_round, int shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+  const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int32x4_t v_src_mult_lo =
+      vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier);
+  const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift)));
+  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest,
+                                                const int32_t step) {
+  auto* const dst = static_cast<int32_t*>(dest);
+
+  // When combining the identity32 multiplier with the row shift, the
+  // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
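+  // For example, A = 5: ((5 * 4) + 1) >> 1 == 10 == 5 * 2.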
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 32; j += 4) {
+      const int32x4_t v_src = vld1q_s32(&dst[i * step + j]);
+      const int32x4_t v_dst_i = vqaddq_s32(v_src, v_src);
+      vst1q_s32(&dst[i * step + j], v_dst_i);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+                                            int adjusted_tx_height) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x2_t v_src0 = vdup_n_s32(dst[0]);
+  const int32x2_t v_src =
+      vqrdmulh_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+  // When combining the identity32 multiplier with the row shift, the
+  // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
+  const int32x2_t v_dst_0 = vqadd_s32(v_src, v_src);
+  vst1_lane_s32(dst, v_dst_0, 0);
+  return true;
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint16_t* LIBGAV1_RESTRICT dst,
+                                     const int dst_stride,
+                                     const void* LIBGAV1_RESTRICT source,
+                                     const int adjusted_tx_height) {
+  const auto* const src = static_cast<const int32_t*>(source);
+  int32x4_t s[4];
+
+  if (adjusted_tx_height == 1) {
+    // Special case: only src[0] is nonzero.
+    //   src[0]  0   0   0
+    //       0   0   0   0
+    //       0   0   0   0
+    //       0   0   0   0
+    //
+    // After the row and column transforms are applied, we have:
+    //       f   h   h   h
+    //       g   i   i   i
+    //       g   i   i   i
+    //       g   i   i   i
+    // where f, g, h, i are computed as follows.
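+    // These values fall out of applying the row butterflies to
+    // (src[0] >> 2, 0, 0, 0) and the column butterflies to each resulting
+    // column.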
+    int32_t f = (src[0] >> 2) - (src[0] >> 3);
+    const int32_t g = f >> 1;
+    f = f - (f >> 1);
+    const int32_t h = (src[0] >> 3) - (src[0] >> 4);
+    const int32_t i = (src[0] >> 4);
+    s[0] = vdupq_n_s32(h);
+    s[0] = vsetq_lane_s32(f, s[0], 0);
+    s[1] = vdupq_n_s32(i);
+    s[1] = vsetq_lane_s32(g, s[1], 0);
+    s[2] = s[3] = s[1];
+  } else {
+    // Load the 4x4 source in transposed form.
+    int32x4x4_t columns = vld4q_s32(src);
+
+    // Shift right and permute the columns for the WHT.
+    s[0] = vshrq_n_s32(columns.val[0], 2);
+    s[2] = vshrq_n_s32(columns.val[1], 2);
+    s[3] = vshrq_n_s32(columns.val[2], 2);
+    s[1] = vshrq_n_s32(columns.val[3], 2);
+
+    // Row transforms.
+    s[0] = vaddq_s32(s[0], s[2]);
+    s[3] = vsubq_s32(s[3], s[1]);
+    int32x4_t e = vhsubq_s32(s[0], s[3]);  // e = (s[0] - s[3]) >> 1
+    s[1] = vsubq_s32(e, s[1]);
+    s[2] = vsubq_s32(e, s[2]);
+    s[0] = vsubq_s32(s[0], s[1]);
+    s[3] = vaddq_s32(s[3], s[2]);
+
+    int32x4_t x[4];
+    Transpose4x4(s, x);
+
+    s[0] = x[0];
+    s[2] = x[1];
+    s[3] = x[2];
+    s[1] = x[3];
+
+    // Column transforms.
+    s[0] = vaddq_s32(s[0], s[2]);
+    s[3] = vsubq_s32(s[3], s[1]);
+    e = vhsubq_s32(s[0], s[3]);  // e = (s[0] - s[3]) >> 1
+    s[1] = vsubq_s32(e, s[1]);
+    s[2] = vsubq_s32(e, s[2]);
+    s[0] = vsubq_s32(s[0], s[1]);
+    s[3] = vaddq_s32(s[3], s[2]);
+  }
+
+  // Store to frame.
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  for (int row = 0; row < 4; row += 1) {
+    const uint16x4_t frame_data = vld1_u16(dst);
+    const int32x4_t b = vaddw_s16(s[row], vreinterpret_s16_u16(frame_data));
+    vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+    dst += dst_stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) {
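+  // Reverse each row of |tx_width| values: vrev64q_s32 swaps lanes within
+  // each 64-bit pair and vextq_s32(v, v, 2) then swaps the pairs, turning
+  // 00 01 02 03 into 03 02 01 00.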
+  if (tx_width >= 16) {
+    int i = 0;
+    do {
+      // 00 01 02 03
+      const int32x4_t a = vld1q_s32(&source[i]);
+      const int32x4_t b = vld1q_s32(&source[i + 4]);
+      const int32x4_t c = vld1q_s32(&source[i + 8]);
+      const int32x4_t d = vld1q_s32(&source[i + 12]);
+      // 01 00 03 02
+      const int32x4_t a_rev = vrev64q_s32(a);
+      const int32x4_t b_rev = vrev64q_s32(b);
+      const int32x4_t c_rev = vrev64q_s32(c);
+      const int32x4_t d_rev = vrev64q_s32(d);
+      // 03 02 01 00
+      vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2));
+      vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2));
+      vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2));
+      vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2));
+      i += 16;
+    } while (i < tx_width * tx_height);
+  } else if (tx_width == 8) {
+    for (int i = 0; i < 8 * tx_height; i += 8) {
+      // 00 01 02 03
+      const int32x4_t a = vld1q_s32(&source[i]);
+      const int32x4_t b = vld1q_s32(&source[i + 4]);
+      // 01 00 03 02
+      const int32x4_t a_rev = vrev64q_s32(a);
+      const int32x4_t b_rev = vrev64q_s32(b);
+      // 03 02 01 00
+      vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2));
+      vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2));
+    }
+  } else {
+    // Process two rows per iteration.
+    for (int i = 0; i < 4 * tx_height; i += 8) {
+      // 00 01 02 03
+      const int32x4_t a = vld1q_s32(&source[i]);
+      const int32x4_t b = vld1q_s32(&source[i + 4]);
+      // 01 00 03 02
+      const int32x4_t a_rev = vrev64q_s32(a);
+      const int32x4_t b_rev = vrev64q_s32(b);
+      // 03 02 01 00
+      vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2));
+      vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2));
+    }
+  }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) {
+  // Process two rows per iteration.
+  int i = 0;
+  do {
+    const int32x4_t a_lo = vld1q_s32(&source[i]);
+    const int32x4_t a_hi = vld1q_s32(&source[i + 4]);
+    const int32x4_t b_lo =
+        vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12));
+    const int32x4_t b_hi =
+        vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12));
+    vst1q_s32(&source[i], b_lo);
+    vst1q_s32(&source[i + 4], b_hi);
+    i += 8;
+  } while (i < tx_width * num_rows);
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
+                                    int row_shift) {
+  // vqrshlq_s32 will shift right if shift value is negative.
+  row_shift = -row_shift;
+
+  // Process two rows per iteration.
+  int i = 0;
+  do {
+    const int32x4_t residual0 = vld1q_s32(&source[i]);
+    const int32x4_t residual1 = vld1q_s32(&source[i + 4]);
+    vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift)));
+    vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift)));
+    i += 8;
+  } while (i < tx_width * num_rows);
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+    Array2DView<uint16_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int32_t* LIBGAV1_RESTRICT source,
+    TransformType tx_type) {
+  const bool flip_rows =
+      enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+  const int stride = frame.columns();
+  uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+  if (tx_width == 4) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+      const int32x4_t residual = vld1q_s32(&source[row]);
+      const uint16x4_t frame_data = vld1_u16(dst);
+      const int32x4_t a = vrshrq_n_s32(residual, 4);
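+      // The residual may be negative; adding it as a bit-identical u32 keeps
+      // the two's-complement sum intact, and vqmovun_s32 then clamps the
+      // reinterpreted result to be non-negative.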
+      const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data);
+      const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+      vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1)));
+      dst += stride;
+    }
+  } else {
+    for (int i = 0; i < tx_height; ++i) {
+      const int y = start_y + i;
+      const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+      int j = 0;
+      do {
+        const int x = start_x + j;
+        const int32x4_t residual = vld1q_s32(&source[row + j]);
+        const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]);
+        const uint16x8_t frame_data = vld1q_u16(frame[y] + x);
+        const int32x4_t a = vrshrq_n_s32(residual, 4);
+        const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4);
+        const uint32x4_t b =
+            vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data));
+        const uint32x4_t b_hi =
+            vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data));
+        const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+        const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi));
+        vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi),
+                                          vdupq_n_u16((1 << kBitdepth10) - 1)));
+        j += 8;
+      } while (j < tx_width);
+    }
+  }
+}
+
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+                               int adjusted_tx_height, void* src_buffer,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  const int row_shift = static_cast<int>(tx_height == 16);
+
+  if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d dct4 rows in parallel per iteration.
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true,
+                                   row_shift);
+    data += 16;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d dct4 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct4_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+                                     /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+                               int adjusted_tx_height, void* src_buffer,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d dct8 rows in parallel per iteration.
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Dct8_NEON<ButterflyRotation_4>(data, /*step=*/8, /*is_row=*/true,
+                                   row_shift);
+    data += 32;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d dct8 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+                                     /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    // Process 4 1d dct16 rows in parallel per iteration.
+    Dct16_NEON<ButterflyRotation_4>(data, 16, /*is_row=*/true, row_shift);
+    data += 64;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d dct16 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+                                      /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<32>(src, adjusted_tx_height);
+  }
+
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    // Process 4 1d dct32 rows in parallel per iteration.
+    Dct32_NEON(data, 32, /*is_row=*/true, row_shift);
+    data += 128;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<32>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d dct32 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<64>(src, adjusted_tx_height);
+  }
+
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    // Process 4 1d dct64 rows in parallel per iteration.
+    Dct64_NEON(data, 64, /*is_row=*/true, row_shift);
+    data += 128 * 2;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<64>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d dct64 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const int row_shift = static_cast<int>(tx_height == 16);
+  const bool should_round = (tx_height == 8);
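+  // With a width of 4 the height is 4, 8 or 16, so only 4x16 rows are
+  // shifted and only 4x8 rows are rounded.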
+
+  if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst4 rows in parallel per iteration.
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift);
+    data += 16;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d adst4 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                      tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst8 rows in parallel per iteration.
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8,
+                                    /*transpose=*/true, row_shift);
+    data += 32;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d adst8 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false,
+                                      /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                      tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                 TransformSize tx_size, int adjusted_tx_height,
+                                 void* src_buffer, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  do {
+    // Process 4 1d adst16 rows in parallel per iteration.
+    Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift);
+    src += 64;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height,
+                                    void* LIBGAV1_RESTRICT src_buffer,
+                                    int start_x, int start_y,
+                                    void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    int i = tx_width;
+    auto* data = src;
+    do {
+      // Process 4 1d adst16 columns in parallel per iteration.
+      Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+                                       /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                       tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int /*start_x*/, int /*start_y*/,
+                                    void* /*dst_frame*/) {
+  // Special case: process the row calculations during the column transform
+  // call; this improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize4x4) {
+    return;
+  }
+
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+
+  if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  const int shift = tx_height > 8 ? 1 : 0;
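+  // Heights for width 4 are 4, 8 and 16, so the shift is 1 only for 4x16.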
+  int i = adjusted_tx_height;
+  do {
+    Identity4_NEON(src, /*step=*/4, shift);
+    src += 16;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height,
+                                       void* LIBGAV1_RESTRICT src_buffer,
+                                       int start_x, int start_y,
+                                       void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  // Special case: Process row calculations during column transform call.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                                   adjusted_tx_height, src);
+    return;
+  }
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+                                adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int /*start_x*/, int /*start_y*/,
+                                    void* /*dst_frame*/) {
+  // Special case: process the row calculations during the column transform
+  // call; this improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize8x4) {
+    return;
+  }
+
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from (((A * 2) + 1) >> 1) to A. For 10bpp, A must be clamped to a signed
+  // 16-bit value.
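+  // tx_height is a power of two, so (tx_height & 0x18) != 0 selects exactly
+  // tx_height == 8 or tx_height == 16.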
+  if ((tx_height & 0x18) != 0) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]);
+      const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]);
+      vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo)));
+      vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi)));
+    }
+    return;
+  }
+  if (tx_height == 32) {
+    int i = adjusted_tx_height;
+    do {
+      Identity8Row32_NEON(src, /*step=*/8);
+      src += 32;
+      i -= 4;
+    } while (i != 0);
+    return;
+  }
+
+  assert(tx_size == kTransformSize8x4);
+  int i = adjusted_tx_height;
+  do {
+    Identity8Row4_NEON(src, /*step=*/8);
+    src += 32;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height,
+                                       void* LIBGAV1_RESTRICT src_buffer,
+                                       int start_x, int start_y,
+                                       void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+                                adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int /*start_x*/, int /*start_y*/,
+                                     void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+  int i = adjusted_tx_height;
+  do {
+    Identity16Row_NEON(src, /*step=*/16, row_shift);
+    src += 64;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+                                        TransformSize tx_size,
+                                        int adjusted_tx_height,
+                                        void* LIBGAV1_RESTRICT src_buffer,
+                                        int start_x, int start_y,
+                                        void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+                                 adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int /*start_x*/, int /*start_y*/,
+                                     void* /*dst_frame*/) {
+  const int tx_height = kTransformHeight[tx_size];
+
+  // When combining the identity32 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from (((A * 4) + 2) >> 2) to A.
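+  // (tx_height & 0x28) != 0 selects exactly tx_height == 8 or tx_height == 32.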
+  if ((tx_height & 0x28) != 0) {
+    return;
+  }
+
+  // Process kTransformSize32x16. The src is always rounded before the identity
+  // transform and shifted by 1 afterwards.
+  auto* src = static_cast<int32_t*>(src_buffer);
+  if (Identity32DcOnly(src, adjusted_tx_height)) {
+    return;
+  }
+
+  assert(tx_size == kTransformSize32x16);
+  ApplyRounding<32>(src, adjusted_tx_height);
+  int i = adjusted_tx_height;
+  do {
+    Identity32Row16_NEON(src, /*step=*/32);
+    src += 128;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
+                                        TransformSize tx_size,
+                                        int adjusted_tx_height,
+                                        void* LIBGAV1_RESTRICT src_buffer,
+                                        int start_x, int start_y,
+                                        void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width,
+                                 adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
+                               int /*adjusted_tx_height*/, void* /*src_buffer*/,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+  // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+
+  // Process 4 1d wht4 rows and columns in parallel.
+  const auto* src = static_cast<int32_t*>(src_buffer);
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  uint16_t* dst = frame[start_y] + start_x;
+  const int dst_stride = frame.columns();
+  Wht4_NEON(dst, dst_stride, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  // Maximum transform size for Dct is 64.
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+      Dct4TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+      Dct4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+      Dct8TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+      Dct8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+      Dct16TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+      Dct16TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+      Dct32TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+      Dct32TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+      Dct64TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+      Dct64TransformLoopColumn_NEON;
+
+  // Maximum transform size for Adst is 16.
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+      Adst4TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+      Adst4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+      Adst8TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+      Adst8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+      Adst16TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+      Adst16TransformLoopColumn_NEON;
+
+  // Maximum transform size for Identity transform is 32.
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+      Identity4TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+      Identity4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+      Identity8TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+      Identity8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+      Identity16TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+      Identity16TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+      Identity32TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+      Identity32TransformLoopColumn_NEON;
+
+  // Maximum transform size for Wht is 4.
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+      Wht4TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+      Wht4TransformLoopColumn_NEON;
+}
+
+}  // namespace
+
+void InverseTransformInit10bpp_NEON() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit10bpp_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
new file mode 100644 (file)
index 0000000..cc4e4a4
--- /dev/null
@@ -0,0 +1,3166 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+// Note: this is only used in the final stage of Dct32/64 and Adst16, as the
+// in-place version causes additional stack usage with clang.
+LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8],
+                                        int16x8_t out[8]) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // a4: 40 41 42 43 44 45 46 47
+  // a5: 50 51 52 53 54 55 56 57
+  // a6: 60 61 62 63 64 65 66 67
+  // a7: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  // b2.val[0]: 40 50 42 52 44 54 46 56
+  // b2.val[1]: 41 51 43 53 45 55 47 57
+  // b3.val[0]: 60 70 62 72 64 74 66 76
+  // b3.val[1]: 61 71 63 73 65 75 67 77
+
+  const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+  const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+  const int16x8x2_t b2 = vtrnq_s16(in[4], in[5]);
+  const int16x8x2_t b3 = vtrnq_s16(in[6], in[7]);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  // c2.val[0]: 40 50 60 70 44 54 64 74
+  // c2.val[1]: 42 52 62 72 46 56 66 76
+  // c3.val[0]: 41 51 61 71 45 55 65 75
+  // c3.val[1]: 43 53 63 73 47 57 67 77
+
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+                                   vreinterpretq_s32_s16(b3.val[0]));
+  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+                                   vreinterpretq_s32_s16(b3.val[1]));
+
+  // Swap 64 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70
+  // d0.val[1]: 04 14 24 34 44 54 64 74
+  // d1.val[0]: 01 11 21 31 41 51 61 71
+  // d1.val[1]: 05 15 25 35 45 55 65 75
+  // d2.val[0]: 02 12 22 32 42 52 62 72
+  // d2.val[1]: 06 16 26 36 46 56 66 76
+  // d3.val[0]: 03 13 23 33 43 53 63 73
+  // d3.val[1]: 07 17 27 37 47 57 67 77
+  const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
+  const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
+  const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
+  const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]);
+
+  out[0] = d0.val[0];
+  out[1] = d1.val[0];
+  out[2] = d2.val[0];
+  out[3] = d3.val[0];
+  out[4] = d0.val[1];
+  out[5] = d1.val[1];
+  out[6] = d2.val[1];
+  out[7] = d3.val[1];
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const uint16x8_t in[8],
+                                             uint16x8_t out[4]) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // a4: 40 41 42 43
+  // a5: 50 51 52 53
+  // a6: 60 61 62 63
+  // a7: 70 71 72 73
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+  // b2.val[0]: 40 50 42 52
+  // b2.val[1]: 41 51 43 53
+  // b3.val[0]: 60 70 62 72
+  // b3.val[1]: 61 71 63 73
+
+  uint16x4x2_t b0 = vtrn_u16(vget_low_u16(in[0]), vget_low_u16(in[1]));
+  uint16x4x2_t b1 = vtrn_u16(vget_low_u16(in[2]), vget_low_u16(in[3]));
+  uint16x4x2_t b2 = vtrn_u16(vget_low_u16(in[4]), vget_low_u16(in[5]));
+  uint16x4x2_t b3 = vtrn_u16(vget_low_u16(in[6]), vget_low_u16(in[7]));
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 02 12 22 32
+  // c1.val[0]: 01 11 21 31
+  // c1.val[1]: 03 13 23 33
+  // c2.val[0]: 40 50 60 70
+  // c2.val[1]: 42 52 62 72
+  // c3.val[0]: 41 51 61 71
+  // c3.val[1]: 43 53 63 73
+
+  uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+                             vreinterpret_u32_u16(b1.val[0]));
+  uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
+                             vreinterpret_u32_u16(b1.val[1]));
+  uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
+                             vreinterpret_u32_u16(b3.val[0]));
+  uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
+                             vreinterpret_u32_u16(b3.val[1]));
+
+  // Swap 64 bit elements resulting in:
+  // o0: 00 10 20 30 40 50 60 70
+  // o1: 01 11 21 31 41 51 61 71
+  // o2: 02 12 22 32 42 52 62 72
+  // o3: 03 13 23 33 43 53 63 73
+
+  out[0] = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
+                        vreinterpret_u16_u32(c2.val[0]));
+  out[1] = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
+                        vreinterpret_u16_u32(c3.val[0]));
+  out[2] = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
+                        vreinterpret_u16_u32(c2.val[1]));
+  out[3] = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
+                        vreinterpret_u16_u32(c3.val[1]));
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const int16x8_t in[8],
+                                             int16x8_t out[4]) {
+  Transpose4x8To8x4(reinterpret_cast<const uint16x8_t*>(in),
+                    reinterpret_cast<uint16x8_t*>(out));
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8(const int16x8_t in[4],
+                                             int16x8_t out[8]) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+  const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+
+  // The upper 8 bytes are don't cares.
+  // out[0]: 00 10 20 30 04 14 24 34
+  // out[1]: 01 11 21 31 05 15 25 35
+  // out[2]: 02 12 22 32 06 16 26 36
+  // out[3]: 03 13 23 33 07 17 27 37
+  // out[4]: 04 14 24 34 04 14 24 34
+  // out[5]: 05 15 25 35 05 15 25 35
+  // out[6]: 06 16 26 36 06 16 26 36
+  // out[7]: 07 17 27 37 07 17 27 37
+  out[0] = vreinterpretq_s16_s32(c0.val[0]);
+  out[1] = vreinterpretq_s16_s32(c1.val[0]);
+  out[2] = vreinterpretq_s16_s32(c0.val[1]);
+  out[3] = vreinterpretq_s16_s32(c1.val[1]);
+  out[4] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[0])));
+  out[5] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c1.val[0]), vget_high_s32(c1.val[0])));
+  out[6] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c0.val[1]), vget_high_s32(c0.val[1])));
+  out[7] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c1.val[1]), vget_high_s32(c1.val[1])));
+}
+
+//------------------------------------------------------------------------------
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* LIBGAV1_RESTRICT dst,
+                                    int32_t stride, int32_t idx,
+                                    const int16x8_t* const s) {
+  assert(store_count % 4 == 0);
+  assert(store_width == 8 || store_width == 16);
+  // NOTE: It is expected that the compiler will unroll these loops.
+  if (store_width == 16) {
+    for (int i = 0; i < store_count; i += 4) {
+      vst1q_s16(&dst[i * stride + idx], (s[i]));
+      vst1q_s16(&dst[(i + 1) * stride + idx], (s[i + 1]));
+      vst1q_s16(&dst[(i + 2) * stride + idx], (s[i + 2]));
+      vst1q_s16(&dst[(i + 3) * stride + idx], (s[i + 3]));
+    }
+  } else {
+    // store_width == 8
+    for (int i = 0; i < store_count; i += 4) {
+      vst1_s16(&dst[i * stride + idx], vget_low_s16(s[i]));
+      vst1_s16(&dst[(i + 1) * stride + idx], vget_low_s16(s[i + 1]));
+      vst1_s16(&dst[(i + 2) * stride + idx], vget_low_s16(s[i + 2]));
+      vst1_s16(&dst[(i + 3) * stride + idx], vget_low_s16(s[i + 3]));
+    }
+  }
+}
+
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* LIBGAV1_RESTRICT src,
+                                   int32_t stride, int32_t idx, int16x8_t* x) {
+  assert(load_count % 4 == 0);
+  assert(load_width == 8 || load_width == 16);
+  // NOTE: It is expected that the compiler will unroll these loops.
+  if (load_width == 16) {
+    for (int i = 0; i < load_count; i += 4) {
+      x[i] = vld1q_s16(&src[i * stride + idx]);
+      x[i + 1] = vld1q_s16(&src[(i + 1) * stride + idx]);
+      x[i + 2] = vld1q_s16(&src[(i + 2) * stride + idx]);
+      x[i + 3] = vld1q_s16(&src[(i + 3) * stride + idx]);
+    }
+  } else {
+    // load_width == 8
+    const int64x2_t zero = vdupq_n_s64(0);
+    for (int i = 0; i < load_count; i += 4) {
+      // The src buffer is aligned to 32 bytes.  Each load will always be
+      // 8-byte aligned.
+      x[i] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[i * stride + idx]), zero, 0));
+      x[i + 1] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[(i + 1) * stride + idx]), zero,
+          0));
+      x[i + 2] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[(i + 2) * stride + idx]), zero,
+          0));
+      x[i + 3] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[(i + 3) * stride + idx]), zero,
+          0));
+    }
+  }
+}
+
+// Butterfly rotate 4 values.
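+// In 1.12 fixed point with rounding this computes
+//   x = (a * cos128 - b * sin128 + 2048) >> 12
+//   y = (a * sin128 + b * cos128 + 2048) >> 12
+// with the results saturated to int16.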
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int16x8_t* a, int16x8_t* b,
+                                               const int angle,
+                                               const bool flip) {
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128);
+  const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128);
+  const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128);
+  const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128);
+  const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+  const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+  const int16x8_t x = vcombine_s16(x1, x1);
+  const int16x8_t y = vcombine_s16(y1, y1);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+// Butterfly rotate 8 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(int16x8_t* a, int16x8_t* b,
+                                               const int angle,
+                                               const bool flip) {
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128);
+  const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128);
+  const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128);
+  const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128);
+  const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+  const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+
+  const int32x4_t acc_x_hi = vmull_n_s16(vget_high_s16(*a), cos128);
+  const int32x4_t acc_y_hi = vmull_n_s16(vget_high_s16(*a), sin128);
+  const int32x4_t x0_hi = vmlsl_n_s16(acc_x_hi, vget_high_s16(*b), sin128);
+  const int32x4_t y0_hi = vmlal_n_s16(acc_y_hi, vget_high_s16(*b), cos128);
+  const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12);
+  const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12);
+
+  const int16x8_t x = vcombine_s16(x1, x1_hi);
+  const int16x8_t y = vcombine_s16(y1, y1_hi);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int16x8_t* a,
+                                                         int16x8_t* b,
+                                                         const int angle,
+                                                         const bool flip) {
+  // Clang < 14 targeting armv8.1-a+ optimizes vqrdmulhq_n_s16 and vqsubq_s16
+  // (in HadamardRotation) into vqrdmlshq_s16 resulting in an "off by one"
+  // error. This behavior was fixed in 14.0.0:
+  // https://github.com/llvm/llvm-project/commit/82973edfb72a95b442fa6d2bb404e15a4031855e
+#if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) && \
+    defined(__clang__) && __clang_major__ < 14
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const int32x4_t x0 = vmull_n_s16(vget_low_s16(*b), -sin128);
+  const int32x4_t y0 = vmull_n_s16(vget_low_s16(*b), cos128);
+  const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+  const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+
+  const int32x4_t x0_hi = vmull_n_s16(vget_high_s16(*b), -sin128);
+  const int32x4_t y0_hi = vmull_n_s16(vget_high_s16(*b), cos128);
+  const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12);
+  const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12);
+
+  const int16x8_t x = vcombine_s16(x1, x1_hi);
+  const int16x8_t y = vcombine_s16(y1, y1_hi);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+#else
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  // For this function, the max value returned by Sin128() is 4091, which fits
+  // inside 12 bits.  This leaves room for the sign bit and the 3-bit left
+  // shift.
+  assert(sin128 <= 0xfff);
+  const int16x8_t x = vqrdmulhq_n_s16(*b, -sin128 << 3);
+  const int16x8_t y = vqrdmulhq_n_s16(*b, cos128 << 3);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+#endif
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int16x8_t* a,
+                                                          int16x8_t* b,
+                                                          const int angle,
+                                                          const bool flip) {
+#if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) && \
+    defined(__clang__)  // ARM v8.1-A
+  // Clang optimizes vqrdmulhq_n_s16 and vqsubq_s16 (in HadamardRotation) into
+  // vqrdmlshq_s16 resulting in an "off by one" error. For now, do not use
+  // vqrdmulhq_n_s16().
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const int32x4_t x0 = vmull_n_s16(vget_low_s16(*a), cos128);
+  const int32x4_t y0 = vmull_n_s16(vget_low_s16(*a), sin128);
+  const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+  const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+
+  const int32x4_t x0_hi = vmull_n_s16(vget_high_s16(*a), cos128);
+  const int32x4_t y0_hi = vmull_n_s16(vget_high_s16(*a), sin128);
+  const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12);
+  const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12);
+
+  const int16x8_t x = vcombine_s16(x1, x1_hi);
+  const int16x8_t y = vcombine_s16(y1, y1_hi);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+#else
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const int16x8_t x = vqrdmulhq_n_s16(*a, cos128 << 3);
+  const int16x8_t y = vqrdmulhq_n_s16(*a, sin128 << 3);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+#endif
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int16x8_t* a, int16x8_t* b,
+                                            bool flip) {
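+  // Saturating butterfly: a' = a + b, b' = a - b; when |flip| is set the
+  // results are a' = b - a, b' = b + a.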
+  int16x8_t x, y;
+  if (flip) {
+    y = vqaddq_s16(*b, *a);
+    x = vqsubq_s16(*b, *a);
+  } else {
+    x = vqaddq_s16(*a, *b);
+    y = vqsubq_s16(*a, *b);
+  }
+  *a = x;
+  *b = y;
+}
+
+using ButterflyRotationFunc = void (*)(int16x8_t* a, int16x8_t* b, int angle,
+                                       bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+                                     bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
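+  // With only the DC coefficient present, the 1-D DCT collapses to scaling
+  // the DC value by Cos128(32) (cos(pi/4) in 1.12 fixed point) and
+  // broadcasting the result across the row.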
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16x8_t v_src = vdupq_n_s16(dst[0]);
+  const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+  const int16x8_t v_src_round =
+      vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+  const int16x8_t s0 = vbslq_s16(v_mask, v_src_round, v_src);
+  const int16_t cos128 = Cos128(32);
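+  // vqrdmulhq_n_s16 computes (2 * a * b + (1 << 15)) >> 16 with saturation,
+  // so a 1.12 constant shifted left by 3 yields the desired >> 12 scaling.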
+  const int16x8_t xy = vqrdmulhq_n_s16(s0, cos128 << 3);
+  // vqrshlq_s16 will shift right if the shift value is negative.
+  const int16x8_t xy_shifted = vqrshlq_s16(xy, vdupq_n_s16(-row_shift));
+
+  if (width == 4) {
+    vst1_s16(dst, vget_low_s16(xy_shifted));
+  } else {
+    for (int i = 0; i < width; i += 8) {
+      vst1q_s16(dst, xy_shifted);
+      dst += 8;
+    }
+  }
+  return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+                                           int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16_t cos128 = Cos128(32);
+
+  // Calculate dc values for first row.
+  if (width == 4) {
+    const int16x4_t v_src = vld1_s16(dst);
+    const int16x4_t xy = vqrdmulh_n_s16(v_src, cos128 << 3);
+    vst1_s16(dst, xy);
+  } else {
+    int i = 0;
+    do {
+      const int16x8_t v_src = vld1q_s16(&dst[i]);
+      const int16x8_t xy = vqrdmulhq_n_s16(v_src, cos128 << 3);
+      vst1q_s16(&dst[i], xy);
+      i += 8;
+    } while (i < width);
+  }
+
+  // Copy first row to the rest of the block.
+  for (int y = 1; y < height; ++y) {
+    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+  }
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int16x8_t* s) {
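+  // The stage numbers follow the full dct64 flow graph; Dct8/16/32/64 chain
+  // these helpers, so the labels stay consistent across transform sizes.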
+  // stage 12.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+  } else {
+    butterfly_rotation(&s[0], &s[1], 32, true);
+    butterfly_rotation(&s[2], &s[3], 48, false);
+  }
+
+  // stage 17.
+  HadamardRotation(&s[0], &s[3], false);
+  HadamardRotation(&s[1], &s[2], false);
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[4], x[4];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      assert(step == 4);
+      int16x8x4_t y = vld4q_s16(dst);
+      for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+    } else {
+      LoadSrc<16, 4>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      assert(step == 4);
+      int16x4x4_t y = vld4_s16(dst);
+      for (int i = 0; i < 4; ++i) x[i] = vcombine_s16(y.val[i], y.val[i]);
+    } else {
+      LoadSrc<8, 4>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1.
+  // kBitReverseLookup 0, 2, 1, 3
+  s[0] = x[0];
+  s[1] = x[2];
+  s[2] = x[1];
+  s[3] = x[3];
+
+  Dct4Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8x4_t y;
+      for (int i = 0; i < 4; ++i) y.val[i] = s[i];
+      vst4q_s16(dst, y);
+    } else {
+      StoreDst<16, 4>(dst, step, 0, s);
+    }
+  } else {
+    if (transpose) {
+      int16x4x4_t y;
+      for (int i = 0; i < 4; ++i) y.val[i] = vget_low_s16(s[i]);
+      vst4_s16(dst, y);
+    } else {
+      StoreDst<8, 4>(dst, step, 0, s);
+    }
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int16x8_t* s) {
+  // stage 8.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+  } else {
+    butterfly_rotation(&s[4], &s[7], 56, false);
+    butterfly_rotation(&s[5], &s[6], 24, false);
+  }
+
+  // stage 13.
+  HadamardRotation(&s[4], &s[5], false);
+  HadamardRotation(&s[6], &s[7], true);
+
+  // stage 18.
+  butterfly_rotation(&s[6], &s[5], 32, true);
+
+  // stage 22.
+  HadamardRotation(&s[0], &s[7], false);
+  HadamardRotation(&s[1], &s[6], false);
+  HadamardRotation(&s[2], &s[5], false);
+  HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[8], x[8];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8(input, x);
+    } else {
+      LoadSrc<8, 8>(dst, step, 0, x);
+    }
+  } else if (transpose) {
+    LoadSrc<16, 8>(dst, step, 0, x);
+    dsp::Transpose8x8(x);
+  } else {
+    LoadSrc<16, 8>(dst, step, 0, x);
+  }
+
+  // stage 1.
+  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+  s[0] = x[0];
+  s[1] = x[4];
+  s[2] = x[2];
+  s[3] = x[6];
+  s[4] = x[1];
+  s[5] = x[5];
+  s[6] = x[3];
+  s[7] = x[7];
+
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t output[4];
+      Transpose4x8To8x4(s, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+    } else {
+      StoreDst<8, 8>(dst, step, 0, s);
+    }
+  } else if (transpose) {
+    dsp::Transpose8x8(s);
+    StoreDst<16, 8>(dst, step, 0, s);
+  } else {
+    StoreDst<16, 8>(dst, step, 0, s);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int16x8_t* s) {
+  // stage 5.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+  } else {
+    butterfly_rotation(&s[8], &s[15], 60, false);
+    butterfly_rotation(&s[9], &s[14], 28, false);
+    butterfly_rotation(&s[10], &s[13], 44, false);
+    butterfly_rotation(&s[11], &s[12], 12, false);
+  }
+
+  // stage 9.
+  HadamardRotation(&s[8], &s[9], false);
+  HadamardRotation(&s[10], &s[11], true);
+  HadamardRotation(&s[12], &s[13], false);
+  HadamardRotation(&s[14], &s[15], true);
+
+  // stage 14.
+  butterfly_rotation(&s[14], &s[9], 48, true);
+  butterfly_rotation(&s[13], &s[10], 112, true);
+
+  // stage 19.
+  HadamardRotation(&s[8], &s[11], false);
+  HadamardRotation(&s[9], &s[10], false);
+  HadamardRotation(&s[12], &s[15], true);
+  HadamardRotation(&s[13], &s[14], true);
+
+  // stage 23.
+  butterfly_rotation(&s[13], &s[10], 32, true);
+  butterfly_rotation(&s[12], &s[11], 32, true);
+
+  // stage 26.
+  HadamardRotation(&s[0], &s[15], false);
+  HadamardRotation(&s[1], &s[14], false);
+  HadamardRotation(&s[2], &s[13], false);
+  HadamardRotation(&s[3], &s[12], false);
+  HadamardRotation(&s[4], &s[11], false);
+  HadamardRotation(&s[5], &s[10], false);
+  HadamardRotation(&s[6], &s[9], false);
+  HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+                                      int row_shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[16], x[16];
+
+  if (stage_is_rectangular) {
+    if (is_row) {
+      int16x8_t input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8(input, x);
+      LoadSrc<16, 4>(dst, step, 8, input);
+      Transpose8x4To4x8(input, &x[8]);
+    } else {
+      LoadSrc<8, 16>(dst, step, 0, x);
+    }
+  } else if (is_row) {
+    for (int idx = 0; idx < 16; idx += 8) {
+      LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+      dsp::Transpose8x8(&x[idx]);
+    }
+  } else {
+    LoadSrc<16, 16>(dst, step, 0, x);
+  }
+
+  // stage 1
+  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+  s[0] = x[0];
+  s[1] = x[8];
+  s[2] = x[4];
+  s[3] = x[12];
+  s[4] = x[2];
+  s[5] = x[10];
+  s[6] = x[6];
+  s[7] = x[14];
+  s[8] = x[1];
+  s[9] = x[9];
+  s[10] = x[5];
+  s[11] = x[13];
+  s[12] = x[3];
+  s[13] = x[11];
+  s[14] = x[7];
+  s[15] = x[15];
+
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+  Dct16Stages<butterfly_rotation>(s);
+
+  if (is_row) {
+    const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+    for (auto& i : s) {
+      i = vqrshlq_s16(i, v_row_shift);
+    }
+  }
+
+  if (stage_is_rectangular) {
+    if (is_row) {
+      int16x8_t output[4];
+      Transpose4x8To8x4(s, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+      Transpose4x8To8x4(&s[8], output);
+      StoreDst<16, 4>(dst, step, 8, output);
+    } else {
+      StoreDst<8, 16>(dst, step, 0, s);
+    }
+  } else if (is_row) {
+    for (int idx = 0; idx < 16; idx += 8) {
+      dsp::Transpose8x8(&s[idx]);
+      StoreDst<16, 8>(dst, step, idx, &s[idx]);
+    }
+  } else {
+    StoreDst<16, 16>(dst, step, 0, s);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int16x8_t* s) {
+  // stage 3
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+  } else {
+    butterfly_rotation(&s[16], &s[31], 62, false);
+    butterfly_rotation(&s[17], &s[30], 30, false);
+    butterfly_rotation(&s[18], &s[29], 46, false);
+    butterfly_rotation(&s[19], &s[28], 14, false);
+    butterfly_rotation(&s[20], &s[27], 54, false);
+    butterfly_rotation(&s[21], &s[26], 22, false);
+    butterfly_rotation(&s[22], &s[25], 38, false);
+    butterfly_rotation(&s[23], &s[24], 6, false);
+  }
+  // stage 6.
+  HadamardRotation(&s[16], &s[17], false);
+  HadamardRotation(&s[18], &s[19], true);
+  HadamardRotation(&s[20], &s[21], false);
+  HadamardRotation(&s[22], &s[23], true);
+  HadamardRotation(&s[24], &s[25], false);
+  HadamardRotation(&s[26], &s[27], true);
+  HadamardRotation(&s[28], &s[29], false);
+  HadamardRotation(&s[30], &s[31], true);
+
+  // stage 10.
+  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+  butterfly_rotation(&s[26], &s[21], 24, true);
+  butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+  // stage 15.
+  HadamardRotation(&s[16], &s[19], false);
+  HadamardRotation(&s[17], &s[18], false);
+  HadamardRotation(&s[20], &s[23], true);
+  HadamardRotation(&s[21], &s[22], true);
+  HadamardRotation(&s[24], &s[27], false);
+  HadamardRotation(&s[25], &s[26], false);
+  HadamardRotation(&s[28], &s[31], true);
+  HadamardRotation(&s[29], &s[30], true);
+
+  // stage 20.
+  butterfly_rotation(&s[29], &s[18], 48, true);
+  butterfly_rotation(&s[28], &s[19], 48, true);
+  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+  butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+  // stage 24.
+  HadamardRotation(&s[16], &s[23], false);
+  HadamardRotation(&s[17], &s[22], false);
+  HadamardRotation(&s[18], &s[21], false);
+  HadamardRotation(&s[19], &s[20], false);
+  HadamardRotation(&s[24], &s[31], true);
+  HadamardRotation(&s[25], &s[30], true);
+  HadamardRotation(&s[26], &s[29], true);
+  HadamardRotation(&s[27], &s[28], true);
+
+  // stage 27.
+  butterfly_rotation(&s[27], &s[20], 32, true);
+  butterfly_rotation(&s[26], &s[21], 32, true);
+  butterfly_rotation(&s[25], &s[22], 32, true);
+  butterfly_rotation(&s[24], &s[23], 32, true);
+
+  // stage 29.
+  HadamardRotation(&s[0], &s[31], false);
+  HadamardRotation(&s[1], &s[30], false);
+  HadamardRotation(&s[2], &s[29], false);
+  HadamardRotation(&s[3], &s[28], false);
+  HadamardRotation(&s[4], &s[27], false);
+  HadamardRotation(&s[5], &s[26], false);
+  HadamardRotation(&s[6], &s[25], false);
+  HadamardRotation(&s[7], &s[24], false);
+  HadamardRotation(&s[8], &s[23], false);
+  HadamardRotation(&s[9], &s[22], false);
+  HadamardRotation(&s[10], &s[21], false);
+  HadamardRotation(&s[11], &s[20], false);
+  HadamardRotation(&s[12], &s[19], false);
+  HadamardRotation(&s[13], &s[18], false);
+  HadamardRotation(&s[14], &s[17], false);
+  HadamardRotation(&s[15], &s[16], false);
+}
+
+// Process dct32 rows or columns, depending on the transpose flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+                                      const bool is_row, int row_shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[32], x[32];
+
+  if (is_row) {
+    for (int idx = 0; idx < 32; idx += 8) {
+      LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+      dsp::Transpose8x8(&x[idx]);
+    }
+  } else {
+    LoadSrc<16, 32>(dst, step, 0, x);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+  s[0] = x[0];
+  s[1] = x[16];
+  s[2] = x[8];
+  s[3] = x[24];
+  s[4] = x[4];
+  s[5] = x[20];
+  s[6] = x[12];
+  s[7] = x[28];
+  s[8] = x[2];
+  s[9] = x[18];
+  s[10] = x[10];
+  s[11] = x[26];
+  s[12] = x[6];
+  s[13] = x[22];
+  s[14] = x[14];
+  s[15] = x[30];
+
+  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+  s[16] = x[1];
+  s[17] = x[17];
+  s[18] = x[9];
+  s[19] = x[25];
+  s[20] = x[5];
+  s[21] = x[21];
+  s[22] = x[13];
+  s[23] = x[29];
+  s[24] = x[3];
+  s[25] = x[19];
+  s[26] = x[11];
+  s[27] = x[27];
+  s[28] = x[7];
+  s[29] = x[23];
+  s[30] = x[15];
+  s[31] = x[31];
+
+  Dct4Stages<ButterflyRotation_8>(s);
+  Dct8Stages<ButterflyRotation_8>(s);
+  Dct16Stages<ButterflyRotation_8>(s);
+  Dct32Stages<ButterflyRotation_8>(s);
+
+  if (is_row) {
+    const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+    for (int idx = 0; idx < 32; idx += 8) {
+      int16x8_t output[8];
+      Transpose8x8(&s[idx], output);
+      for (auto& o : output) {
+        o = vqrshlq_s16(o, v_row_shift);
+      }
+      StoreDst<16, 8>(dst, step, idx, output);
+    }
+  } else {
+    StoreDst<16, 32>(dst, step, 0, s);
+  }
+}
+
+// Allow the compiler to call this function instead of forcing it to be
+// inlined. Tests show this is slightly faster.
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[64], x[32];
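+  // Only the even-indexed s[] entries are seeded in stage 1 below; the odd
+  // entries are implicitly zero and are produced by the *IsZero butterfly
+  // stages.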
+
+  if (is_row) {
+    // The last 32 values of every row are always zero if the |tx_width| is
+    // 64.
+    for (int idx = 0; idx < 32; idx += 8) {
+      LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+      dsp::Transpose8x8(&x[idx]);
+    }
+  } else {
+    // The last 32 values of every column are always zero if the |tx_height| is
+    // 64.
+    LoadSrc<16, 32>(dst, step, 0, x);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+  s[0] = x[0];
+  s[2] = x[16];
+  s[4] = x[8];
+  s[6] = x[24];
+  s[8] = x[4];
+  s[10] = x[20];
+  s[12] = x[12];
+  s[14] = x[28];
+
+  // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+  s[16] = x[2];
+  s[18] = x[18];
+  s[20] = x[10];
+  s[22] = x[26];
+  s[24] = x[6];
+  s[26] = x[22];
+  s[28] = x[14];
+  s[30] = x[30];
+
+  // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+  s[32] = x[1];
+  s[34] = x[17];
+  s[36] = x[9];
+  s[38] = x[25];
+  s[40] = x[5];
+  s[42] = x[21];
+  s[44] = x[13];
+  s[46] = x[29];
+
+  // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+  s[48] = x[3];
+  s[50] = x[19];
+  s[52] = x[11];
+  s[54] = x[27];
+  s[56] = x[7];
+  s[58] = x[23];
+  s[60] = x[15];
+  s[62] = x[31];
+
+  Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+
+  //-- start dct 64 stages
+  // stage 2.
+  ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+  ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+  ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+  ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+  ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+  ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+  ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+  ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+  ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+  ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+  ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+  ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+  ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+  ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+  ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+  ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+  // stage 4.
+  HadamardRotation(&s[32], &s[33], false);
+  HadamardRotation(&s[34], &s[35], true);
+  HadamardRotation(&s[36], &s[37], false);
+  HadamardRotation(&s[38], &s[39], true);
+  HadamardRotation(&s[40], &s[41], false);
+  HadamardRotation(&s[42], &s[43], true);
+  HadamardRotation(&s[44], &s[45], false);
+  HadamardRotation(&s[46], &s[47], true);
+  HadamardRotation(&s[48], &s[49], false);
+  HadamardRotation(&s[50], &s[51], true);
+  HadamardRotation(&s[52], &s[53], false);
+  HadamardRotation(&s[54], &s[55], true);
+  HadamardRotation(&s[56], &s[57], false);
+  HadamardRotation(&s[58], &s[59], true);
+  HadamardRotation(&s[60], &s[61], false);
+  HadamardRotation(&s[62], &s[63], true);
+
+  // stage 7.
+  ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
+  ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
+  ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
+  ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
+  ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
+  ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
+  ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
+  ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);
+
+  // stage 11.
+  HadamardRotation(&s[32], &s[35], false);
+  HadamardRotation(&s[33], &s[34], false);
+  HadamardRotation(&s[36], &s[39], true);
+  HadamardRotation(&s[37], &s[38], true);
+  HadamardRotation(&s[40], &s[43], false);
+  HadamardRotation(&s[41], &s[42], false);
+  HadamardRotation(&s[44], &s[47], true);
+  HadamardRotation(&s[45], &s[46], true);
+  HadamardRotation(&s[48], &s[51], false);
+  HadamardRotation(&s[49], &s[50], false);
+  HadamardRotation(&s[52], &s[55], true);
+  HadamardRotation(&s[53], &s[54], true);
+  HadamardRotation(&s[56], &s[59], false);
+  HadamardRotation(&s[57], &s[58], false);
+  HadamardRotation(&s[60], &s[63], true);
+  HadamardRotation(&s[61], &s[62], true);
+
+  // stage 16.
+  ButterflyRotation_8(&s[61], &s[34], 56, true);
+  ButterflyRotation_8(&s[60], &s[35], 56, true);
+  ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
+  ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
+  ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
+  ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
+  ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
+  ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);
+
+  // stage 21.
+  HadamardRotation(&s[32], &s[39], false);
+  HadamardRotation(&s[33], &s[38], false);
+  HadamardRotation(&s[34], &s[37], false);
+  HadamardRotation(&s[35], &s[36], false);
+  HadamardRotation(&s[40], &s[47], true);
+  HadamardRotation(&s[41], &s[46], true);
+  HadamardRotation(&s[42], &s[45], true);
+  HadamardRotation(&s[43], &s[44], true);
+  HadamardRotation(&s[48], &s[55], false);
+  HadamardRotation(&s[49], &s[54], false);
+  HadamardRotation(&s[50], &s[53], false);
+  HadamardRotation(&s[51], &s[52], false);
+  HadamardRotation(&s[56], &s[63], true);
+  HadamardRotation(&s[57], &s[62], true);
+  HadamardRotation(&s[58], &s[61], true);
+  HadamardRotation(&s[59], &s[60], true);
+
+  // stage 25.
+  ButterflyRotation_8(&s[59], &s[36], 48, true);
+  ButterflyRotation_8(&s[58], &s[37], 48, true);
+  ButterflyRotation_8(&s[57], &s[38], 48, true);
+  ButterflyRotation_8(&s[56], &s[39], 48, true);
+  ButterflyRotation_8(&s[55], &s[40], 112, true);
+  ButterflyRotation_8(&s[54], &s[41], 112, true);
+  ButterflyRotation_8(&s[53], &s[42], 112, true);
+  ButterflyRotation_8(&s[52], &s[43], 112, true);
+
+  // stage 28.
+  HadamardRotation(&s[32], &s[47], false);
+  HadamardRotation(&s[33], &s[46], false);
+  HadamardRotation(&s[34], &s[45], false);
+  HadamardRotation(&s[35], &s[44], false);
+  HadamardRotation(&s[36], &s[43], false);
+  HadamardRotation(&s[37], &s[42], false);
+  HadamardRotation(&s[38], &s[41], false);
+  HadamardRotation(&s[39], &s[40], false);
+  HadamardRotation(&s[48], &s[63], true);
+  HadamardRotation(&s[49], &s[62], true);
+  HadamardRotation(&s[50], &s[61], true);
+  HadamardRotation(&s[51], &s[60], true);
+  HadamardRotation(&s[52], &s[59], true);
+  HadamardRotation(&s[53], &s[58], true);
+  HadamardRotation(&s[54], &s[57], true);
+  HadamardRotation(&s[55], &s[56], true);
+
+  // stage 30.
+  ButterflyRotation_8(&s[55], &s[40], 32, true);
+  ButterflyRotation_8(&s[54], &s[41], 32, true);
+  ButterflyRotation_8(&s[53], &s[42], 32, true);
+  ButterflyRotation_8(&s[52], &s[43], 32, true);
+  ButterflyRotation_8(&s[51], &s[44], 32, true);
+  ButterflyRotation_8(&s[50], &s[45], 32, true);
+  ButterflyRotation_8(&s[49], &s[46], 32, true);
+  ButterflyRotation_8(&s[48], &s[47], 32, true);
+
+  // stage 31.
+  for (int i = 0; i < 32; i += 4) {
+    HadamardRotation(&s[i], &s[63 - i], false);
+    HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
+    HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
+    HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
+  }
+  //-- end dct 64 stages
+
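+  // Row passes transpose each 8x8 tile back to row order and then apply the
+  // rounded row shift: vqrshlq_s16 with a negated shift performs
+  // (x + (1 << (row_shift - 1))) >> row_shift with saturation. Scalar sketch
+  // (illustration only, not part of the library):
+  //   int RoundShift(int x, int s) { return (x + (1 << (s - 1))) >> s; }
+  //   RoundShift(41, 2) == (41 + 2) >> 2 == 10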
+  if (is_row) {
+    const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+    for (int idx = 0; idx < 64; idx += 8) {
+      int16x8_t output[8];
+      Transpose8x8(&s[idx], output);
+      for (auto& o : output) {
+        o = vqrshlq_s16(o, v_row_shift);
+      }
+      StoreDst<16, 8>(dst, step, idx, output);
+    }
+  } else {
+    StoreDst<16, 64>(dst, step, 0, s);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
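+// Adst4 implements the AV1 4-point ADST directly with the kAdst4Multiplier
+// constants (defined elsewhere in this file; the first three are echoed by
+// kAdst4DcOnlyMultiplier below as 1321, 2482 and 3344). They appear to be
+// the 12-bit sine basis constants round(4096 * (2/3) * sqrt(2) *
+// sin(k * pi / 9)) for k = 1..4. Products are widened to 32 bits and rounded
+// back with vqrshrn_n_s32(x, 12), i.e. (x + 2048) >> 12, saturated.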
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
+                                      bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int32x4_t s[7];
+  int16x4_t x[4];
+
+  if (transpose) {
+    assert(step == 4);
+    int16x4x4_t y = vld4_s16(dst);
+    for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+  } else {
+    x[0] = vld1_s16(dst);
+    x[1] = vld1_s16(dst + 1 * step);
+    x[2] = vld1_s16(dst + 2 * step);
+    x[3] = vld1_s16(dst + 3 * step);
+  }
+
+  // stage 1.
+  s[5] = vmull_n_s16(x[3], kAdst4Multiplier[1]);
+  s[6] = vmull_n_s16(x[3], kAdst4Multiplier[3]);
+
+  // stage 2.
+  const int32x4_t a7 = vsubl_s16(x[0], x[2]);
+  const int32x4_t b7 = vaddw_s16(a7, x[3]);
+
+  // stage 3.
+  s[0] = vmull_n_s16(x[0], kAdst4Multiplier[0]);
+  s[1] = vmull_n_s16(x[0], kAdst4Multiplier[1]);
+  // s[0] = s[0] + s[3]
+  s[0] = vmlal_n_s16(s[0], x[2], kAdst4Multiplier[3]);
+  // s[1] = s[1] - s[4]
+  s[1] = vmlsl_n_s16(s[1], x[2], kAdst4Multiplier[0]);
+
+  s[3] = vmull_n_s16(x[1], kAdst4Multiplier[2]);
+  s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+  // stage 4.
+  s[0] = vaddq_s32(s[0], s[5]);
+  s[1] = vsubq_s32(s[1], s[6]);
+
+  // stages 5 and 6.
+  const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+  const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+  const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+  const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+  const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+  const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+  const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12);
+  const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+  x[0] = dst_0;
+  x[1] = dst_1;
+  x[2] = dst_2;
+  x[3] = dst_3;
+
+  if (transpose) {
+    int16x4x4_t y;
+    for (int i = 0; i < 4; ++i) y.val[i] = x[i];
+    vst4_s16(dst, y);
+  } else {
+    vst1_s16(dst, x[0]);
+    vst1_s16(dst + 1 * step, x[1]);
+    vst1_s16(dst + 2 * step, x[2]);
+    vst1_s16(dst + 3 * step, x[3]);
+  }
+}
+
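+// DC-only ADST4: with a single nonzero input s0, the four outputs reduce to
+// lane arithmetic on one multiply. The table holds {k0, k1, k2, k1} so one
+// vmull_s16 produces {s0*k0, s0*k1, s0*k2, s0*k1}, and the vextq_s32 step in
+// Adst4DcOnly adds s0*k0 into the last lane to form s0*(k0 + k1).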
+alignas(8) constexpr int16_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+                                                          2482};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+                                       bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int32x4_t s[2];
+
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+  const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+  const int16x4_t v_src_round =
+      vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+  const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+  const int16x4_t kAdst4DcOnlyMultipliers = vld1_s16(kAdst4DcOnlyMultiplier);
+  s[1] = vdupq_n_s32(0);
+
+  // s0*k0 s0*k1 s0*k2 s0*k1
+  s[0] = vmull_s16(kAdst4DcOnlyMultipliers, v_src);
+  // 0     0     0     s0*k0
+  s[1] = vextq_s32(s[1], s[0], 1);
+
+  const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+  const int16x4_t dst_0 = vqrshrn_n_s32(x3, 12);
+
+  // vqrshl_s16 will shift right if shift value is negative.
+  vst1_s16(dst, vqrshl_s16(dst_0, vdup_n_s16(-row_shift)));
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+                                             int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int32x4_t s[4];
+
+  int i = 0;
+  do {
+    const int16x4_t v_src = vld1_s16(&dst[i]);
+
+    s[0] = vmull_n_s16(v_src, kAdst4Multiplier[0]);
+    s[1] = vmull_n_s16(v_src, kAdst4Multiplier[1]);
+    s[2] = vmull_n_s16(v_src, kAdst4Multiplier[2]);
+
+    const int32x4_t x0 = s[0];
+    const int32x4_t x1 = s[1];
+    const int32x4_t x2 = s[2];
+    const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+    const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+    const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+    const int16x4_t dst_2 = vqrshrn_n_s32(x2, 12);
+    const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+    vst1_s16(&dst[i], dst_0);
+    vst1_s16(&dst[i + width * 1], dst_1);
+    vst1_s16(&dst[i + width * 2], dst_2);
+    vst1_s16(&dst[i + width * 3], dst_3);
+
+    i += 4;
+  } while (i < width);
+
+  return true;
+}
+
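+// Reading aid (not normative): the butterfly_rotation helpers apply a Q12
+// Givens rotation to the pair (a, b) -- up to the lane swap selected by the
+// trailing bool -- with coefficients cos128(angle) and sin128(angle)
+// sampling angle * pi / 128. Angle 32 is thus the 45-degree case, where
+// both coefficients are ~0.7071 * 4096 = 2896.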
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step,
+                                      bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[8], x[8];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8(input, x);
+    } else {
+      LoadSrc<8, 8>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      LoadSrc<16, 8>(dst, step, 0, x);
+      dsp::Transpose8x8(x);
+    } else {
+      LoadSrc<16, 8>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1.
+  s[0] = x[7];
+  s[1] = x[0];
+  s[2] = x[5];
+  s[3] = x[2];
+  s[4] = x[3];
+  s[5] = x[4];
+  s[6] = x[1];
+  s[7] = x[6];
+
+  // stage 2.
+  butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+  butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+  butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+  // stage 3.
+  HadamardRotation(&s[0], &s[4], false);
+  HadamardRotation(&s[1], &s[5], false);
+  HadamardRotation(&s[2], &s[6], false);
+  HadamardRotation(&s[3], &s[7], false);
+
+  // stage 4.
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+  // stage 5.
+  HadamardRotation(&s[0], &s[2], false);
+  HadamardRotation(&s[4], &s[6], false);
+  HadamardRotation(&s[1], &s[3], false);
+  HadamardRotation(&s[5], &s[7], false);
+
+  // stage 6.
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+
+  // stage 7.
+  x[0] = s[0];
+  x[1] = vqnegq_s16(s[4]);
+  x[2] = s[6];
+  x[3] = vqnegq_s16(s[2]);
+  x[4] = s[3];
+  x[5] = vqnegq_s16(s[7]);
+  x[6] = s[5];
+  x[7] = vqnegq_s16(s[1]);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t output[4];
+      Transpose4x8To8x4(x, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+    } else {
+      StoreDst<8, 8>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      dsp::Transpose8x8(x);
+      StoreDst<16, 8>(dst, step, 0, x);
+    } else {
+      StoreDst<16, 8>(dst, step, 0, x);
+    }
+  }
+}
+
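+// DcOnly helpers: when adjusted_tx_height == 1 only the DC coefficient is
+// nonzero, so the full stage network collapses to the dependency chain of
+// s[1] (stage 1 maps x[0] to s[1]). ButterflyRotation_FirstIsZero exploits
+// the known-zero partner lane, and the remaining Hadamard stages degenerate
+// to plain copies.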
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+                                       bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int16x8_t s[8];
+
+  const int16x8_t v_src = vdupq_n_s16(dst[0]);
+  const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+  const int16x8_t v_src_round =
+      vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+  // stage 1.
+  s[1] = vbslq_s16(v_mask, v_src_round, v_src);
+
+  // stage 2.
+  ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+  // stage 3.
+  s[4] = s[0];
+  s[5] = s[1];
+
+  // stage 4.
+  ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+  // stage 5.
+  s[2] = s[0];
+  s[3] = s[1];
+  s[6] = s[4];
+  s[7] = s[5];
+
+  // stage 6.
+  ButterflyRotation_4(&s[2], &s[3], 32, true);
+  ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+  // stage 7.
+  int16x8_t x[8];
+  x[0] = s[0];
+  x[1] = vqnegq_s16(s[4]);
+  x[2] = s[6];
+  x[3] = vqnegq_s16(s[2]);
+  x[4] = s[3];
+  x[5] = vqnegq_s16(s[7]);
+  x[6] = s[5];
+  x[7] = vqnegq_s16(s[1]);
+
+  for (int i = 0; i < 8; ++i) {
+    // vqrshlq_s16 will shift right if shift value is negative.
+    x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift));
+    vst1q_lane_s16(&dst[i], x[i], 0);
+  }
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+                                             int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int16x8_t s[8];
+
+  int i = 0;
+  do {
+    const int16x8_t v_src = vld1q_s16(dst);
+    // stage 1.
+    s[1] = v_src;
+
+    // stage 2.
+    ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+    // stage 3.
+    s[4] = s[0];
+    s[5] = s[1];
+
+    // stage 4.
+    ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+    // stage 5.
+    s[2] = s[0];
+    s[3] = s[1];
+    s[6] = s[4];
+    s[7] = s[5];
+
+    // stage 6.
+    ButterflyRotation_4(&s[2], &s[3], 32, true);
+    ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+    // stage 7.
+    int16x8_t x[8];
+    x[0] = s[0];
+    x[1] = vqnegq_s16(s[4]);
+    x[2] = s[6];
+    x[3] = vqnegq_s16(s[2]);
+    x[4] = s[3];
+    x[5] = vqnegq_s16(s[7]);
+    x[6] = s[5];
+    x[7] = vqnegq_s16(s[1]);
+
+    for (int j = 0; j < 8; ++j) {
+      vst1_s16(&dst[j * width], vget_low_s16(x[j]));
+    }
+    i += 4;
+    dst += 4;
+  } while (i < width);
+
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+                                       int row_shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[16], x[16];
+
+  if (stage_is_rectangular) {
+    if (is_row) {
+      int16x8_t input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8(input, x);
+      LoadSrc<16, 4>(dst, step, 8, input);
+      Transpose8x4To4x8(input, &x[8]);
+    } else {
+      LoadSrc<8, 16>(dst, step, 0, x);
+    }
+  } else {
+    if (is_row) {
+      for (int idx = 0; idx < 16; idx += 8) {
+        LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+        dsp::Transpose8x8(&x[idx]);
+      }
+    } else {
+      LoadSrc<16, 16>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1.
+  s[0] = x[15];
+  s[1] = x[0];
+  s[2] = x[13];
+  s[3] = x[2];
+  s[4] = x[11];
+  s[5] = x[4];
+  s[6] = x[9];
+  s[7] = x[6];
+  s[8] = x[7];
+  s[9] = x[8];
+  s[10] = x[5];
+  s[11] = x[10];
+  s[12] = x[3];
+  s[13] = x[12];
+  s[14] = x[1];
+  s[15] = x[14];
+
+  // stage 2.
+  butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+  butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+  butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+  butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+  butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+  butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+  butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+  // stage 3.
+  HadamardRotation(&s[0], &s[8], false);
+  HadamardRotation(&s[1], &s[9], false);
+  HadamardRotation(&s[2], &s[10], false);
+  HadamardRotation(&s[3], &s[11], false);
+  HadamardRotation(&s[4], &s[12], false);
+  HadamardRotation(&s[5], &s[13], false);
+  HadamardRotation(&s[6], &s[14], false);
+  HadamardRotation(&s[7], &s[15], false);
+
+  // stage 4.
+  butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+  butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+  butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+  // stage 5.
+  HadamardRotation(&s[0], &s[4], false);
+  HadamardRotation(&s[8], &s[12], false);
+  HadamardRotation(&s[1], &s[5], false);
+  HadamardRotation(&s[9], &s[13], false);
+  HadamardRotation(&s[2], &s[6], false);
+  HadamardRotation(&s[10], &s[14], false);
+  HadamardRotation(&s[3], &s[7], false);
+  HadamardRotation(&s[11], &s[15], false);
+
+  // stage 6.
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+  // stage 7.
+  HadamardRotation(&s[0], &s[2], false);
+  HadamardRotation(&s[4], &s[6], false);
+  HadamardRotation(&s[8], &s[10], false);
+  HadamardRotation(&s[12], &s[14], false);
+  HadamardRotation(&s[1], &s[3], false);
+  HadamardRotation(&s[5], &s[7], false);
+  HadamardRotation(&s[9], &s[11], false);
+  HadamardRotation(&s[13], &s[15], false);
+
+  // stage 8.
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+  butterfly_rotation(&s[10], &s[11], 32, true);
+  butterfly_rotation(&s[14], &s[15], 32, true);
+
+  // stage 9.
+  x[0] = s[0];
+  x[1] = vqnegq_s16(s[8]);
+  x[2] = s[12];
+  x[3] = vqnegq_s16(s[4]);
+  x[4] = s[6];
+  x[5] = vqnegq_s16(s[14]);
+  x[6] = s[10];
+  x[7] = vqnegq_s16(s[2]);
+  x[8] = s[3];
+  x[9] = vqnegq_s16(s[11]);
+  x[10] = s[15];
+  x[11] = vqnegq_s16(s[7]);
+  x[12] = s[5];
+  x[13] = vqnegq_s16(s[13]);
+  x[14] = s[9];
+  x[15] = vqnegq_s16(s[1]);
+
+  if (stage_is_rectangular) {
+    if (is_row) {
+      const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+      int16x8_t output[4];
+      Transpose4x8To8x4(x, output);
+      for (auto& o : output) {
+        o = vqrshlq_s16(o, v_row_shift);
+      }
+      StoreDst<16, 4>(dst, step, 0, output);
+      Transpose4x8To8x4(&x[8], output);
+      for (auto& o : output) {
+        o = vqrshlq_s16(o, v_row_shift);
+      }
+      StoreDst<16, 4>(dst, step, 8, output);
+    } else {
+      StoreDst<8, 16>(dst, step, 0, x);
+    }
+  } else {
+    if (is_row) {
+      const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+      for (int idx = 0; idx < 16; idx += 8) {
+        int16x8_t output[8];
+        Transpose8x8(&x[idx], output);
+        for (auto& o : output) {
+          o = vqrshlq_s16(o, v_row_shift);
+        }
+        StoreDst<16, 8>(dst, step, idx, output);
+      }
+    } else {
+      StoreDst<16, 16>(dst, step, 0, x);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int16x8_t* s, int16x8_t* x) {
+  // stage 2.
+  ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+  // stage 3.
+  s[8] = s[0];
+  s[9] = s[1];
+
+  // stage 4.
+  ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+  // stage 5.
+  s[4] = s[0];
+  s[12] = s[8];
+  s[5] = s[1];
+  s[13] = s[9];
+
+  // stage 6.
+  ButterflyRotation_4(&s[4], &s[5], 48, true);
+  ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+  // stage 7.
+  s[2] = s[0];
+  s[6] = s[4];
+  s[10] = s[8];
+  s[14] = s[12];
+  s[3] = s[1];
+  s[7] = s[5];
+  s[11] = s[9];
+  s[15] = s[13];
+
+  // stage 8.
+  ButterflyRotation_4(&s[2], &s[3], 32, true);
+  ButterflyRotation_4(&s[6], &s[7], 32, true);
+  ButterflyRotation_4(&s[10], &s[11], 32, true);
+  ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+  // stage 9.
+  x[0] = s[0];
+  x[1] = vqnegq_s16(s[8]);
+  x[2] = s[12];
+  x[3] = vqnegq_s16(s[4]);
+  x[4] = s[6];
+  x[5] = vqnegq_s16(s[14]);
+  x[6] = s[10];
+  x[7] = vqnegq_s16(s[2]);
+  x[8] = s[3];
+  x[9] = vqnegq_s16(s[11]);
+  x[10] = s[15];
+  x[11] = vqnegq_s16(s[7]);
+  x[12] = s[5];
+  x[13] = vqnegq_s16(s[13]);
+  x[14] = s[9];
+  x[15] = vqnegq_s16(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+                                        bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int16x8_t s[16];
+  int16x8_t x[16];
+
+  const int16x8_t v_src = vdupq_n_s16(dst[0]);
+  const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+  const int16x8_t v_src_round =
+      vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+  // stage 1.
+  s[1] = vbslq_s16(v_mask, v_src_round, v_src);
+
+  Adst16DcOnlyInternal(s, x);
+
+  for (int i = 0; i < 16; ++i) {
+    // vqrshlq_s16 will shift right if shift value is negative.
+    x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift));
+    vst1q_lane_s16(&dst[i], x[i], 0);
+  }
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+                                              int adjusted_tx_height,
+                                              int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int i = 0;
+  do {
+    int16x8_t s[16];
+    int16x8_t x[16];
+    const int16x8_t v_src = vld1q_s16(dst);
+    // stage 1.
+    s[1] = v_src;
+
+    Adst16DcOnlyInternal(s, x);
+
+    for (int j = 0; j < 16; ++j) {
+      vst1_s16(&dst[j * width], vget_low_s16(x[j]));
+    }
+    i += 4;
+    dst += 4;
+  } while (i < width);
+
+  return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
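+// The identity "transforms" only rescale: identity4 scales by sqrt(2),
+// identity8 by 2, identity16 by 2 * sqrt(2) and identity32 by 4 (per the
+// AV1 identity transform definition). The sqrt(2) factors are realized as
+// x + x * (sqrt(2) - 1), with the fractional part supplied by vqrdmulh
+// against the kIdentity*MultiplierFraction constants, keeping all
+// intermediates within 16 bits.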
+template <bool is_row_shift>
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
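+  // Fixed-point folding note: vmlal_s16 below forms
+  //   v_dual_round + x * kIdentity4Multiplier
+  // in Q12. v_dual_round = (1 + 2 * shift) << 11 = 2^11 + shift * 2^12
+  // merges the Q12 rounding term (2^11) with the row-shift rounding term
+  // (2^(shift - 1), pre-scaled by 2^12) -- exact for the shifts 0 and 1
+  // used in this file -- so a single vqshlq_s32 by -(12 + shift) completes
+  // both shifts.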
+  if (is_row_shift) {
+    const int shift = 1;
+    const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+    const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
+    const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+    for (int i = 0; i < 4; i += 2) {
+      const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+      const int32x4_t v_src_mult_lo =
+          vmlal_s16(v_dual_round, vget_low_s16(v_src), v_multiplier);
+      const int32x4_t v_src_mult_hi =
+          vmlal_s16(v_dual_round, vget_high_s16(v_src), v_multiplier);
+      const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+      const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+      vst1q_s16(&dst[i * step],
+                vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi)));
+    }
+  } else {
+    for (int i = 0; i < 4; i += 2) {
+      const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+      const int16x8_t a =
+          vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+      const int16x8_t b = vqaddq_s16(v_src, a);
+      vst1q_s16(&dst[i * step], b);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+                                           bool should_round, int tx_height) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+  const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+  const int16x4_t v_src_round =
+      vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+  const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+  const int shift = tx_height < 16 ? 0 : 1;
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
+  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+  const int32x4_t v_src_mult_lo = vmlal_s16(v_dual_round, v_src, v_multiplier);
+  const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+  vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+  return true;
+}
+
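+// Write-back helper for identity columns. The blend pattern below (shared
+// with StoreToFrameWithRound) widens the uint8 frame pixels via vaddw_u8
+// against the reinterpreted int16 residual -- two's-complement addition has
+// the same bit pattern either way -- then vqmovun_s16 clamps the signed sum
+// back to [0, 255]. Sizes below 32 scale the residual as in the row pass and
+// round by 4 bits; identity32 folds its x4 scaling into the 4-bit write-back
+// shift, leaving a single vrshr by 2.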
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source) {
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+  if (identity_size < 32) {
+    if (tx_width == 4) {
+      uint8x8_t frame_data = vdup_n_u8(0);
+      int i = 0;
+      do {
+        const int16x4_t v_src = vld1_s16(&source[i * tx_width]);
+
+        int16x4_t v_dst_i;
+        if (identity_size == 4) {
+          const int16x4_t v_src_fraction =
+              vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+          v_dst_i = vqadd_s16(v_src, v_src_fraction);
+        } else if (identity_size == 8) {
+          v_dst_i = vqadd_s16(v_src, v_src);
+        } else {  // identity_size == 16
+          const int16x4_t v_src_mult =
+              vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 4);
+          const int16x4_t v_srcx2 = vqadd_s16(v_src, v_src);
+          v_dst_i = vqadd_s16(v_srcx2, v_src_mult);
+        }
+
+        frame_data = Load4<0>(dst, frame_data);
+        const int16x4_t a = vrshr_n_s16(v_dst_i, 4);
+        const uint16x8_t b =
+            vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+        const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+        StoreLo4(dst, d);
+        dst += stride;
+      } while (++i < tx_height);
+    } else {
+      int i = 0;
+      do {
+        const int row = i * tx_width;
+        int j = 0;
+        do {
+          const int16x8_t v_src = vld1q_s16(&source[row + j]);
+
+          int16x8_t v_dst_i;
+          if (identity_size == 4) {
+            const int16x8_t v_src_fraction =
+                vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+            v_dst_i = vqaddq_s16(v_src, v_src_fraction);
+          } else if (identity_size == 8) {
+            v_dst_i = vqaddq_s16(v_src, v_src);
+          } else {  // identity_size == 16
+            const int16x8_t v_src_mult =
+                vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 4);
+            const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src);
+            v_dst_i = vqaddq_s16(v_src_mult, v_srcx2);
+          }
+
+          const uint8x8_t frame_data = vld1_u8(dst + j);
+          const int16x8_t a = vrshrq_n_s16(v_dst_i, 4);
+          const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+          const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+          vst1_u8(dst + j, d);
+          j += 8;
+        } while (j < tx_width);
+        dst += stride;
+      } while (++i < tx_height);
+    }
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const int16x8_t v_dst_i = vld1q_s16(&source[row + j]);
+        const uint8x8_t frame_data = vld1_u8(dst + j);
+        const int16x8_t a = vrshrq_n_s16(v_dst_i, 2);
+        const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+        const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+        vst1_u8(dst + j, d);
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source) {
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+  if (tx_width == 4) {
+    uint8x8_t frame_data = vdup_n_u8(0);
+    int i = 0;
+    do {
+      const int16x4_t v_src = vld1_s16(&source[i * tx_width]);
+      const int16x4_t v_src_mult =
+          vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+      const int16x4_t v_dst_row = vqadd_s16(v_src, v_src_mult);
+      const int16x4_t v_src_mult2 =
+          vqrdmulh_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3);
+      const int16x4_t v_dst_col = vqadd_s16(v_dst_row, v_src_mult2);
+      frame_data = Load4<0>(dst, frame_data);
+      const int16x4_t a = vrshr_n_s16(v_dst_col, 4);
+      const uint16x8_t b =
+          vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+      const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+      StoreLo4(dst, d);
+      dst += stride;
+    } while (++i < tx_height);
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const int16x8_t v_src = vld1q_s16(&source[row + j]);
+        const int16x8_t v_src_round =
+            vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+        const int16x8_t v_dst_row = vqaddq_s16(v_src_round, v_src_round);
+        const int16x8_t v_src_mult2 =
+            vqrdmulhq_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3);
+        const int16x8_t v_dst_col = vqaddq_s16(v_dst_row, v_src_mult2);
+        const uint8x8_t frame_data = vld1_u8(dst + j);
+        const int16x8_t a = vrshrq_n_s16(v_dst_col, 4);
+        const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+        const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+        vst1_u8(dst + j, d);
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height equal to 32 can be simplified from
+  // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
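+  // For example, A = 7: ((7 * 2) + 2) >> 2 == (7 + 1) >> 1 == 4.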
+  for (int i = 0; i < 4; ++i) {
+    const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+    const int16x8_t a = vrshrq_n_s16(v_src, 1);
+    vst1q_s16(&dst[i * step], a);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  for (int i = 0; i < 4; ++i) {
+    const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+    // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+    // saturating add here is ok.
+    const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src);
+    vst1q_s16(&dst[i * step], v_srcx2);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+                                           bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+  const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+  const int16x4_t v_src_round =
+      vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+  const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+  const int32x4_t v_srcx2 = vaddl_s16(v_src, v_src);
+  const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+  vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+                                              int shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      const int16x8_t v_src = vld1q_s16(&dst[i * step + j * 8]);
+      const int32x4_t v_src_mult_lo =
+          vmlal_n_s16(v_dual_round, vget_low_s16(v_src), kIdentity16Multiplier);
+      const int32x4_t v_src_mult_hi = vmlal_n_s16(
+          v_dual_round, vget_high_s16(v_src), kIdentity16Multiplier);
+      const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+      const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+      vst1q_s16(&dst[i * step + j * 8],
+                vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi)));
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+                                            bool should_round, int shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+  const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+  const int16x4_t v_src_round =
+      vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+  const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int16x4_t v_multiplier = vdup_n_s16(kIdentity16Multiplier);
+  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+  const int32x4_t v_src_mult_lo = vmlal_s16(v_dual_round, v_src, v_multiplier);
+  const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+  vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest,
+                                                const int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  // When combining the identity32 multiplier with the row shift, the
+  // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
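+  // For example, A = 7: ((7 * 4) + 1) >> 1 == 7 * 2 == 14.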
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 32; j += 8) {
+      const int16x8_t v_src = vld1q_s16(&dst[i * step + j]);
+      // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+      // saturating add here is ok.
+      const int16x8_t v_dst_i = vqaddq_s16(v_src, v_src);
+      vst1q_s16(&dst[i * step + j], v_dst_i);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+                                            int adjusted_tx_height) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+  const int16x4_t v_src = vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+  // When combining the identity32 multiplier with the row shift, the
+  // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
+  const int16x4_t v_dst_0 = vqadd_s16(v_src, v_src);
+  vst1_lane_s16(dst, v_dst_0, 0);
+  return true;
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Transposes a 4x4 matrix and then permutes the rows of the transposed matrix
+// for the WHT. The input matrix is in two "wide" int16x8_t variables. The
+// output matrix is in four int16x4_t variables.
+//
+// Input:
+// in[0]: 00 01 02 03  10 11 12 13
+// in[1]: 20 21 22 23  30 31 32 33
+// Output:
+// out[0]: 00 10 20 30
+// out[1]: 03 13 23 33
+// out[2]: 01 11 21 31
+// out[3]: 02 12 22 32
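+//
+// The permuted column order (0, 3, 1, 2) mirrors the operand order that
+// Wht4_NEON's initial vld4_s16 load assigns to s[0], s[2], s[3], s[1], so
+// the same butterfly sequence serves both the row and column passes.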
+LIBGAV1_ALWAYS_INLINE void TransposeAndPermute4x4WideInput(
+    const int16x8_t in[2], int16x4_t out[4]) {
+  // Swap 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03  10 11 12 13
+  // in[1]: 20 21 22 23  30 31 32 33
+  // to:
+  // b0.val[0]: 00 01 20 21  10 11 30 31
+  // b0.val[1]: 02 03 22 23  12 13 32 33
+
+  const int32x4x2_t b0 =
+      vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1]));
+
+  // Swap 16 bit elements. Goes from:
+  // vget_low_s32(b0.val[0]):  00 01 20 21
+  // vget_high_s32(b0.val[0]): 10 11 30 31
+  // vget_low_s32(b0.val[1]):  02 03 22 23
+  // vget_high_s32(b0.val[1]): 12 13 32 33
+  // to:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 01 11 21 31
+  // c1.val[0]: 02 12 22 32
+  // c1.val[1]: 03 13 23 33
+
+  const int16x4x2_t c0 =
+      vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])),
+               vreinterpret_s16_s32(vget_high_s32(b0.val[0])));
+  const int16x4x2_t c1 =
+      vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])),
+               vreinterpret_s16_s32(vget_high_s32(b0.val[1])));
+
+  out[0] = c0.val[0];
+  out[1] = c1.val[1];
+  out[2] = c0.val[1];
+  out[3] = c1.val[0];
+}
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint8_t* LIBGAV1_RESTRICT dst,
+                                     const int dst_stride,
+                                     const void* LIBGAV1_RESTRICT source,
+                                     const int adjusted_tx_height) {
+  const auto* const src = static_cast<const int16_t*>(source);
+  int16x4_t s[4];
+
+  if (adjusted_tx_height == 1) {
+    // Special case: only src[0] is nonzero.
+    //   src[0]  0   0   0
+    //       0   0   0   0
+    //       0   0   0   0
+    //       0   0   0   0
+    //
+    // After the row and column transforms are applied, we have:
+    //       f   h   h   h
+    //       g   i   i   i
+    //       g   i   i   i
+    //       g   i   i   i
+    // where f, g, h, i are computed as follows.
+    int16_t f = (src[0] >> 2) - (src[0] >> 3);
+    const int16_t g = f >> 1;
+    f = f - (f >> 1);
+    const int16_t h = (src[0] >> 3) - (src[0] >> 4);
+    const int16_t i = (src[0] >> 4);
+    s[0] = vdup_n_s16(h);
+    s[0] = vset_lane_s16(f, s[0], 0);
+    s[1] = vdup_n_s16(i);
+    s[1] = vset_lane_s16(g, s[1], 0);
+    s[2] = s[3] = s[1];
+  } else {
+    // Load the 4x4 source in transposed form.
+    int16x4x4_t columns = vld4_s16(src);
+    // Shift right and permute the columns for the WHT.
+    s[0] = vshr_n_s16(columns.val[0], 2);
+    s[2] = vshr_n_s16(columns.val[1], 2);
+    s[3] = vshr_n_s16(columns.val[2], 2);
+    s[1] = vshr_n_s16(columns.val[3], 2);
+
+    // Row transforms.
+    s[0] = vadd_s16(s[0], s[2]);
+    s[3] = vsub_s16(s[3], s[1]);
+    int16x4_t e = vhsub_s16(s[0], s[3]);  // e = (s[0] - s[3]) >> 1
+    s[1] = vsub_s16(e, s[1]);
+    s[2] = vsub_s16(e, s[2]);
+    s[0] = vsub_s16(s[0], s[1]);
+    s[3] = vadd_s16(s[3], s[2]);
+
+    int16x8_t x[2];
+    x[0] = vcombine_s16(s[0], s[1]);
+    x[1] = vcombine_s16(s[2], s[3]);
+    TransposeAndPermute4x4WideInput(x, s);
+
+    // Column transforms.
+    s[0] = vadd_s16(s[0], s[2]);
+    s[3] = vsub_s16(s[3], s[1]);
+    e = vhsub_s16(s[0], s[3]);  // e = (s[0] - s[3]) >> 1
+    s[1] = vsub_s16(e, s[1]);
+    s[2] = vsub_s16(e, s[2]);
+    s[0] = vsub_s16(s[0], s[1]);
+    s[3] = vadd_s16(s[3], s[2]);
+  }
+
+  // Store to frame.
+  uint8x8_t frame_data = vdup_n_u8(0);
+  for (int row = 0; row < 4; row += 2) {
+    frame_data = Load4<0>(dst, frame_data);
+    frame_data = Load4<1>(dst + dst_stride, frame_data);
+    const int16x8_t residual = vcombine_s16(s[row], s[row + 1]);
+    const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(residual), frame_data);
+    frame_data = vqmovun_s16(vreinterpretq_s16_u16(b));
+    StoreLo4(dst, frame_data);
+    dst += dst_stride;
+    StoreHi4(dst, frame_data);
+    dst += dst_stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
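+// Each transform size pairs a Row loop with a Column loop. The row pass
+// optionally pre-rounds (ApplyRounding), runs the 1d transform with an
+// in-register transpose and applies the row shift; the column pass
+// optionally flips columns for the flipped-ADST types, runs the 1d
+// transform without transposing and blends into the frame with a 4-bit
+// rounded shift (StoreToFrameWithRound). Both passes try a DcOnly fast
+// path first.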
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
+  if (tx_width >= 16) {
+    int i = 0;
+    do {
+      const int16x8_t a = vld1q_s16(&source[i]);
+      const int16x8_t b = vld1q_s16(&source[i + 8]);
+      const int16x8_t c = vrev64q_s16(a);
+      const int16x8_t d = vrev64q_s16(b);
+      vst1q_s16(&source[i], vcombine_s16(vget_high_s16(d), vget_low_s16(d)));
+      vst1q_s16(&source[i + 8],
+                vcombine_s16(vget_high_s16(c), vget_low_s16(c)));
+      i += 16;
+    } while (i < tx_width * tx_height);
+  } else if (tx_width == 8) {
+    for (int i = 0; i < 8 * tx_height; i += 8) {
+      const int16x8_t a = vld1q_s16(&source[i]);
+      const int16x8_t b = vrev64q_s16(a);
+      vst1q_s16(&source[i], vcombine_s16(vget_high_s16(b), vget_low_s16(b)));
+    }
+  } else {
+    // Process two rows per iteration.
+    for (int i = 0; i < 4 * tx_height; i += 8) {
+      const int16x8_t a = vld1q_s16(&source[i]);
+      vst1q_s16(&source[i], vrev64q_s16(a));
+    }
+  }
+}
+
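+// ApplyRounding note: vqrdmulhq_n_s16(a, m << 3) computes
+// (2 * a * (m << 3) + 2^15) >> 16 == (a * m + 2^11) >> 12, i.e. a rounded
+// Q12 multiply. kTransformRowMultiplier is defined elsewhere in this file;
+// it is expected to be round(2^12 / sqrt(2)) = 2896, making this the spec's
+// row pre-scaling by 1 / sqrt(2).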
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+  if (tx_width == 4) {
+    // Process two rows per iteration.
+    int i = 0;
+    do {
+      const int16x8_t a = vld1q_s16(&source[i]);
+      const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3);
+      vst1q_s16(&source[i], b);
+      i += 8;
+    } while (i < tx_width * num_rows);
+  } else {
+    int i = 0;
+    do {
+      // The last 32 values of every row are always zero if the |tx_width| is
+      // 64.
+      const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+      int j = 0;
+      do {
+        const int16x8_t a = vld1q_s16(&source[i * tx_width + j]);
+        const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3);
+        vst1q_s16(&source[i * tx_width + j], b);
+        j += 8;
+      } while (j < non_zero_width);
+    } while (++i < num_rows);
+  }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+                                    int row_shift) {
+  // vqrshlq_s16 will shift right if shift value is negative.
+  row_shift = -row_shift;
+
+  if (tx_width == 4) {
+    // Process two rows per iteration.
+    int i = 0;
+    do {
+      const int16x8_t residual = vld1q_s16(&source[i]);
+      vst1q_s16(&source[i], vqrshlq_s16(residual, vdupq_n_s16(row_shift)));
+      i += 8;
+    } while (i < tx_width * num_rows);
+  } else {
+    int i = 0;
+    do {
+      for (int j = 0; j < tx_width; j += 8) {
+        const int16x8_t residual = vld1q_s16(&source[i * tx_width + j]);
+        const int16x8_t residual_shifted =
+            vqrshlq_s16(residual, vdupq_n_s16(row_shift));
+        vst1q_s16(&source[i * tx_width + j], residual_shifted);
+      }
+    } while (++i < num_rows);
+  }
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int16_t* LIBGAV1_RESTRICT source,
+    TransformType tx_type) {
+  const bool flip_rows =
+      enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+  // Enable for 4x4, 4x8, 4x16
+  if (tx_height < 32 && tx_width == 4) {
+    uint8x8_t frame_data = vdup_n_u8(0);
+    for (int i = 0; i < tx_height; ++i) {
+      const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+      const int16x4_t residual = vld1_s16(&source[row]);
+      frame_data = Load4<0>(dst, frame_data);
+      const int16x4_t a = vrshr_n_s16(residual, 4);
+      const uint16x8_t b =
+          vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+      const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+      StoreLo4(dst, d);
+      dst += stride;
+    }
+    // Enable for 8x4, 8x8, 8x16, 8x32
+  } else if (tx_height < 64 && tx_width == 8) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
+      const int16x8_t residual = vld1q_s16(&source[row]);
+      const uint8x8_t frame_data = vld1_u8(dst);
+      const int16x8_t a = vrshrq_n_s16(residual, 4);
+      const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+      const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+      vst1_u8(dst, d);
+      dst += stride;
+    }
+    // Remaining widths >= 16.
+  } else {
+    for (int i = 0; i < tx_height; ++i) {
+      const int y = start_y + i;
+      const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+      int j = 0;
+      do {
+        const int x = start_x + j;
+        const int16x8_t residual = vld1q_s16(&source[row + j]);
+        const int16x8_t residual_hi = vld1q_s16(&source[row + j + 8]);
+        const uint8x16_t frame_data = vld1q_u8(frame[y] + x);
+        const int16x8_t a = vrshrq_n_s16(residual, 4);
+        const int16x8_t a_hi = vrshrq_n_s16(residual_hi, 4);
+        const uint16x8_t b =
+            vaddw_u8(vreinterpretq_u16_s16(a), vget_low_u8(frame_data));
+        const uint16x8_t b_hi =
+            vaddw_u8(vreinterpretq_u16_s16(a_hi), vget_high_u8(frame_data));
+        vst1q_u8(frame[y] + x,
+                 vcombine_u8(vqmovun_s16(vreinterpretq_s16_u16(b)),
+                             vqmovun_s16(vreinterpretq_s16_u16(b_hi))));
+        j += 16;
+      } while (j < tx_width);
+    }
+  }
+}
+
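+// Hard-coded row parameters for the 4-point transforms: among the 4xN
+// sizes only 4x8 rows are pre-rounded and only 4x16 rows are shifted (by
+// 1), so the kShouldRound / kTransformRowShift lookups used by the larger
+// sizes reduce to the two comparisons below.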
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+                               int adjusted_tx_height, void* src_buffer,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  const int row_shift = static_cast<int>(tx_height == 16);
+
+  if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d dct4 rows in parallel.
+    Dct4_NEON<ButterflyRotation_4, false>(src, /*step=*/4, /*transpose=*/true);
+  } else {
+    // Process 8 1d dct4 rows in parallel per iteration.
+    int i = adjusted_tx_height;
+    auto* data = src;
+    do {
+      Dct4_NEON<ButterflyRotation_8, true>(data, /*step=*/4,
+                                           /*transpose=*/true);
+      data += 32;
+      i -= 8;
+    } while (i != 0);
+  }
+  if (tx_height == 16) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d dct4 columns in parallel.
+      Dct4_NEON<ButterflyRotation_4, false>(src, tx_width, /*transpose=*/false);
+    } else {
+      // Process 8 1d dct4 columns in parallel per iteration.
+      int i = tx_width;
+      auto* data = src;
+      do {
+        Dct4_NEON<ButterflyRotation_8, true>(data, tx_width,
+                                             /*transpose=*/false);
+        data += 8;
+        i -= 8;
+      } while (i != 0);
+    }
+  }
+
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+                               int adjusted_tx_height, void* src_buffer,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d dct8 rows in parallel.
+    Dct8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+  } else {
+    // Process 8 1d dct8 rows in parallel per iteration.
+    assert(adjusted_tx_height % 8 == 0);
+    int i = adjusted_tx_height;
+    auto* data = src;
+    do {
+      Dct8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+                                            /*transpose=*/true);
+      data += 64;
+      i -= 8;
+    } while (i != 0);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d dct8 columns in parallel.
+      Dct8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      // Process 8 1d dct8 columns in parallel per iteration.
+      int i = tx_width;
+      auto* data = src;
+      do {
+        Dct8_NEON<ButterflyRotation_8, false>(data, tx_width,
+                                              /*transpose=*/false);
+        data += 8;
+        i -= 8;
+      } while (i != 0);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d dct16 rows in parallel.
+    Dct16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+  } else {
+    assert(adjusted_tx_height % 8 == 0);
+    int i = adjusted_tx_height;
+    do {
+      // Process 8 1d dct16 rows in parallel per iteration.
+      Dct16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+                                             row_shift);
+      src += 128;
+      i -= 8;
+    } while (i != 0);
+  }
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d dct16 columns in parallel.
+      Dct16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+                                            /*row_shift=*/0);
+    } else {
+      int i = tx_width;
+      auto* data = src;
+      do {
+        // Process 8 1d dct16 columns in parallel per iteration.
+        Dct16_NEON<ButterflyRotation_8, false>(data, tx_width, /*is_row=*/false,
+                                               /*row_shift=*/0);
+        data += 8;
+        i -= 8;
+      } while (i != 0);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<32>(src, adjusted_tx_height);
+  }
+  // Process 8 1d dct32 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Dct32_NEON(&src[i * 32], 32, /*is_row=*/true, row_shift);
+    i += 8;
+  } while (i < adjusted_tx_height);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+    // Process 8 1d dct32 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 8;
+      i -= 8;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<64>(src, adjusted_tx_height);
+  }
+  // Process 8 1d dct64 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Dct64_NEON(&src[i * 64], 64, /*is_row=*/true, row_shift);
+    i += 8;
+  } while (i < adjusted_tx_height);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+    // Process 8 1d dct64 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 8;
+      i -= 8;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const int row_shift = static_cast<int>(tx_height == 16);
+  const bool should_round = (tx_height == 8);
+
+  if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst4 rows in parallel per iteration.
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Adst4_NEON(data, /*step=*/4, /*transpose=*/true);
+    data += 16;
+    i -= 4;
+  } while (i != 0);
+
+  if (tx_height == 16) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d adst4 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Adst4_NEON(data, tx_width, /*transpose=*/false);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                      tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d adst8 rows in parallel.
+    Adst8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+  } else {
+    // Process 8 1d adst8 rows in parallel per iteration.
+    assert(adjusted_tx_height % 8 == 0);
+    int i = adjusted_tx_height;
+    auto* data = src;
+    do {
+      Adst8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+                                             /*transpose=*/true);
+      data += 64;
+      i -= 8;
+    } while (i != 0);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d adst8 columns in parallel.
+      Adst8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      // Process 8 1d adst8 columns in parallel per iteration.
+      int i = tx_width;
+      auto* data = src;
+      do {
+        Adst8_NEON<ButterflyRotation_8, false>(data, tx_width,
+                                               /*transpose=*/false);
+        data += 8;
+        i -= 8;
+      } while (i != 0);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                      tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                 TransformSize tx_size, int adjusted_tx_height,
+                                 void* src_buffer, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d adst16 rows in parallel.
+    Adst16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+  } else {
+    assert(adjusted_tx_height % 8 == 0);
+    int i = adjusted_tx_height;
+    do {
+      // Process 8 1d adst16 rows in parallel per iteration.
+      Adst16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+                                              row_shift);
+      src += 128;
+      i -= 8;
+    } while (i != 0);
+  }
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height,
+                                    void* LIBGAV1_RESTRICT src_buffer,
+                                    int start_x, int start_y,
+                                    void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d adst16 columns in parallel.
+      Adst16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+                                             /*row_shift=*/0);
+    } else {
+      int i = tx_width;
+      auto* data = src;
+      do {
+        // Process 8 1d adst16 columns in parallel per iteration.
+        Adst16_NEON<ButterflyRotation_8, false>(
+            data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+        data += 8;
+        i -= 8;
+      } while (i != 0);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                       tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int /*start_x*/, int /*start_y*/,
+                                    void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize4x4) {
+    return;
+  }
+
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+
+  if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+  if (tx_height < 16) {
+    int i = adjusted_tx_height;
+    do {
+      Identity4_NEON<false>(src, /*step=*/4);
+      src += 16;
+      i -= 4;
+    } while (i != 0);
+  } else {
+    int i = adjusted_tx_height;
+    do {
+      Identity4_NEON<true>(src, /*step=*/4);
+      src += 16;
+      i -= 4;
+    } while (i != 0);
+  }
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height,
+                                       void* LIBGAV1_RESTRICT src_buffer,
+                                       int start_x, int start_y,
+                                       void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  // Special case: Process row calculations during column transform call.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                                   adjusted_tx_height, src);
+    return;
+  }
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+                                adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int /*start_x*/, int /*start_y*/,
+                                    void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize8x4) {
+    return;
+  }
+
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from ((A * 2) + 1) >> 1 to A.
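+  // Illustrative check of the simplification: with A = 5,
+  // ((5 * 2) + 1) >> 1 == 11 >> 1 == 5, so rows with these heights need no
+  // work and are skipped by the early return below.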
+  if ((tx_height & 0x18) != 0) {
+    return;
+  }
+  if (tx_height == 32) {
+    int i = adjusted_tx_height;
+    do {
+      Identity8Row32_NEON(src, /*step=*/8);
+      src += 32;
+      i -= 4;
+    } while (i != 0);
+    return;
+  }
+
+  assert(tx_size == kTransformSize8x4);
+  int i = adjusted_tx_height;
+  do {
+    Identity8Row4_NEON(src, /*step=*/8);
+    src += 32;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height,
+                                       void* LIBGAV1_RESTRICT src_buffer,
+                                       int start_x, int start_y,
+                                       void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+                                adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int /*start_x*/, int /*start_y*/,
+                                     void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+  int i = adjusted_tx_height;
+  do {
+    Identity16Row_NEON(src, /*step=*/16, kTransformRowShift[tx_size]);
+    src += 64;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+                                        TransformSize tx_size,
+                                        int adjusted_tx_height,
+                                        void* LIBGAV1_RESTRICT src_buffer,
+                                        int start_x, int start_y,
+                                        void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+                                 adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int /*start_x*/, int /*start_y*/,
+                                     void* /*dst_frame*/) {
+  const int tx_height = kTransformHeight[tx_size];
+
+  // When combining the identity32 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from ((A * 4) + 2) >> 2 to A.
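+  // Illustrative check of the simplification: with A = 7,
+  // ((7 * 4) + 2) >> 2 == 30 >> 2 == 7, so these heights need no row work
+  // and return early below.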
+  if ((tx_height & 0x28) != 0) {
+    return;
+  }
+
+  // Process kTransformSize32x16.  The src is always rounded before the
+  // identity transform and shifted by 1 afterwards.
+  auto* src = static_cast<int16_t*>(src_buffer);
+  if (Identity32DcOnly(src, adjusted_tx_height)) {
+    return;
+  }
+
+  assert(tx_size == kTransformSize32x16);
+  ApplyRounding<32>(src, adjusted_tx_height);
+  int i = adjusted_tx_height;
+  do {
+    Identity32Row16_NEON(src, /*step=*/32);
+    src += 128;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
+                                        TransformSize tx_size,
+                                        int adjusted_tx_height,
+                                        void* LIBGAV1_RESTRICT src_buffer,
+                                        int start_x, int start_y,
+                                        void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width,
+                                 adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
+                               int /*adjusted_tx_height*/, void* /*src_buffer*/,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+  // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+
+  // Process 4 1d wht4 rows and columns in parallel.
+  const auto* src = static_cast<int16_t*>(src_buffer);
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  uint8_t* dst = frame[start_y] + start_x;
+  const int dst_stride = frame.columns();
+  Wht4_NEON(dst, dst_stride, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  // Maximum transform size for Dct is 64.
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+      Dct4TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+      Dct4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+      Dct8TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+      Dct8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+      Dct16TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+      Dct16TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+      Dct32TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+      Dct32TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+      Dct64TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+      Dct64TransformLoopColumn_NEON;
+
+  // Maximum transform size for Adst is 16.
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+      Adst4TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+      Adst4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+      Adst8TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+      Adst8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+      Adst16TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+      Adst16TransformLoopColumn_NEON;
+
+  // Maximum transform size for Identity transform is 32.
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+      Identity4TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+      Identity4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+      Identity8TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+      Identity8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+      Identity16TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+      Identity16TransformLoopColumn_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+      Identity32TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+      Identity32TransformLoopColumn_NEON;
+
+  // Maximum transform size for Wht is 4.
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+      Wht4TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+      Wht4TransformLoopColumn_NEON;
+}
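+
+// Dispatch sketch (illustrative): a 2d inverse transform is performed as two
+// 1d passes through the table populated above, e.g. for an 8x8 DCT:
+//   dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow](...);
+//   dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn](...);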
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h
new file mode 100644 (file)
index 0000000..ebd7cf4
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms. See the defines below for specifics.
+// These functions are not thread-safe.
+void InverseTransformInit_NEON();
+void InverseTransformInit10bpp_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize64_Transform1dDct LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dAdst LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dIdentity LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dWht LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
diff --git a/src/dsp/arm/loop_filter_10bit_neon.cc b/src/dsp/arm/loop_filter_10bit_neon.cc
new file mode 100644 (file)
index 0000000..abdc074
--- /dev/null
@@ -0,0 +1,1216 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) {
+  const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
+  return vorr_u16(vget_low_u16(a), vget_high_u16(a));
+}
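+
+// |abd_p0p1_q0q1| packs the p-side absolute differences in its low half and
+// the q-side differences in its high half, so OR-ing the two halves above
+// evaluates the "||" per lane.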
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0,
+                                 const uint16x4_t q0, const uint16x4_t q1,
+                                 const uint16_t outer_thresh) {
+  const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
+  const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
+  const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
+  const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
+  const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
+  return vcle_u16(sum, vdup_n_u16(outer_thresh));
+}
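+
+// Illustrative numbers for OuterThreshold(): p0 = 100, q0 = 90, p1 = 80,
+// q1 = 84 gives |100 - 90| * 2 + |80 - 84| / 2 == 20 + 2 == 22, which passes
+// whenever |outer_thresh| >= 22.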
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+//   OuterThreshold()
+inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+//   abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+//   OuterThreshold()
+inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16x8_t abd_p1p2_q1q2,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+//   abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+//   abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+//   OuterThreshold()
+inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16x8_t abd_p1p2_q1q2,
+                               const uint16x8_t abd_p2p3_q2q3,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
+  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterNMasks functions.
+
+inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+                         const uint16_t hev_thresh, const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const hev_mask,
+                         uint16x4_t* const needs_filter4_mask) {
+  const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  // This includes cases where NeedsFilter4() is not true and so Filter2() will
+  // not be applied.
+  const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+  *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask);
+
+  // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+  *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+//   abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1,
+                          const uint16x8_t abd_p0p2_q0q2) {
+  constexpr int flat_thresh = 1 << 2;
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(b), vget_high_u16(b));
+}
+
+inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1,
+                         const uint16x8_t p0q0, const uint16_t hev_thresh,
+                         const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const needs_filter6_mask,
+                         uint16x4_t* const is_flat3_mask,
+                         uint16x4_t* const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+  *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2));
+  *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
+                                     inner_thresh, outer_mask);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+//   abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+//   abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0,
+                          const uint16x8_t abd_pn1p0_qn1q0,
+                          const uint16x8_t abd_pn2p0_qn2q0) {
+  constexpr int flat_thresh = 1 << 2;
+  const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
+  const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
+  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(c), vget_high_u16(c));
+}
+
+inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2,
+                         const uint16x8_t p1q1, const uint16x8_t p0q0,
+                         const uint16_t hev_thresh, const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const needs_filter8_mask,
+                         uint16x4_t* const is_flat4_mask,
+                         uint16x4_t* const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+  const uint16x4_t is_flat4 =
+      IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3));
+  *needs_filter8_mask =
+      NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
+                   inner_thresh, outer_mask);
+  // |is_flat4_mask| is used to decide where to use the result of Filter8.
+  // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
+  // overriding the question of whether to use Filter8. Because Filter4 doesn't
+  // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the
+  // source value. To be correct, the mask must account for this override.
+  *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterN functions.
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
+inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+                    const uint16x8_t p1q1, const uint16x4_t hev_mask,
+                    uint16x8_t* const p1q1_result,
+                    uint16x8_t* const p0q0_result) {
+  const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
+  // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+  // q0mp0 means "q0 minus p0".
+  const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+  const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+  // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+  const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/)));
+  const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1);
+  const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+  const int16x4_t p1mq1_saturated =
+      Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel);
+  const int16x4_t hev_option =
+      vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
+
+  const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+  // Need to figure out what is going on here: there are some unnecessary
+  // tricks to accommodate 8x8 as the smallest 8bpp vector.
+
+  // We cannot shift with rounding because the clamp comes *before* the shift:
+  // a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int16x4_t plus_four =
+      Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t plus_three =
+      Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+  const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+  // a3 = (a1 + 1) >> 1;
+  const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
+  const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
+  const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+  // Need to shift the second term or we end up with a2_ma2.
+  const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+  const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
+  *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10);
+  *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10);
+}
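+
+// Scalar sketch of the vector math in Filter4() above:
+//   a   = 3 * (q0 - p0) + (hev ? Clip3(p1 - q1, min, max) : 0)
+//   a1  = Clip3(a + 4, min, max) >> 3;  a2 = Clip3(a + 3, min, max) >> 3
+//   a3  = (a1 + 1) >> 1
+//   p1' = p1 + a3;  q1' = q1 - a3;  p0' = p0 + a2;  q0' = q0 - a1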
+
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+
+  const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0),
+                             vld1_u16(dst_q0), vld1_u16(dst_q1)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
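+  // Thresholds are specified at 8-bit scale; the shift by 2 rescales them by
+  // 1 << (bitdepth - 8) == 4 to match 10-bit pixel values.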
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+  Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+               &needs_filter4_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  uint16x8_t f_p1q1;
+  uint16x8_t f_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+  // Already integrated the Hev mask when calculating the filtered values.
+  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
+  // with |needs_filter4_mask| previously.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  // Offset by 2 uint16_t values to load from first p1 position.
+  auto* dst = static_cast<uint8_t*>(dest) - 4;
+  auto* dst_p1 = reinterpret_cast<uint16_t*>(dst);
+  auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2);
+  auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3);
+
+  uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+                       vld1_u16(dst_q1)};
+  Transpose4x4(src);
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+  Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+               &needs_filter4_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  uint16x8_t f_p1q1;
+  uint16x8_t f_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+  // Already integrated the Hev mask when calculating the filtered values.
+  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
+  // with |needs_filter4_mask| previously.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+  uint16x4_t output[4] = {
+      vget_low_u16(p1q1_output),
+      vget_low_u16(p0q0_output),
+      vget_high_u16(p0q0_output),
+      vget_high_u16(p1q1_output),
+  };
+  Transpose4x4(output);
+
+  vst1_u16(dst_p1, output[0]);
+  vst1_u16(dst_p0, output[1]);
+  vst1_u16(dst_q0, output[2]);
+  vst1_u16(dst_q1, output[3]);
+}
+
+inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
+                    const uint16x8_t p0q0, uint16x8_t* const p1q1_output,
+                    uint16x8_t* const p0q0_output) {
+  // Sum p1 and q1 output from opposite directions.
+  // The formula is regrouped to allow 3 doubling operations to be combined.
+  //
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //      ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //                                 ^^^^^^^^
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //                    ^^^^^^^^^^^
+  uint16x8_t sum = vaddq_u16(p2q2, p1q1);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //                                ^^^^^^
+  sum = vaddq_u16(sum, p0q0);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //               ^^^^^
+  sum = vshlq_n_u16(sum, 1);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //        ^^^^^^                          ^^^^^^
+  // Should dual issue with the left shift.
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
+  sum = vaddq_u16(sum, outer_sum);
+
+  *p1q1_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - (2 * p2) + q0 + q1
+  // q0 = q1 - (2 * q2) + p0 + p1
+  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+  //                ^^^^^^^^
+  const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
+  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+  //        ^^^^^^^^
+  sum = vsubq_u16(sum, p2q2_double);
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
+
+  *p0q0_output = vrshrq_n_u16(sum, 3);
+}
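+
+// Sanity check of the regrouping in Filter6() (p side; the q side mirrors
+// it): p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 expands on the p half to
+// p2 + 2 * (p2 + p1 + p0) + q0 == (3 * p2) + (2 * p1) + (2 * p0) + q0.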
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+
+  const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1),
+                             vld1_u16(dst_p0), vld1_u16(dst_q0),
+                             vld1_u16(dst_q1), vld1_u16(dst_q2)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat3_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+  const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+  const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+  // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+  // output is not used.
+  uint16x8_t f6_p1q1, f6_p0q0;
+  const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+  if (vget_lane_u64(need_filter6, 0) == 0) {
+    // Filter6() does not apply, but Filter4() applies to one or more values.
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+    p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  // Left side of the filter window.
+  auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // Overread by 2 values. These overreads become the high halves of src_raw[2]
+  // and src_raw[3] after transpose.
+  uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                           vld1q_u16(dst_3)};
+  Transpose4x8(src_raw);
+  // p2, p1, p0, q0, q1, q2
+  const uint16x4_t src[6] = {
+      vget_low_u16(src_raw[0]),  vget_low_u16(src_raw[1]),
+      vget_low_u16(src_raw[2]),  vget_low_u16(src_raw[3]),
+      vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
+  };
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat3_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+  const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+  const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+  // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+  // output is not used.
+  uint16x8_t f6_p1q1, f6_p0q0;
+  const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+  if (vget_lane_u64(need_filter6, 0) == 0) {
+    // Filter6() does not apply, but Filter4() applies to one or more values.
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+    p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  uint16x4_t output[4] = {
+      vget_low_u16(p1q1_output),
+      vget_low_u16(p0q0_output),
+      vget_high_u16(p0q0_output),
+      vget_high_u16(p1q1_output),
+  };
+  Transpose4x4(output);
+
+  // dst_n starts at p2, so adjust to p1.
+  vst1_u16(dst_0 + 1, output[0]);
+  vst1_u16(dst_1 + 1, output[1]);
+  vst1_u16(dst_2 + 1, output[2]);
+  vst1_u16(dst_3 + 1, output[3]);
+}
+
+inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
+                    const uint16x8_t p1q1, const uint16x8_t p0q0,
+                    uint16x8_t* const p2q2_output,
+                    uint16x8_t* const p1q1_output,
+                    uint16x8_t* const p0q0_output) {
+  // Sum p2 and q2 output from opposite directions.
+  // The formula is regrouped to allow 2 doubling operations to be combined.
+  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+  //                                ^^^^^^^^
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                    ^^^^^^^^^^^
+  const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //               ^^^^^
+  uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+  // Add two other terms to make dual issue with shift more likely.
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                   ^^^^^^^^^^^
+  const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                 ^^^^^^^^^^^^^
+  sum = vaddq_u16(sum, p01q01);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //        ^^^^^^
+  sum = vaddq_u16(sum, p3q3);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                               ^^^^^^
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  sum = vaddq_u16(sum, q0p0);
+
+  *p2q2_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q0 + p1
+  sum = vsubq_u16(sum, p23q23);
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
+
+  *p1q1_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p3 - p1 + p0 + q2
+  // q0 = q1 - q3 - q1 + q0 + p2
+  sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
+  const uint16x8_t q2p2 = Transpose64(p2q2);
+  sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
+
+  *p0q0_output = vrshrq_n_u16(sum, 3);
+}
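+
+// Sanity check of the regrouping in Filter8() (p side): p3q3 + 2 * (p3q3 +
+// p2q2) + p1q1 + p0q0 + q0p0 expands on the p half to p3 + 2 * (p3 + p2) +
+// p1 + p0 + q0 == (3 * p3) + (2 * p2) + p1 + p0 + q0.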
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  const uint16x4_t src[8] = {
+      vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0),
+      vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
+  const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
+  const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
+  const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+  // output is not used.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() does not apply, but Filter4() applies to one or more values.
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t is_flat4_mask_8 =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+    p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+  vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+}
+
+inline uint16x8_t ReverseLowHalf(const uint16x8_t a) {
+  return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
+  // To get the desired pairs after the transpose, one half should be
+  // reversed.
+  uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                       vld1q_u16(dst_3)};
+
+  // src[0] = p0q0
+  // src[1] = p1q1
+  // src[2] = p2q2
+  // src[3] = p3q3
+  LoopFilterTranspose4x8(src);
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask = OuterThreshold(
+      vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
+      vget_high_u16(src[1]), outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = src[0];
+  const uint16x8_t p1q1 = src[1];
+  const uint16x8_t p2q2 = src[2];
+  const uint16x8_t p3q3 = src[3];
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+  // output is not used.
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() does not apply, but Filter4() applies to one or more values.
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t is_flat4_mask_8 =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+    p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3};
+  // After transpose, |output| will contain rows of the form:
+  // p0 p1 p2 p3 q0 q1 q2 q3
+  Transpose4x8(output);
+
+  // Reverse p values to produce original order:
+  // p3 p2 p1 p0 q0 q1 q2 q3
+  vst1q_u16(dst_0, ReverseLowHalf(output[0]));
+  vst1q_u16(dst_1, ReverseLowHalf(output[1]));
+  vst1q_u16(dst_2, ReverseLowHalf(output[2]));
+  vst1q_u16(dst_3, ReverseLowHalf(output[3]));
+}
+
+inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5,
+                     const uint16x8_t p4q4, const uint16x8_t p3q3,
+                     const uint16x8_t p2q2, const uint16x8_t p1q1,
+                     const uint16x8_t p0q0, uint16x8_t* const p5q5_output,
+                     uint16x8_t* const p4q4_output,
+                     uint16x8_t* const p3q3_output,
+                     uint16x8_t* const p2q2_output,
+                     uint16x8_t* const p1q1_output,
+                     uint16x8_t* const p0q0_output) {
+  // Sum p5 and q5 output from opposite directions.
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                                                     ^^^^^^^^
+  const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                 ^^^^^^^^^^^^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                               ^^^^^^^^^^^^^^^^^^^
+  uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
+  sum = vaddq_u16(sum, p6q6_x7);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                       ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                     ^^^^^^^
+  sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                 ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //           ^^^^^^^
+  sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                           ^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //      ^^
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  sum = vaddq_u16(sum, q0p0);
+
+  *p5q5_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p4 and q4 output:
+  // p4 = p5 - (2 * p6) + p3 + q1
+  // q4 = q5 - (2 * q6) + q3 + p1
+  sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
+
+  *p4q4_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p3 and q3 output:
+  // p3 = p4 - p6 - p5 + p2 + q2
+  // q3 = q4 - q6 - q5 + q2 + p2
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
+  const uint16x8_t q2p2 = Transpose64(p2q2);
+  sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
+
+  *p3q3_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p2 and q2 output:
+  // p2 = p3 - p6 - p4 + p1 + q3
+  // q2 = q3 - q6 - q4 + q1 + p3
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
+  const uint16x8_t q3p3 = Transpose64(p3q3);
+  sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
+
+  *p2q2_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p6 - p3 + p0 + q4
+  // q1 = q2 - q6 - q3 + q0 + p4
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
+  const uint16x8_t q4p4 = Transpose64(p4q4);
+  sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
+
+  *p1q1_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p6 - p2 + q0 + q5
+  // q0 = q1 - q6 - q2 + p0 + p5
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
+  const uint16x8_t q5p5 = Transpose64(p5q5);
+  sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
+
+  *p0q0_output = vrshrq_n_u16(sum, 4);
+}
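+
+// For reference, each later tap reuses the previous running sum, e.g.
+// (a sketch; primes denote filter outputs):
+//   sum_p4 = sum_p5 - 2*p6 + p3 + q1
+//   p4'    = (sum_p4 + 8) >> 4
+//          = (5*p6 + 2*p5 + 2*p4 + 2*p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4
+// so each output needs only one subtract and two adds beyond the first.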
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+                       int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride);
+  auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride);
+  auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride);
+  auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+  auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride);
+  auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride);
+  auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride);
+
+  const uint16x4_t src[14] = {
+      vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
+      vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+      vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
+      vld1_u16(dst_q5), vld1_u16(dst_q6)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
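+  // (The thresholds are specified for 8-bit pixels; shifting left by
+  // bitdepth - 8 = 2 rescales them to the 10-bit pixel range.)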
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
+  const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
+  const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
+  const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+  const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
+  const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
+  const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
+  // Mask to choose between the outputs of Filter8 and Filter14.
+  // As with the derivation of |is_flat4_mask|, the question of whether to use
+  // Filter14 is only raised where |is_flat4_mask| is true.
+  const uint16x4_t is_flat4_outer_mask = vand_u16(
+      is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+                             vabdq_u16(p0q0, p6q6)));
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+      p5q5_output;
+  // Because we did not return after testing |needs_filter_mask|, we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+  // output is not used.
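+  // Per lane, the select cascade below reduces to (a sketch, p1 shown;
+  // f4/f8/f14 abbreviate the corresponding filter outputs):
+  //   p1_out = !needs_filter ? p1
+  //          : is_flat4      ? (is_flat4_outer ? f14 : f8)
+  //                          : f4;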
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+    // more values.
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t use_filter8_mask =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+    if (vget_lane_u64(need_filter14, 0) == 0) {
+      // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+      // more values.
+      p5q5_output = p5q5;
+      p4q4_output = p4q4;
+      p3q3_output = p3q3;
+      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    } else {
+      // All filters may contribute values to final outputs.
+      const uint16x8_t use_filter14_mask =
+          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    }
+  }
+
+  vst1_u16(dst_p5, vget_low_u16(p5q5_output));
+  vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+  vst1_u16(dst_p3, vget_low_u16(p3q3_output));
+  vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+  vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+  vst1_u16(dst_q3, vget_high_u16(p3q3_output));
+  vst1_u16(dst_q4, vget_high_u16(p4q4_output));
+  vst1_u16(dst_q5, vget_high_u16(p5q5_output));
+}
+
+inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) {
+  uint16x8x2_t acdb;
+#if defined(__aarch64__)
+  // a[b] <- [c]d
+  acdb.val[0] = vreinterpretq_u16_u64(
+      vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
+  // [a]b <- c[d]
+  acdb.val[1] = vreinterpretq_u16_u64(
+      vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
+#else
+  // a[b] <- [c]d
+  acdb.val[0] = vreinterpretq_u16_u64(
+      vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
+                     vreinterpretq_u64_u16(ab), 1));
+  // [a]b <- c[d]
+  acdb.val[1] = vreinterpretq_u16_u64(
+      vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
+                     vreinterpretq_u64_u16(ab), 0));
+#endif  // defined(__aarch64__)
+  return acdb;
+}
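+
+// E.g. with 64-bit halves ab = {a, b} and cd = {c, d}, both paths return
+// acdb.val[0] = {a, c} and acdb.val[1] = {d, b}.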
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                     int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // Low halves:  p7 p6 p5 p4
+  // High halves: p3 p2 p1 p0
+  uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                         vld1q_u16(dst_3)};
+  // p7 will be the low half of src_p[0]. Not used until the end.
+  Transpose4x8(src_p);
+
+  // Low halves:  q0 q1 q2 q3
+  // High halves: q4 q5 q6 q7
+  uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
+                         vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)};
+  // q7 will be the high half of src_q[3]. Not used until the end.
+  Transpose4x8(src_q);
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask = OuterThreshold(
+      vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
+      vget_low_u16(src_q[1]), outer_thresh);
+  const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
+  const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
+  const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
+  const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+  const uint16x8_t p4q4 =
+      vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
+  const uint16x8_t p5q5 =
+      vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
+  const uint16x8_t p6q6 =
+      vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
+  const uint16x8_t p7q7 =
+      vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
+  // Mask to choose between the outputs of Filter8 and Filter14.
+  // As with the derivation of |is_flat4_mask|, the question of whether to use
+  // Filter14 is only raised where |is_flat4_mask| is true.
+  const uint16x4_t is_flat4_outer_mask = vand_u16(
+      is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+                             vabdq_u16(p0q0, p6q6)));
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+      p5q5_output;
+  // Because we did not return after testing |needs_filter_mask|, we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+  // output is not used.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+    // more values.
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t use_filter8_mask =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+    if (vget_lane_u64(need_filter14, 0) == 0) {
+      // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+      // more values.
+      p5q5_output = p5q5;
+      p4q4_output = p4q4;
+      p3q3_output = p3q3;
+      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    } else {
+      // All filters may contribute values to final outputs.
+      const uint16x8_t use_filter14_mask =
+          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    }
+  }
+  // To get the correctly ordered rows from the transpose, we need:
+  // p7p3 p6p2 p5p1 p4p0
+  // q0q4 q1q5 q2q6 q3q7
+  const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output);
+  const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output);
+  const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output);
+  const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output);
+  uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+                            p5p1_q1q5.val[0], p4p0_q0q4.val[0]};
+  Transpose4x8(output_p);
+  uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+                            p6p2_q2q6.val[1], p7p3_q3q7.val[1]};
+  Transpose4x8(output_q);
+
+  // The permute and transposes above leave each row in its original order:
+  // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+  vst1q_u16(dst_0, output_p[0]);
+  vst1q_u16(dst_0 + 8, output_q[0]);
+  vst1q_u16(dst_1, output_p[1]);
+  vst1q_u16(dst_1 + 8, output_q[1]);
+  vst1q_u16(dst_2, output_p[2]);
+  vst1q_u16(dst_2 + 8, output_q[2]);
+  vst1q_u16(dst_3, output_p[3]);
+  vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+}  // namespace
+
+void LoopFilterInit10bpp_NEON() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Horizontal4_NEON;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Horizontal6_NEON;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Horizontal8_NEON;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Horizontal14_NEON;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Vertical14_NEON;
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit10bpp_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc
new file mode 100644 (file)
index 0000000..a8b236d
--- /dev/null
@@ -0,0 +1,1162 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
+  const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
+  return vorr_u8(a, RightShiftVector<32>(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1,
+                                const uint8_t outer_thresh) {
+  const uint8x8x2_t a = Interleave32(p0q0, p1q1);
+  const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
+  const uint8x8_t p0q0_double = vqadd_u8(b, b);
+  const uint8x8_t p1q1_half = RightShiftVector<32>(vshr_n_u8(b, 1));
+  const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
+  return vcle_u8(c, vdup_n_u8(outer_thresh));
+}
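+
+// Only the low 32 bits of the returned mask are meaningful; the high half is
+// a byproduct of the packed arithmetic. The final masks derived from this are
+// duplicated with InterleaveLow32() before being used in selects.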
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+//   OuterThreshold()
+inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1,
+                              const uint8x8_t p0q0, const uint8x8_t p1q1,
+                              const uint8_t inner_thresh,
+                              const uint8_t outer_thresh) {
+  const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
+  const uint8x8_t inner_mask = vand_u8(a, RightShiftVector<32>(a));
+  const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+  return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter4Masks(const uint8x8_t p0q0, const uint8x8_t p1q1,
+                         const uint8_t hev_thresh, const uint8_t outer_thresh,
+                         const uint8_t inner_thresh, uint8x8_t* const hev_mask,
+                         uint8x8_t* const needs_filter4_mask) {
+  // First half is |p0 - p1|, second half is |q0 - q1|.
+  const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+  // This includes cases where NeedsFilter4() is not true and so Filter2() will
+  // not be applied.
+  const uint8x8_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+  *needs_filter4_mask =
+      NeedsFilter4(p0p1_q0q1, p0q0, p1q1, inner_thresh, outer_thresh);
+
+  // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+  *hev_mask = vand_u8(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
+inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1,
+                    const uint8x8_t hev_mask, uint8x8_t* const p1q1_result,
+                    uint8x8_t* const p0q0_result) {
+  const int16x4_t zero = vdup_n_s16(0);
+
+  // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+  const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubl_u8(q0p1, p0q1));
+  const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+  // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+  const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+  const int8x8_t p1mq1_saturated = vqmovn_s16(vcombine_s16(p1mq1, zero));
+  const int8x8_t hev_option =
+      vand_s8(vreinterpret_s8_u8(hev_mask), p1mq1_saturated);
+
+  const int16x4_t a =
+      vget_low_s16(vaddw_s8(vcombine_s16(q0mp0_3, zero), hev_option));
+
+  // We cannot shift with rounding because the clamp comes *before* the
+  // shifting.
+  //   a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  //   a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int16x4_t plus_four = vadd_s16(a, vdup_n_s16(4));
+  const int16x4_t plus_three = vadd_s16(a, vdup_n_s16(3));
+  const int8x8_t a2_a1 =
+      vshr_n_s8(vqmovn_s16(vcombine_s16(plus_three, plus_four)), 3);
+
+  // a3 is in the high 4 values.
+  // a3 = (a1 + 1) >> 1;
+  const int8x8_t a3 = vrshr_n_s8(a2_a1, 1);
+
+  const int16x8_t p0q1_l = vreinterpretq_s16_u16(vmovl_u8(p0q1));
+  const int16x8_t q0p1_l = vreinterpretq_s16_u16(vmovl_u8(q0p1));
+
+  const int16x8_t p1q1_l =
+      vcombine_s16(vget_high_s16(q0p1_l), vget_high_s16(p0q1_l));
+
+  const int8x8_t a3_ma3 = InterleaveHigh32(a3, vneg_s8(a3));
+  const int16x8_t p1q1_a3 = vaddw_s8(p1q1_l, a3_ma3);
+
+  const int16x8_t p0q0_l =
+      vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l));
+  // Need to shift the second term or we end up with a2_ma2.
+  const int8x8_t a2_ma1 =
+      InterleaveLow32(a2_a1, RightShiftVector<32>(vneg_s8(a2_a1)));
+  const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1);
+
+  *p1q1_result = vqmovun_s16(p1q1_a3);
+  *p0q0_result = vqmovun_s16(p0q0_a);
+}
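+
+// In scalar form the results above are (a sketch; Clip() is the [0, 255]
+// saturation performed by vqmovun):
+//   p1' = Clip(p1 + a3);  q1' = Clip(q1 - a3);  // callers keep p1/q1 if hev
+//   p0' = Clip(p0 + a2);  q0' = Clip(q0 - a1);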
+
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+                      const int outer_thresh, const int inner_thresh,
+                      const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t p1_v = Load4(dst - 2 * stride);
+  const uint8x8_t p0_v = Load4(dst - stride);
+  const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+  const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+
+  uint8x8_t hev_mask;
+  uint8x8_t needs_filter4_mask;
+  Filter4Masks(p0q0, p1q1, hev_thresh, outer_thresh, inner_thresh, &hev_mask,
+               &needs_filter4_mask);
+
+  // Copy the masks to the high bits for packed comparisons later.
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+  needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u8(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+
+  // Already integrated the Hev mask when calculating the filtered values.
+  const uint8x8_t p0q0_output = vbsl_u8(needs_filter4_mask, f_p0q0, p0q0);
+
+  // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
+  // with |needs_filter4_mask| previously.
+  const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
+  const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1);
+
+  StoreLo4(dst - 2 * stride, p1q1_output);
+  StoreLo4(dst - stride, p0q0_output);
+  StoreHi4(dst, p0q0_output);
+  StoreHi4(dst + stride, p1q1_output);
+}
+
+void Vertical4_NEON(void* const dest, const ptrdiff_t stride,
+                    const int outer_thresh, const int inner_thresh,
+                    const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  // Move |dst| to the left side of the filter window.
+  dst -= 2;
+
+  // |p1q0| and |p0q1| are named for the values they will contain after the
+  // transpose.
+  const uint8x8_t row0 = Load4(dst);
+  uint8x8_t p1q0 = Load4<1>(dst + stride, row0);
+  const uint8x8_t row2 = Load4(dst + 2 * stride);
+  uint8x8_t p0q1 = Load4<1>(dst + 3 * stride, row2);
+
+  Transpose4x4(&p1q0, &p0q1);
+  // Rearrange.
+  const uint8x8x2_t p1q1xq0p0 = Interleave32(p1q0, Transpose32(p0q1));
+  const uint8x8x2_t p1q1xp0q0 = {p1q1xq0p0.val[0],
+                                 Transpose32(p1q1xq0p0.val[1])};
+
+  uint8x8_t hev_mask;
+  uint8x8_t needs_filter4_mask;
+  Filter4Masks(p1q1xp0q0.val[1], p1q1xp0q0.val[0], hev_thresh, outer_thresh,
+               inner_thresh, &hev_mask, &needs_filter4_mask);
+
+  // Copy the masks to the high bits for packed comparisons later.
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+  needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u8(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  Filter4(Transpose32(p1q0), p0q1, hev_mask, &f_p1q1, &f_p0q0);
+
+  // Already integrated the Hev mask when calculating the filtered values.
+  const uint8x8_t p0q0_output =
+      vbsl_u8(needs_filter4_mask, f_p0q0, p1q1xp0q0.val[1]);
+
+  // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
+  // with |needs_filter4_mask| previously.
+  const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
+  const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1xp0q0.val[0]);
+
+  // Put things back in order to reverse the transpose.
+  const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
+  uint8x8_t output_0 = p1p0xq1q0.val[0],
+            output_1 = Transpose32(p1p0xq1q0.val[1]);
+
+  Transpose4x4(&output_0, &output_1);
+
+  StoreLo4(dst, output_0);
+  StoreLo4(dst + stride, output_1);
+  StoreHi4(dst + 2 * stride, output_0);
+  StoreHi4(dst + 3 * stride, output_1);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+//   abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 1 for 8 bit decode.
+inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
+                         const uint8x8_t abd_p0p2_q0q2) {
+  const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
+  const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
+  return vand_u8(b, RightShiftVector<32>(b));
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+//   abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+//   OuterThreshold()
+inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
+                              const uint8x8_t abd_p1p2_q1q2,
+                              const uint8x8_t p0q0, const uint8x8_t p1q1,
+                              const uint8_t inner_thresh,
+                              const uint8_t outer_thresh) {
+  const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
+  const uint8x8_t inner_mask = vand_u8(b, RightShiftVector<32>(b));
+  const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+  return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter6Masks(const uint8x8_t p2q2, const uint8x8_t p1q1,
+                         const uint8x8_t p0q0, const uint8_t hev_thresh,
+                         const uint8_t outer_thresh, const uint8_t inner_thresh,
+                         uint8x8_t* const needs_filter6_mask,
+                         uint8x8_t* const is_flat3_mask,
+                         uint8x8_t* const hev_mask) {
+  const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+  *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+  *is_flat3_mask = IsFlat3(p0p1_q0q1, vabd_u8(p0q0, p2q2));
+  *needs_filter6_mask = NeedsFilter6(p0p1_q0q1, vabd_u8(p1q1, p2q2), p0q0, p1q1,
+                                     inner_thresh, outer_thresh);
+}
+
+inline void Filter6(const uint8x8_t p2q2, const uint8x8_t p1q1,
+                    const uint8x8_t p0q0, uint8x8_t* const p1q1_output,
+                    uint8x8_t* const p0q0_output) {
+  // Sum p1 and q1 output from opposite directions.
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //      ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //                                 ^^^^^^^^
+  const uint16x8_t p2q2_double = vaddl_u8(p2q2, p2q2);
+  uint16x8_t sum = vaddw_u8(p2q2_double, p2q2);
+
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //                 ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //                      ^^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p1q1, p1q1), sum);
+
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //                            ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //           ^^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p0q0, p0q0), sum);
+
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //                                       ^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //      ^^
+  const uint8x8_t q0p0 = Transpose32(p0q0);
+  sum = vaddw_u8(sum, q0p0);
+
+  *p1q1_output = vrshrn_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - (2 * p2) + q0 + q1
+  // q0 = q1 - (2 * q2) + p0 + p1
+  sum = vsubq_u16(sum, p2q2_double);
+  const uint8x8_t q1p1 = Transpose32(p1q1);
+  sum = vaddq_u16(vaddl_u8(q0p0, q1p1), sum);
+
+  *p0q0_output = vrshrn_n_u16(sum, 3);
+}
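+
+// Expanded, the two outputs above are (a sketch; mirror p and q for the
+// other side):
+//   p1' = (3*p2 + 2*p1 + 2*p0 + q0 + 4) >> 3
+//   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3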
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+                      const int outer_thresh, const int inner_thresh,
+                      const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t p2_v = Load4(dst - 3 * stride);
+  const uint8x8_t p1_v = Load4(dst - 2 * stride);
+  const uint8x8_t p0_v = Load4(dst - stride);
+  const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+  const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+  const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+
+  uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter6_mask, &is_flat3_mask, &hev_mask);
+
+  needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
+  is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u8(needs_filter6_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f6_p1q1, f6_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
+    // Filter6() does not apply.
+    const uint8x8_t zero = vdup_n_u8(0);
+    f6_p1q1 = zero;
+    f6_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
+  StoreLo4(dst - 2 * stride, p1q1_output);
+  StoreHi4(dst + stride, p1q1_output);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
+  StoreLo4(dst - stride, p0q0_output);
+  StoreHi4(dst, p0q0_output);
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride,
+                    const int outer_thresh, const int inner_thresh,
+                    const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  // Move |dst| to the left side of the filter window.
+  dst -= 3;
+
+  // |p2q1|, |p1q2|, |p0xx| and |q0xx| are named for the values they will
+  // contain after the transpose.
+  // These over-read by 2 bytes. We only need 6.
+  uint8x8_t p2q1 = vld1_u8(dst);
+  uint8x8_t p1q2 = vld1_u8(dst + stride);
+  uint8x8_t p0xx = vld1_u8(dst + 2 * stride);
+  uint8x8_t q0xx = vld1_u8(dst + 3 * stride);
+
+  Transpose8x4(&p2q1, &p1q2, &p0xx, &q0xx);
+
+  const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
+  const uint8x8_t p2q2 = p2q2xq1p1.val[0];
+  const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
+  const uint8x8_t p0q0 = InterleaveLow32(p0xx, q0xx);
+
+  uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter6_mask, &is_flat3_mask, &hev_mask);
+
+  needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
+  is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u8(needs_filter6_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f6_p1q1, f6_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
+    // Filter6() does not apply.
+    const uint8x8_t zero = vdup_n_u8(0);
+    f6_p1q1 = zero;
+    f6_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
+
+  // The six-tap filter reads six pixels but writes only the middle four:
+  // p1 through q1.
+  dst += 1;
+  // Put things back in order to reverse the transpose.
+  const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
+  uint8x8_t output_0 = p1p0xq1q0.val[0];
+  uint8x8_t output_1 = Transpose32(p1p0xq1q0.val[1]);
+
+  Transpose4x4(&output_0, &output_1);
+
+  StoreLo4(dst, output_0);
+  StoreLo4(dst + stride, output_1);
+  StoreHi4(dst + 2 * stride, output_0);
+  StoreHi4(dst + 3 * stride, output_1);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+//   abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+  //   abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 1 for 8 bit decode.
+inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
+                         const uint8x8_t abd_p0n1_q0n1,
+                         const uint8x8_t abd_p0n2_q0n2) {
+  const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
+  const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
+  const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
+  return vand_u8(c, RightShiftVector<32>(c));
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+//   abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+  //   abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+//   OuterThreshold()
+inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
+                              const uint8x8_t abd_p1p2_q1q2,
+                              const uint8x8_t abd_p2p3_q2q3,
+                              const uint8x8_t p0q0, const uint8x8_t p1q1,
+                              const uint8_t inner_thresh,
+                              const uint8_t outer_thresh) {
+  const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
+  const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
+  const uint8x8_t inner_mask = vand_u8(c, RightShiftVector<32>(c));
+  const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+  return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter8Masks(const uint8x8_t p3q3, const uint8x8_t p2q2,
+                         const uint8x8_t p1q1, const uint8x8_t p0q0,
+                         const uint8_t hev_thresh, const uint8_t outer_thresh,
+                         const uint8_t inner_thresh,
+                         uint8x8_t* const needs_filter8_mask,
+                         uint8x8_t* const is_flat4_mask,
+                         uint8x8_t* const hev_mask) {
+  const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+  *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+  *is_flat4_mask = IsFlat4(p0p1_q0q1, vabd_u8(p0q0, p2q2), vabd_u8(p0q0, p3q3));
+  *needs_filter8_mask =
+      NeedsFilter8(p0p1_q0q1, vabd_u8(p1q1, p2q2), vabd_u8(p2q2, p3q3), p0q0,
+                   p1q1, inner_thresh, outer_thresh);
+}
+
+inline void Filter8(const uint8x8_t p3q3, const uint8x8_t p2q2,
+                    const uint8x8_t p1q1, const uint8x8_t p0q0,
+                    uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
+                    uint8x8_t* const p0q0_output) {
+  // Sum p2 and q2 output from opposite directions.
+  // The formula is regrouped to allow 2 doubling operations to be combined.
+  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+  //                                ^^^^^^^^
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                    ^^^^^^^^^^^
+  const uint16x8_t p23q23 = vaddl_u8(p3q3, p2q2);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //               ^^^^^
+  uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+  // Add two other terms to make dual issue with shift more likely.
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                   ^^^^^^^^^^^
+  const uint16x8_t p01q01 = vaddl_u8(p0q0, p1q1);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                 ^^^^^^^^^^^^^
+  sum = vaddq_u16(sum, p01q01);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //        ^^^^^^
+  sum = vaddw_u8(sum, p3q3);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                               ^^^^^^
+  const uint8x8_t q0p0 = Transpose32(p0q0);
+  sum = vaddw_u8(sum, q0p0);
+
+  *p2q2_output = vrshrn_n_u16(sum, 3);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q1 + p1
+  sum = vsubq_u16(sum, p23q23);
+  const uint8x8_t q1p1 = Transpose32(p1q1);
+  sum = vaddq_u16(sum, vaddl_u8(p1q1, q1p1));
+
+  *p1q1_output = vrshrn_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p3 - p1 + p0 + q2
+  // q0 = q1 - q3 - q1 + q0 + p2
+  sum = vsubq_u16(sum, vaddl_u8(p3q3, p1q1));
+  const uint8x8_t q2p2 = Transpose32(p2q2);
+  sum = vaddq_u16(sum, vaddl_u8(p0q0, q2p2));
+
+  *p0q0_output = vrshrn_n_u16(sum, 3);
+}
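+
+// Expanded, the outputs above are (a sketch; mirror p and q for the other
+// side):
+//   p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
+//   p1' = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
+//   p0' = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3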
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+                      const int outer_thresh, const int inner_thresh,
+                      const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t p3_v = Load4(dst - 4 * stride);
+  const uint8x8_t p2_v = Load4(dst - 3 * stride);
+  const uint8x8_t p1_v = Load4(dst - 2 * stride);
+  const uint8x8_t p0_v = Load4(dst - stride);
+  const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+  const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+  const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+  const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() does not apply.
+    const uint8x8_t zero = vdup_n_u8(0);
+    f8_p2q2 = zero;
+    f8_p1q1 = zero;
+    f8_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+    const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
+    StoreLo4(dst - 3 * stride, p2q2_output);
+    StoreHi4(dst + 2 * stride, p2q2_output);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+  StoreLo4(dst - 2 * stride, p1q1_output);
+  StoreHi4(dst + stride, p1q1_output);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+  StoreLo4(dst - stride, p0q0_output);
+  StoreHi4(dst, p0q0_output);
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
+                    const int outer_thresh, const int inner_thresh,
+                    const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  // Move |dst| to the left side of the filter window.
+  dst -= 4;
+
+  // |p3q0|, |p2q1|, |p1q2| and |p0q3| are named for the values they will
+  // contain after the transpose.
+  uint8x8_t p3q0 = vld1_u8(dst);
+  uint8x8_t p2q1 = vld1_u8(dst + stride);
+  uint8x8_t p1q2 = vld1_u8(dst + 2 * stride);
+  uint8x8_t p0q3 = vld1_u8(dst + 3 * stride);
+
+  Transpose8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+  const uint8x8x2_t p3q3xq0p0 = Interleave32(p3q0, Transpose32(p0q3));
+  const uint8x8_t p3q3 = p3q3xq0p0.val[0];
+  const uint8x8_t p0q0 = Transpose32(p3q3xq0p0.val[1]);
+  const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
+  const uint8x8_t p2q2 = p2q2xq1p1.val[0];
+  const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() does not apply.
+    const uint8x8_t zero = vdup_n_u8(0);
+    f8_p2q2 = zero;
+    f8_p1q1 = zero;
+    f8_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  // Always prepare and store p2/q2 because we need to transpose it anyway.
+  const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+
+  // Write out p3/q3 as well. There isn't a good way to write out 6 bytes.
+  // Variable names reflect the values before transposition.
+  const uint8x8x2_t p3q0xq3p0_output =
+      Interleave32(p3q3, Transpose32(p0q0_output));
+  uint8x8_t p3q0_output = p3q0xq3p0_output.val[0];
+  uint8x8_t p0q3_output = Transpose32(p3q0xq3p0_output.val[1]);
+  const uint8x8x2_t p2q1xq2p1_output =
+      Interleave32(p2q2_output, Transpose32(p1q1_output));
+  uint8x8_t p2q1_output = p2q1xq2p1_output.val[0];
+  uint8x8_t p1q2_output = Transpose32(p2q1xq2p1_output.val[1]);
+
+  Transpose8x4(&p3q0_output, &p2q1_output, &p1q2_output, &p0q3_output);
+
+  vst1_u8(dst, p3q0_output);
+  vst1_u8(dst + stride, p2q1_output);
+  vst1_u8(dst + 2 * stride, p1q2_output);
+  vst1_u8(dst + 3 * stride, p0q3_output);
+}
+
+inline void Filter14(const uint8x8_t p6q6, const uint8x8_t p5q5,
+                     const uint8x8_t p4q4, const uint8x8_t p3q3,
+                     const uint8x8_t p2q2, const uint8x8_t p1q1,
+                     const uint8x8_t p0q0, uint8x8_t* const p5q5_output,
+                     uint8x8_t* const p4q4_output, uint8x8_t* const p3q3_output,
+                     uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
+                     uint8x8_t* const p0q0_output) {
+  // Sum p5 and q5 output from opposite directions.
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                                                     ^^^^^^^^
+  uint16x8_t sum = vsubw_u8(vshll_n_u8(p6q6, 3), p6q6);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                 ^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                                          ^^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p5q5, p5q5), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                            ^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                               ^^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p4q4, p4q4), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                       ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                     ^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p3q3, p2q2), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                 ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //           ^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                           ^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //      ^^
+  const uint8x8_t q0p0 = Transpose32(p0q0);
+  sum = vaddw_u8(sum, q0p0);
+
+  *p5q5_output = vrshrn_n_u16(sum, 4);
+
+  // Convert to p4 and q4 output:
+  // p4 = p5 - (2 * p6) + p3 + q1
+  // q4 = q5 - (2 * q6) + q3 + p1
+  sum = vsubq_u16(sum, vaddl_u8(p6q6, p6q6));
+  const uint8x8_t q1p1 = Transpose32(p1q1);
+  sum = vaddq_u16(vaddl_u8(p3q3, q1p1), sum);
+
+  *p4q4_output = vrshrn_n_u16(sum, 4);
+
+  // Convert to p3 and q3 output:
+  // p3 = p4 - p6 - p5 + p2 + q2
+  // q3 = q4 - q6 - q5 + q2 + p2
+  sum = vsubq_u16(sum, vaddl_u8(p6q6, p5q5));
+  const uint8x8_t q2p2 = Transpose32(p2q2);
+  sum = vaddq_u16(vaddl_u8(p2q2, q2p2), sum);
+
+  *p3q3_output = vrshrn_n_u16(sum, 4);
+
+  // Convert to p2 and q2 output:
+  // p2 = p3 - p6 - p4 + p1 + q3
+  // q2 = q3 - q6 - q4 + q1 + p3
+  sum = vsubq_u16(sum, vaddl_u8(p6q6, p4q4));
+  const uint8x8_t q3p3 = Transpose32(p3q3);
+  sum = vaddq_u16(vaddl_u8(p1q1, q3p3), sum);
+
+  *p2q2_output = vrshrn_n_u16(sum, 4);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p6 - p3 + p0 + q4
+  // q1 = q2 - q6 - q3 + q0 + p4
+  sum = vsubq_u16(sum, vaddl_u8(p6q6, p3q3));
+  const uint8x8_t q4p4 = Transpose32(p4q4);
+  sum = vaddq_u16(vaddl_u8(p0q0, q4p4), sum);
+
+  *p1q1_output = vrshrn_n_u16(sum, 4);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p6 - p2 + q0 + q5
+  // q0 = q1 - q6 - q2 + p0 + p5
+  sum = vsubq_u16(sum, vaddl_u8(p6q6, p2q2));
+  const uint8x8_t q5p5 = Transpose32(p5q5);
+  sum = vaddq_u16(vaddl_u8(q0p0, q5p5), sum);
+
+  *p0q0_output = vrshrn_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+                       const int outer_thresh, const int inner_thresh,
+                       const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t p6_v = Load4(dst - 7 * stride);
+  const uint8x8_t p5_v = Load4(dst - 6 * stride);
+  const uint8x8_t p4_v = Load4(dst - 5 * stride);
+  const uint8x8_t p3_v = Load4(dst - 4 * stride);
+  const uint8x8_t p2_v = Load4(dst - 3 * stride);
+  const uint8x8_t p1_v = Load4(dst - 2 * stride);
+  const uint8x8_t p0_v = Load4(dst - stride);
+  const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+  const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+  const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+  const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+  const uint8x8_t p4q4 = Load4<1>(dst + 4 * stride, p4_v);
+  const uint8x8_t p5q5 = Load4<1>(dst + 5 * stride, p5_v);
+  const uint8x8_t p6q6 = Load4<1>(dst + 6 * stride, p6_v);
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Decide between Filter8() and Filter14().
+  uint8x8_t is_flat_outer4_mask =
+      IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+  is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+  is_flat_outer4_mask =
+      InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f8_p1q1, f8_p0q0;
+  uint8x8_t f14_p2q2, f14_p1q1, f14_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() and Filter14() do not apply.
+    const uint8x8_t zero = vdup_n_u8(0);
+    f8_p1q1 = zero;
+    f8_p0q0 = zero;
+    f14_p1q1 = zero;
+    f14_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    uint8x8_t f8_p2q2;
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+    if (vaddv_u8(is_flat_outer4_mask) == 0) {
+      // Filter14() does not apply.
+      const uint8x8_t zero = vdup_n_u8(0);
+      f14_p2q2 = zero;
+      f14_p1q1 = zero;
+      f14_p0q0 = zero;
+    } else {
+#endif  // defined(__aarch64__)
+      uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+
+      const uint8x8_t p5q5_output =
+          vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
+      StoreLo4(dst - 6 * stride, p5q5_output);
+      StoreHi4(dst + 5 * stride, p5q5_output);
+
+      const uint8x8_t p4q4_output =
+          vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
+      StoreLo4(dst - 5 * stride, p4q4_output);
+      StoreHi4(dst + 4 * stride, p4q4_output);
+
+      const uint8x8_t p3q3_output =
+          vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
+      StoreLo4(dst - 4 * stride, p3q3_output);
+      StoreHi4(dst + 3 * stride, p3q3_output);
+#if defined(__aarch64__)
+    }
+#endif  // defined(__aarch64__)
+
+    uint8x8_t p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
+    p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
+    StoreLo4(dst - 3 * stride, p2q2_output);
+    StoreHi4(dst + 2 * stride, p2q2_output);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
+  p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+  StoreLo4(dst - 2 * stride, p1q1_output);
+  StoreHi4(dst + stride, p1q1_output);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
+  p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+  StoreLo4(dst - stride, p0q0_output);
+  StoreHi4(dst, p0q0_output);
+}
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
+                     const int outer_thresh, const int inner_thresh,
+                     const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  dst -= 8;
+  // input
+  // p7 p6 p5 p4 p3 p2 p1 p0  q0 q1 q2 q3 q4 q5 q6 q7
+  const uint8x16_t x0 = vld1q_u8(dst);
+  dst += stride;
+  const uint8x16_t x1 = vld1q_u8(dst);
+  dst += stride;
+  const uint8x16_t x2 = vld1q_u8(dst);
+  dst += stride;
+  const uint8x16_t x3 = vld1q_u8(dst);
+  dst -= (stride * 3);
+
+  // re-order input
+#if defined(__aarch64__)
+  const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+  const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+  const uint8x16_t index_qp7toqp0 = vcombine_u8(index_qp3toqp0, index_qp7toqp4);
+
+  uint8x16_t input_0 = vqtbl1q_u8(x0, index_qp7toqp0);
+  uint8x16_t input_1 = vqtbl1q_u8(x1, index_qp7toqp0);
+  uint8x16_t input_2 = vqtbl1q_u8(x2, index_qp7toqp0);
+  uint8x16_t input_3 = vqtbl1q_u8(x3, index_qp7toqp0);
+#else
+  const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+  const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+
+  const uint8x8_t x0_qp3qp0 = VQTbl1U8(x0, index_qp3toqp0);
+  const uint8x8_t x1_qp3qp0 = VQTbl1U8(x1, index_qp3toqp0);
+  const uint8x8_t x2_qp3qp0 = VQTbl1U8(x2, index_qp3toqp0);
+  const uint8x8_t x3_qp3qp0 = VQTbl1U8(x3, index_qp3toqp0);
+
+  const uint8x8_t x0_qp7qp4 = VQTbl1U8(x0, index_qp7toqp4);
+  const uint8x8_t x1_qp7qp4 = VQTbl1U8(x1, index_qp7toqp4);
+  const uint8x8_t x2_qp7qp4 = VQTbl1U8(x2, index_qp7toqp4);
+  const uint8x8_t x3_qp7qp4 = VQTbl1U8(x3, index_qp7toqp4);
+
+  const uint8x16_t input_0 = vcombine_u8(x0_qp3qp0, x0_qp7qp4);
+  const uint8x16_t input_1 = vcombine_u8(x1_qp3qp0, x1_qp7qp4);
+  const uint8x16_t input_2 = vcombine_u8(x2_qp3qp0, x2_qp7qp4);
+  const uint8x16_t input_3 = vcombine_u8(x3_qp3qp0, x3_qp7qp4);
+#endif
+  // input after re-order
+  // p0 p1 p2 p3 q0 q1 q2 q3  p4 p5 p6 p7 q4 q5 q6 q7
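+  // (Each byte of the index vectors selects one source byte: 0x07..0x04 pick
+  // p0..p3, 0x08..0x0b pick q0..q3, 0x03..0x00 pick p4..p7, and 0x0c..0x0f
+  // pick q4..q7 from the p7..q7 row above.)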
+
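+  // Transpose the four rows so that each 8-byte vector below holds one
+  // column pair: pN of rows 0..3 in its low half and qN of rows 0..3 in its
+  // high half.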
+  const uint8x16x2_t in01 = vtrnq_u8(input_0, input_1);
+  const uint8x16x2_t in23 = vtrnq_u8(input_2, input_3);
+  const uint16x8x2_t in02 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[0]),
+                                      vreinterpretq_u16_u8(in23.val[0]));
+  const uint16x8x2_t in13 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[1]),
+                                      vreinterpretq_u16_u8(in23.val[1]));
+
+  const uint8x8_t p0q0 = vget_low_u8(vreinterpretq_u8_u16(in02.val[0]));
+  const uint8x8_t p1q1 = vget_low_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+  const uint8x8_t p2q2 = vget_low_u8(vreinterpretq_u8_u16(in02.val[1]));
+  const uint8x8_t p3q3 = vget_low_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+  const uint8x8_t p4q4 = vget_high_u8(vreinterpretq_u8_u16(in02.val[0]));
+  const uint8x8_t p5q5 = vget_high_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+  const uint8x8_t p6q6 = vget_high_u8(vreinterpretq_u8_u16(in02.val[1]));
+  const uint8x8_t p7q7 = vget_high_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
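+  // vaddv_u8() (horizontal reduction) is an A64-only intrinsic, so the early
+  // exits below are limited to AArch64; 32-bit ARM computes every filter and
+  // relies on the mask blends instead.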
+#if defined(__aarch64__)
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Decide between Filter8() and Filter14().
+  uint8x8_t is_flat_outer4_mask =
+      IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+  is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+  is_flat_outer4_mask =
+      InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+  uint8x8_t f_p0q0, f_p1q1;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t p1q1_output, p0q0_output;
+  uint8x8_t p5q5_output, p4q4_output, p3q3_output, p2q2_output;
+
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() and Filter14() do not apply.
+    p1q1_output = p1q1;
+    p0q0_output = p0q0;
+
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+  } else {
+#endif  // defined(__aarch64__)
+    uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+    if (vaddv_u8(is_flat_outer4_mask) == 0) {
+      // Filter14() does not apply.
+      p5q5_output = p5q5;
+      p4q4_output = p4q4;
+      p3q3_output = p3q3;
+      p2q2_output = f8_p2q2;
+      p1q1_output = f8_p1q1;
+      p0q0_output = f8_p0q0;
+    } else {
+#endif  // defined(__aarch64__)
+      uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+
+      p5q5_output = vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
+      p4q4_output = vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
+      p3q3_output = vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
+      p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
+      p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
+      p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
+#if defined(__aarch64__)
+    }
+#endif  // defined(__aarch64__)
+    p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+  p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+
+  const uint8x16_t p0q0_p4q4 = vcombine_u8(p0q0_output, p4q4_output);
+  const uint8x16_t p2q2_p6q6 = vcombine_u8(p2q2_output, p6q6);
+  const uint8x16_t p1q1_p5q5 = vcombine_u8(p1q1_output, p5q5_output);
+  const uint8x16_t p3q3_p7q7 = vcombine_u8(p3q3_output, p7q7);
+
+  const uint16x8x2_t out02 = vtrnq_u16(vreinterpretq_u16_u8(p0q0_p4q4),
+                                       vreinterpretq_u16_u8(p2q2_p6q6));
+  const uint16x8x2_t out13 = vtrnq_u16(vreinterpretq_u16_u8(p1q1_p5q5),
+                                       vreinterpretq_u16_u8(p3q3_p7q7));
+  const uint8x16x2_t out01 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[0]),
+                                      vreinterpretq_u8_u16(out13.val[0]));
+  const uint8x16x2_t out23 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[1]),
+                                      vreinterpretq_u8_u16(out13.val[1]));
+
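+  // Each output row is now ordered
+  // p0 p1 p2 p3 q0 q1 q2 q3  p4 p5 p6 p7 q4 q5 q6 q7,
+  // so shuffle back to the memory order p7 .. p0 q0 .. q7 before storing.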
+#if defined(__aarch64__)
+  const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
+  const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
+  const uint8x16_t index_p7toq7 = vcombine_u8(index_p7top0, index_q7toq0);
+
+  const uint8x16_t output_0 = vqtbl1q_u8(out01.val[0], index_p7toq7);
+  const uint8x16_t output_1 = vqtbl1q_u8(out01.val[1], index_p7toq7);
+  const uint8x16_t output_2 = vqtbl1q_u8(out23.val[0], index_p7toq7);
+  const uint8x16_t output_3 = vqtbl1q_u8(out23.val[1], index_p7toq7);
+#else
+  const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
+  const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
+
+  const uint8x8_t x0_p7p0 = VQTbl1U8(out01.val[0], index_p7top0);
+  const uint8x8_t x1_p7p0 = VQTbl1U8(out01.val[1], index_p7top0);
+  const uint8x8_t x2_p7p0 = VQTbl1U8(out23.val[0], index_p7top0);
+  const uint8x8_t x3_p7p0 = VQTbl1U8(out23.val[1], index_p7top0);
+
+  const uint8x8_t x0_q7q0 = VQTbl1U8(out01.val[0], index_q7toq0);
+  const uint8x8_t x1_q7q0 = VQTbl1U8(out01.val[1], index_q7toq0);
+  const uint8x8_t x2_q7q0 = VQTbl1U8(out23.val[0], index_q7toq0);
+  const uint8x8_t x3_q7q0 = VQTbl1U8(out23.val[1], index_q7toq0);
+
+  const uint8x16_t output_0 = vcombine_u8(x0_p7p0, x0_q7q0);
+  const uint8x16_t output_1 = vcombine_u8(x1_p7p0, x1_q7q0);
+  const uint8x16_t output_2 = vcombine_u8(x2_p7p0, x2_q7q0);
+  const uint8x16_t output_3 = vcombine_u8(x3_p7p0, x3_q7q0);
+#endif
+
+  vst1q_u8(dst, output_0);
+  dst += stride;
+  vst1q_u8(dst, output_1);
+  dst += stride;
+  vst1q_u8(dst, output_2);
+  dst += stride;
+  vst1q_u8(dst, output_3);
+}
+
+}  // namespace
+
+void LoopFilterInit_NEON() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Horizontal4_NEON;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
+
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Horizontal6_NEON;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
+
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Horizontal8_NEON;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
+
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Horizontal14_NEON;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Vertical14_NEON;
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/loop_filter_neon.h b/src/dsp/arm/loop_filter_neon.h
new file mode 100644
index 0000000..531cd0d
--- /dev/null
+++ b/src/dsp/arm/loop_filter_neon.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters, see the defines below for specifics. This
+// function is not thread-safe.
+void LoopFilterInit_NEON();
+void LoopFilterInit10bpp_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical \
+  LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
diff --git a/src/dsp/arm/loop_restoration_10bit_neon.cc b/src/dsp/arm/loop_restoration_10bit_neon.cc
new file mode 100644
index 0000000..9191080
--- /dev/null
+++ b/src/dsp/arm/loop_restoration_10bit_neon.cc
@@ -0,0 +1,2652 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// Wiener
+
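+// The Wiener filter is a separable 7-tap filter whose taps are symmetric
+// (filter[k] == filter[6 - k]), so only filter[0..3] are stored per
+// direction. The kernels below exploit the symmetry by adding the two
+// mirrored samples before each multiply (see WienerHorizontal2() and
+// WienerVertical2()).
+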
+// Must make a local copy of the coefficients so the compiler knows that they
+// do not overlap other buffers. Using the 'const' keyword is not enough. In
+// practice the compiler does not emit a copy, since there are enough
+// registers in this case.
+inline void PopulateWienerCoefficients(
+    const RestorationUnitInfo& restoration_info, const int direction,
+    int16_t filter[4]) {
+  for (int i = 0; i < 4; ++i) {
+    filter[i] = restoration_info.wiener_info.filter[direction][i];
+  }
+}
+
+inline int32x4x2_t WienerHorizontal2(const uint16x8_t s0, const uint16x8_t s1,
+                                     const int16_t filter,
+                                     const int32x4x2_t sum) {
+  const int16x8_t ss = vreinterpretq_s16_u16(vaddq_u16(s0, s1));
+  int32x4x2_t res;
+  res.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(ss), filter);
+  res.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(ss), filter);
+  return res;
+}
+
+inline void WienerHorizontalSum(const uint16x8_t s[3], const int16_t filter[4],
+                                int32x4x2_t sum, int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (kBitdepth10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
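+  // Assuming kWienerFilterBits == 7 and kInterRoundBitsHorizontal == 3, this
+  // gives offset == 8192 and limit == 32767; the stored intermediates are
+  // clamped to [-offset, limit - offset] == [-8192, 24575].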
+  const int16x8_t s_0_2 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[2]));
+  const int16x8_t s_1 = vreinterpretq_s16_u16(s[1]);
+  int16x4x2_t sum16;
+  sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(s_0_2), filter[2]);
+  sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(s_1), filter[3]);
+  sum16.val[0] = vqshrn_n_s32(sum.val[0], kInterRoundBitsHorizontal);
+  sum16.val[0] = vmax_s16(sum16.val[0], vdup_n_s16(-offset));
+  sum16.val[0] = vmin_s16(sum16.val[0], vdup_n_s16(limit - offset));
+  vst1_s16(wiener_buffer, sum16.val[0]);
+  sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(s_0_2), filter[2]);
+  sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(s_1), filter[3]);
+  sum16.val[1] = vqshrn_n_s32(sum.val[1], kInterRoundBitsHorizontal);
+  sum16.val[1] = vmax_s16(sum16.val[1], vdup_n_s16(-offset));
+  sum16.val[1] = vmin_s16(sum16.val[1], vdup_n_s16(limit - offset));
+  vst1_s16(wiener_buffer + 4, sum16.val[1]);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t wiener_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int16_t filter[4],
+                                 int16_t** const wiener_buffer) {
+  const ptrdiff_t src_width =
+      width + ((kRestorationHorizontalBorder - 1) * sizeof(*src));
+  for (int y = height; y != 0; --y) {
+    const uint16_t* src_ptr = src;
+    uint16x8_t s[8];
+    s[0] = vld1q_u16(src_ptr);
+    ptrdiff_t x = wiener_stride;
+    ptrdiff_t valid_bytes = src_width * 2;
+    do {
+      src_ptr += 8;
+      valid_bytes -= 16;
+      s[7] = Load1QMsanU16(src_ptr, 16 - valid_bytes);
+      s[1] = vextq_u16(s[0], s[7], 1);
+      s[2] = vextq_u16(s[0], s[7], 2);
+      s[3] = vextq_u16(s[0], s[7], 3);
+      s[4] = vextq_u16(s[0], s[7], 4);
+      s[5] = vextq_u16(s[0], s[7], 5);
+      s[6] = vextq_u16(s[0], s[7], 6);
+      int32x4x2_t sum;
+      sum.val[0] = sum.val[1] =
+          vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1));
+      sum = WienerHorizontal2(s[0], s[6], filter[0], sum);
+      sum = WienerHorizontal2(s[1], s[5], filter[1], sum);
+      WienerHorizontalSum(s + 2, filter, sum, *wiener_buffer);
+      s[0] = s[7];
+      *wiener_buffer += 8;
+      x -= 8;
+    } while (x != 0);
+    src += src_stride;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t wiener_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int16_t filter[4],
+                                 int16_t** const wiener_buffer) {
+  const ptrdiff_t src_width =
+      width + ((kRestorationHorizontalBorder - 1) * sizeof(*src));
+  for (int y = height; y != 0; --y) {
+    const uint16_t* src_ptr = src;
+    uint16x8_t s[6];
+    s[0] = vld1q_u16(src_ptr);
+    ptrdiff_t x = wiener_stride;
+    ptrdiff_t valid_bytes = src_width * 2;
+    do {
+      src_ptr += 8;
+      valid_bytes -= 16;
+      s[5] = Load1QMsanU16(src_ptr, 16 - valid_bytes);
+      s[1] = vextq_u16(s[0], s[5], 1);
+      s[2] = vextq_u16(s[0], s[5], 2);
+      s[3] = vextq_u16(s[0], s[5], 3);
+      s[4] = vextq_u16(s[0], s[5], 4);
+
+      int32x4x2_t sum;
+      sum.val[0] = sum.val[1] =
+          vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1));
+      sum = WienerHorizontal2(s[0], s[4], filter[1], sum);
+      WienerHorizontalSum(s + 1, filter, sum, *wiener_buffer);
+      s[0] = s[5];
+      *wiener_buffer += 8;
+      x -= 8;
+    } while (x != 0);
+    src += src_stride;
+  }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int16_t filter[4],
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    const uint16_t* src_ptr = src;
+    uint16x8_t s[3];
+    ptrdiff_t x = width;
+    do {
+      s[0] = vld1q_u16(src_ptr);
+      s[1] = vld1q_u16(src_ptr + 1);
+      s[2] = vld1q_u16(src_ptr + 2);
+
+      int32x4x2_t sum;
+      sum.val[0] = sum.val[1] =
+          vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1));
+      WienerHorizontalSum(s, filter, sum, *wiener_buffer);
+      src_ptr += 8;
+      *wiener_buffer += 8;
+      x -= 8;
+    } while (x != 0);
+    src += src_stride;
+  }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      const uint16x8_t s = vld1q_u16(src + x);
+      const int16x8_t d = vreinterpretq_s16_u16(vshlq_n_u16(s, 4));
+      vst1q_s16(*wiener_buffer + x, d);
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1,
+                                   const int16_t filter,
+                                   const int32x4x2_t sum) {
+  int32x4x2_t d;
+  d.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(a0), filter);
+  d.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(a0), filter);
+  d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a1), filter);
+  d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a1), filter);
+  return d;
+}
+
+inline uint16x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[4],
+                                 const int32x4x2_t sum) {
+  int32x4x2_t d = WienerVertical2(a[0], a[2], filter[2], sum);
+  d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a[1]), filter[3]);
+  d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a[1]), filter[3]);
+  const uint16x4_t sum_lo_16 = vqrshrun_n_s32(d.val[0], 11);
+  const uint16x4_t sum_hi_16 = vqrshrun_n_s32(d.val[1], 11);
+  return vcombine_u16(sum_lo_16, sum_hi_16);
+}
+
+inline uint16x8_t WienerVerticalTap7Kernel(const int16_t* const wiener_buffer,
+                                           const ptrdiff_t wiener_stride,
+                                           const int16_t filter[4],
+                                           int16x8_t a[7]) {
+  int32x4x2_t sum;
+  a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+  a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+  a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+  a[6] = vld1q_s16(wiener_buffer + 6 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[0], a[6], filter[0], sum);
+  sum = WienerVertical2(a[1], a[5], filter[1], sum);
+  a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+  a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+  a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+  return WienerVertical(a + 2, filter, sum);
+}
+
+inline uint16x8x2_t WienerVerticalTap7Kernel2(
+    const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride,
+    const int16_t filter[4]) {
+  int16x8_t a[8];
+  int32x4x2_t sum;
+  uint16x8x2_t d;
+  d.val[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[7] = vld1q_s16(wiener_buffer + 7 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[1], a[7], filter[0], sum);
+  sum = WienerVertical2(a[2], a[6], filter[1], sum);
+  d.val[1] = WienerVertical(a + 3, filter, sum);
+  return d;
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t filter[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  for (int y = height >> 1; y != 0; --y) {
+    uint16_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      uint16x8x2_t d[2];
+      d[0] = WienerVerticalTap7Kernel2(wiener_buffer + 0, width, filter);
+      d[1] = WienerVerticalTap7Kernel2(wiener_buffer + 8, width, filter);
+      vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8 + dst_stride,
+                vminq_u16(d[1].val[1], v_max_bitdepth));
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      int16x8_t a[7];
+      const uint16x8_t d0 =
+          WienerVerticalTap7Kernel(wiener_buffer + 0, width, filter, a);
+      const uint16x8_t d1 =
+          WienerVerticalTap7Kernel(wiener_buffer + 8, width, filter, a);
+      vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth));
+      vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth));
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+inline uint16x8_t WienerVerticalTap5Kernel(const int16_t* const wiener_buffer,
+                                           const ptrdiff_t wiener_stride,
+                                           const int16_t filter[4],
+                                           int16x8_t a[5]) {
+  a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+  a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+  a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+  a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+  a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+  int32x4x2_t sum;
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[0], a[4], filter[1], sum);
+  return WienerVertical(a + 1, filter, sum);
+}
+
+inline uint16x8x2_t WienerVerticalTap5Kernel2(
+    const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride,
+    const int16_t filter[4]) {
+  int16x8_t a[6];
+  int32x4x2_t sum;
+  uint16x8x2_t d;
+  d.val[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[1], a[5], filter[1], sum);
+  d.val[1] = WienerVertical(a + 2, filter, sum);
+  return d;
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t filter[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  for (int y = height >> 1; y != 0; --y) {
+    uint16_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      uint16x8x2_t d[2];
+      d[0] = WienerVerticalTap5Kernel2(wiener_buffer + 0, width, filter);
+      d[1] = WienerVerticalTap5Kernel2(wiener_buffer + 8, width, filter);
+      vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8 + dst_stride,
+                vminq_u16(d[1].val[1], v_max_bitdepth));
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      int16x8_t a[5];
+      const uint16x8_t d0 =
+          WienerVerticalTap5Kernel(wiener_buffer + 0, width, filter, a);
+      const uint16x8_t d1 =
+          WienerVerticalTap5Kernel(wiener_buffer + 8, width, filter, a);
+      vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth));
+      vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth));
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+inline uint16x8_t WienerVerticalTap3Kernel(const int16_t* const wiener_buffer,
+                                           const ptrdiff_t wiener_stride,
+                                           const int16_t filter[4],
+                                           int16x8_t a[3]) {
+  a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+  a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+  a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+  int32x4x2_t sum;
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  return WienerVertical(a, filter, sum);
+}
+
+inline uint16x8x2_t WienerVerticalTap3Kernel2(
+    const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride,
+    const int16_t filter[4]) {
+  int16x8_t a[4];
+  int32x4x2_t sum;
+  uint16x8x2_t d;
+  d.val[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  d.val[1] = WienerVertical(a + 1, filter, sum);
+  return d;
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t filter[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+
+  for (int y = height >> 1; y != 0; --y) {
+    uint16_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      uint16x8x2_t d[2];
+      d[0] = WienerVerticalTap3Kernel2(wiener_buffer + 0, width, filter);
+      d[1] = WienerVerticalTap3Kernel2(wiener_buffer + 8, width, filter);
+
+      vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8 + dst_stride,
+                vminq_u16(d[1].val[1], v_max_bitdepth));
+
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      int16x8_t a[3];
+      const uint16x8_t d0 =
+          WienerVerticalTap3Kernel(wiener_buffer + 0, width, filter, a);
+      const uint16x8_t d1 =
+          WienerVerticalTap3Kernel(wiener_buffer + 8, width, filter, a);
+      vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth));
+      vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth));
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint16_t* const dst) {
+  const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  const int16x8_t a0 = vld1q_s16(wiener_buffer + 0);
+  const int16x8_t a1 = vld1q_s16(wiener_buffer + 8);
+  const int16x8_t d0 = vrshrq_n_s16(a0, 4);
+  const int16x8_t d1 = vrshrq_n_s16(a1, 4);
+  vst1q_u16(dst, vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(d0, vdupq_n_s16(0))),
+                           v_max_bitdepth));
+  vst1q_u16(dst + 8,
+            vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(d1, vdupq_n_s16(0))),
+                      v_max_bitdepth));
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint16_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y != 0; --y) {
+    uint16_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer, dst_ptr);
+      WienerVerticalTap1Kernel(wiener_buffer + width, dst_ptr + dst_stride);
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer, dst);
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+// For width 16 and up, store the horizontal results, and then do the
+// vertical filter row by row. This is faster than filtering column by column
+// because of better cache behavior.
+void WienerFilter_NEON(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+  int16_t filter_horizontal[(kWienerFilterTaps + 1) / 2];
+  int16_t filter_vertical[(kWienerFilterTaps + 1) / 2];
+  PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal,
+                             filter_horizontal);
+  PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical,
+                             filter_vertical);
+  // horizontal filtering.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
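+  // For example, with a full 7-tap vertical filter
+  // (number_leading_zero_coefficients == 0), number_rows_to_skip is 1, so
+  // height_horizontal == height + 4 and height_extra == 2: two extra rows
+  // are filtered above and below the restoration unit.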
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, width, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, width, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, width,
+                         height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, width, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, width, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, width,
+                         height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // vertical filtering.
+  auto* dst = static_cast<uint16_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 4;
+constexpr int kOverreadInBytesPass2 = 8;
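+// For example, when width is a multiple of 8, Pass 1 reads
+// 8 - 0 - 2 * 3 = 2 pixels past the valid data (4 bytes of 16-bit pixels)
+// and Pass 2 reads 8 - 0 - 2 * 2 = 4 pixels (8 bytes), matching the
+// constants above.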
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               uint16x8_t dst[2]) {
+  dst[0] = vld1q_u16(src[0] + x);
+  dst[1] = vld1q_u16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   uint16x8_t dst[2]) {
+  dst[0] = Load1QMsanU16(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = Load1QMsanU16(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               uint16x8_t dst[3]) {
+  dst[0] = vld1q_u16(src[0] + x);
+  dst[1] = vld1q_u16(src[1] + x);
+  dst[2] = vld1q_u16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   uint16x8_t dst[3]) {
+  dst[0] = Load1QMsanU16(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = Load1QMsanU16(src[1] + x, sizeof(**src) * (x + 8 - border));
+  dst[2] = Load1QMsanU16(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, uint32x4_t dst[2]) {
+  dst[0] = vld1q_u32(src + 0);
+  dst[1] = vld1q_u32(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+                                 const ptrdiff_t border, uint32x4_t dst[2]) {
+  dst[0] = Load1QMsanU32(src + x + 0, sizeof(*src) * (x + 4 - border));
+  dst[1] = Load1QMsanU32(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               uint32x4_t dst[2][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   uint32x4_t dst[2][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               uint32x4_t dst[3][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+  LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   uint32x4_t dst[3][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+  LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const uint16x8_t src[2]) {
+  vst1q_u16(dst + 0, src[0]);
+  vst1q_u16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const uint32x4_t src[2]) {
+  vst1q_u32(dst + 0, src[0]);
+  vst1q_u32(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const uint32x4_t src[4]) {
+  StoreAligned32U32(dst + 0, src + 0);
+  StoreAligned32U32(dst + 8, src + 2);
+}
+
+inline uint16x8_t VaddwLo8(const uint16x8_t src0, const uint8x16_t src1) {
+  const uint8x8_t s1 = vget_low_u8(src1);
+  return vaddw_u8(src0, s1);
+}
+
+inline uint16x8_t VaddwHi8(const uint16x8_t src0, const uint8x16_t src1) {
+  const uint8x8_t s1 = vget_high_u8(src1);
+  return vaddw_u8(src0, s1);
+}
+
+inline uint32x4_t VmullLo16(const uint16x8_t src0, const uint16x8_t src1) {
+  return vmull_u16(vget_low_u16(src0), vget_low_u16(src1));
+}
+
+inline uint32x4_t VmullHi16(const uint16x8_t src0, const uint16x8_t src1) {
+  return vmull_u16(vget_high_u16(src0), vget_high_u16(src1));
+}
+
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8x2_t src) {
+  return vext_u8(src.val[0], src.val[1], bytes);
+}
+
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8_t src[2]) {
+  return vext_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint8x16_t VshrU128(const uint8x16_t src[2]) {
+  return vextq_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8x2_t src) {
+  return vextq_u16(src.val[0], src.val[1], bytes / 2);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8_t src[2]) {
+  return vextq_u16(src[0], src[1], bytes / 2);
+}
+
+inline uint32x4_t Square(uint16x4_t s) { return vmull_u16(s, s); }
+
+inline void Square(const uint16x8_t src, uint32x4_t dst[2]) {
+  const uint16x4_t s_lo = vget_low_u16(src);
+  const uint16x4_t s_hi = vget_high_u16(src);
+  dst[0] = Square(s_lo);
+  dst[1] = Square(s_hi);
+}
+
+template <int offset>
+inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) {
+  dst[0] = VshrU128<offset + 0>(src);
+  dst[1] = VshrU128<offset + 1>(src);
+  dst[2] = VshrU128<offset + 2>(src);
+}
+
+inline void Prepare3_16(const uint16x8_t src[2], uint16x8_t dst[3]) {
+  dst[0] = src[0];
+  dst[1] = vextq_u16(src[0], src[1], 1);
+  dst[2] = vextq_u16(src[0], src[1], 2);
+}
+
+template <int offset>
+inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) {
+  dst[0] = VshrU128<offset + 0>(src);
+  dst[1] = VshrU128<offset + 1>(src);
+  dst[2] = VshrU128<offset + 2>(src);
+  dst[3] = VshrU128<offset + 3>(src);
+  dst[4] = VshrU128<offset + 4>(src);
+}
+
+inline void Prepare5_16(const uint16x8_t src[2], uint16x8_t dst[5]) {
+  dst[0] = src[0];
+  dst[1] = vextq_u16(src[0], src[1], 1);
+  dst[2] = vextq_u16(src[0], src[1], 2);
+  dst[3] = vextq_u16(src[0], src[1], 3);
+  dst[4] = vextq_u16(src[0], src[1], 4);
+}
+
+inline void Prepare3_32(const uint32x4_t src[2], uint32x4_t dst[3]) {
+  dst[0] = src[0];
+  dst[1] = vextq_u32(src[0], src[1], 1);
+  dst[2] = vextq_u32(src[0], src[1], 2);
+}
+
+inline void Prepare5_32(const uint32x4_t src[2], uint32x4_t dst[5]) {
+  Prepare3_32(src, dst);
+  dst[3] = vextq_u32(src[0], src[1], 3);
+  dst[4] = src[1];
+}
+
+inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) {
+  const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+  return vaddw_u8(sum, vget_low_u8(src[2]));
+}
+
+inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) {
+  const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+  return vaddw_u8(sum, vget_high_u8(src[2]));
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src0, const uint16x8_t src1,
+                          const uint16x8_t src2) {
+  const uint16x8_t sum = vaddq_u16(src0, src1);
+  return vaddq_u16(sum, src2);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src0, const uint32x4_t src1,
+                          const uint32x4_t src2) {
+  const uint32x4_t sum = vaddq_u32(src0, src1);
+  return vaddq_u32(sum, src2);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src[3]) {
+  return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const uint32x4_t src[3][2], uint32x4_t dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline uint16x8_t Sum5_16(const uint16x8_t src[5]) {
+  const uint16x8_t sum01 = vaddq_u16(src[0], src[1]);
+  const uint16x8_t sum23 = vaddq_u16(src[2], src[3]);
+  const uint16x8_t sum = vaddq_u16(sum01, sum23);
+  return vaddq_u16(sum, src[4]);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t* src0, const uint32x4_t* src1,
+                          const uint32x4_t* src2, const uint32x4_t* src3,
+                          const uint32x4_t* src4) {
+  const uint32x4_t sum01 = vaddq_u32(*src0, *src1);
+  const uint32x4_t sum23 = vaddq_u32(*src2, *src3);
+  const uint32x4_t sum = vaddq_u32(sum01, sum23);
+  return vaddq_u32(sum, *src4);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t src[5]) {
+  return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const uint32x4_t src[5][2], uint32x4_t dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline uint16x8_t Sum3Horizontal16(const uint16x8_t src[2]) {
+  uint16x8_t s[3];
+  Prepare3_16(src, s);
+  return Sum3_16(s);
+}
+
+inline void Sum3Horizontal32(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum3_32(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum3_32(s);
+}
+
+inline uint16x8_t Sum5Horizontal16(const uint16x8_t src[2]) {
+  uint16x8_t s[5];
+  Prepare5_16(src, s);
+  return Sum5_16(s);
+}
+
+inline void Sum5Horizontal32(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[5];
+  Prepare5_32(src + 0, s);
+  dst[0] = Sum5_32(s);
+  Prepare5_32(src + 1, s);
+  dst[1] = Sum5_32(s);
+}
+
+void SumHorizontal16(const uint16x8_t src[2], uint16x8_t* const row3,
+                     uint16x8_t* const row5) {
+  uint16x8_t s[5];
+  Prepare5_16(src, s);
+  const uint16x8_t sum04 = vaddq_u16(s[0], s[4]);
+  *row3 = Sum3_16(s + 1);
+  *row5 = vaddq_u16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const uint16x8_t src[3], uint16x8_t* const row3_0,
+                            uint16x8_t* const row3_1, uint16x8_t* const row5_0,
+                            uint16x8_t* const row5_1) {
+  SumHorizontal16(src + 0, row3_0, row5_0);
+  SumHorizontal16(src + 1, row3_1, row5_1);
+}
+
+void SumHorizontal32(const uint32x4_t src[5], uint32x4_t* const row_sq3,
+                     uint32x4_t* const row_sq5) {
+  const uint32x4_t sum04 = vaddq_u32(src[0], src[4]);
+  *row_sq3 = Sum3_32(src + 1);
+  *row_sq5 = vaddq_u32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const uint32x4_t src[3],
+                            uint32x4_t* const row_sq3_0,
+                            uint32x4_t* const row_sq3_1,
+                            uint32x4_t* const row_sq5_0,
+                            uint32x4_t* const row_sq5_1) {
+  uint32x4_t s[5];
+  Prepare5_32(src + 0, s);
+  SumHorizontal32(s, row_sq3_0, row_sq5_0);
+  Prepare5_32(src + 1, s);
+  SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
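+// Sum343 weights three adjacent column sums 3, 4, 3: tripling the plain sum
+// yields the 3s, and adding the middle element once more yields the 4.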
+inline uint16x8_t Sum343Lo(const uint8x16_t ma3[3]) {
+  const uint16x8_t sum = Sum3WLo16(ma3);
+  const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline uint16x8_t Sum343Hi(const uint8x16_t ma3[3]) {
+  const uint16x8_t sum = Sum3WHi16(ma3);
+  const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline uint32x4_t Sum343(const uint32x4_t src[3]) {
+  const uint32x4_t sum = Sum3_32(src);
+  const uint32x4_t sum3 = Sum3_32(sum, sum, sum);
+  return vaddq_u32(sum3, src[1]);
+}
+
+inline void Sum343(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum343(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum343(s);
+}
+
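+// Sum565 weights three adjacent column sums 5, 6, 5: (sum << 2) + sum yields
+// the 5s, and adding the middle element once more yields the 6.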
+inline uint16x8_t Sum565Lo(const uint8x16_t src[3]) {
+  const uint16x8_t sum = Sum3WLo16(src);
+  const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+  const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline uint16x8_t Sum565Hi(const uint8x16_t src[3]) {
+  const uint16x8_t sum = Sum3WHi16(src);
+  const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+  const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline uint32x4_t Sum565(const uint32x4_t src[3]) {
+  const uint32x4_t sum = Sum3_32(src);
+  const uint32x4_t sum4 = vshlq_n_u32(sum, 2);
+  const uint32x4_t sum5 = vaddq_u32(sum4, sum);
+  return vaddq_u32(sum5, src[1]);
+}
+
+inline void Sum565(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum565(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum565(s);
+}
+
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src) * width;
+  int y = 2;
+  do {
+    uint16x8_t s[3];
+    uint32x4_t sq[6];
+    s[0] = Load1QMsanU16(src, overread_in_bytes);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      uint16x8_t row3[2], row5[2];
+      uint32x4_t row_sq3[2], row_sq5[2];
+      s[1] = Load1QMsanU16(
+          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+      x -= 16;
+      src += 16;
+      s[2] = Load1QMsanU16(src,
+                           overread_in_bytes + sizeof(*src) * (sum_width - x));
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned32U16(sum3, row3);
+      StoreAligned32U16(sum5, row5);
+      SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 0, row_sq3);
+      StoreAligned32U32(square_sum5 + 0, row_sq5);
+      SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 8, row_sq3);
+      StoreAligned32U32(square_sum5 + 8, row_sq5);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sum3 += 16;
+      sum5 += 16;
+      square_sum3 += 16;
+      square_sum5 += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sum3 += sum_stride - sum_width;
+    sum5 += sum_stride - sum_width;
+    square_sum3 += sum_stride - sum_width;
+    square_sum5 += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  const ptrdiff_t overread_in_bytes =
+      ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+      sizeof(*src) * width;
+  int y = 2;
+  do {
+    uint16x8_t s[3];
+    uint32x4_t sq[6];
+    s[0] = Load1QMsanU16(src, overread_in_bytes);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      uint16x8_t row[2];
+      uint32x4_t row_sq[4];
+      s[1] = Load1QMsanU16(
+          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+      x -= 16;
+      src += 16;
+      s[2] = Load1QMsanU16(src,
+                           overread_in_bytes + sizeof(*src) * (sum_width - x));
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      if (size == 3) {
+        row[0] = Sum3Horizontal16(s + 0);
+        row[1] = Sum3Horizontal16(s + 1);
+        Sum3Horizontal32(sq + 0, row_sq + 0);
+        Sum3Horizontal32(sq + 2, row_sq + 2);
+      } else {
+        row[0] = Sum5Horizontal16(s + 0);
+        row[1] = Sum5Horizontal16(s + 1);
+        Sum5Horizontal32(sq + 0, row_sq + 0);
+        Sum5Horizontal32(sq + 2, row_sq + 2);
+      }
+      StoreAligned32U16(sums, row);
+      StoreAligned64U32(square_sums, row_sq);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sums += 16;
+      square_sums += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sums += sum_stride - sum_width;
+    square_sums += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int n>
+inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
+                              const uint32_t scale) {
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const uint32x4_t dxd = vmull_u16(sum, sum);
+  const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
+  // Ensure |p| does not underflow by using saturating subtraction.
+  const uint32x4_t p = vqsubq_u32(axn, dxd);
+  const uint32x4_t pxs = vmulq_n_u32(p, scale);
+  // vrshrn_n_u32() (rounding narrowing shift) can shift by at most 16, but
+  // kSgrProjScaleBits is 20, so shift and narrow in two steps.
+  const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
+  return vmovn_u32(shifted);
+}
+
+template <int n>
+inline uint16x8_t CalculateMa(const uint16x8_t sum, const uint32x4_t sum_sq[2],
+                              const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
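+  // For 10-bit data, |sum| is scaled down by bitdepth - 8 = 2 bits and
+  // |sum_sq| by 2 * (bitdepth - 8) = 4 bits below, so the index passed to
+  // the kSgrMaLookup table has the same range as in the 8-bit path.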
+  const uint16x8_t b = vrshrq_n_u16(sum, 2);
+  const uint16x4_t sum_lo = vget_low_u16(b);
+  const uint16x4_t sum_hi = vget_high_u16(b);
+  const uint16x4_t z0 =
+      CalculateMa<n>(sum_lo, vrshrq_n_u32(sum_sq[0], 4), scale);
+  const uint16x4_t z1 =
+      CalculateMa<n>(sum_hi, vrshrq_n_u32(sum_sq[1], 4), scale);
+  return vcombine_u16(z0, z1);
+}
+
+inline void CalculateB5(const uint16x8_t sum, const uint16x8_t ma,
+                        uint32x4_t b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const uint32x4_t m2 = VmullLo16(ma, sum);
+  const uint32x4_t m3 = VmullHi16(ma, sum);
+  const uint32x4_t m0 = vmulq_n_u32(m2, one_over_n_quarter);
+  const uint32x4_t m1 = vmulq_n_u32(m3, one_over_n_quarter);
+  b[0] = vrshrq_n_u32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = vrshrq_n_u32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const uint16x8_t sum, const uint16x8_t ma,
+                        uint32x4_t b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const uint32x4_t m0 = VmullLo16(ma, sum);
+  const uint32x4_t m1 = VmullHi16(ma, sum);
+  const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
+  const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
+  b[0] = vrshrq_n_u32(m2, kSgrProjReciprocalBits);
+  b[1] = vrshrq_n_u32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex3(const uint16x8_t s3[3],
+                                  const uint32x4_t sq3[3][2],
+                                  const uint32_t scale, uint16x8_t* const sum,
+                                  uint16x8_t* const index) {
+  uint32x4_t sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const uint16x8_t s5[5],
+                                  const uint32x4_t sq5[5][2],
+                                  const uint32_t scale, uint16x8_t* const sum,
+                                  uint16x8_t* const index) {
+  uint32x4_t sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const uint16x8_t sum, const uint16x8_t index,
+                               uint8x16_t* const ma, uint32x4_t b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  static_assert(offset == 0 || offset == 8, "");
+
+  const uint8x8_t idx = vqmovn_u16(index);
+  uint8_t temp[8];
+  vst1_u8(temp, idx);
+  // offset == 0 is assumed to be the first call to this function. The value is
+  // duplicated to avoid -Wuninitialized warnings under gcc.
+  if (offset == 0) {
+    *ma = vdupq_n_u8(kSgrMaLookup[temp[0]]);
+  } else {
+    *ma = vsetq_lane_u8(kSgrMaLookup[temp[0]], *ma, offset + 0);
+  }
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[1]], *ma, offset + 1);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[2]], *ma, offset + 2);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[3]], *ma, offset + 3);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[4]], *ma, offset + 4);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[5]], *ma, offset + 5);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[6]], *ma, offset + 6);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[7]], *ma, offset + 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x1023 = 25575.
+  // For the second pass radius is 1. Maximum value is 3x3x1023 = 9207.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 25575 * 164 >> 12 = 261120 (18 bits).
+  // Radius 1: 255 * 9207 * 455 >> 12 = 260801 (18 bits).
+  const uint16x8_t maq =
+      vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma));
+  if (n == 9) {
+    CalculateB3(sum, maq, b);
+  } else {
+    CalculateB5(sum, maq, b);
+  }
+}
+
+inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index,
+                             const int threshold) {
+  const uint8x8_t thresholds = vdup_n_u8(threshold);
+  const uint8x8_t offset = vcgt_u8(index, thresholds);
+  // Adding 255 is equivalent to subtracting 1 for 8-bit data.
+  return vadd_u8(value, offset);
+}
+
+inline uint8x8_t MaLookupAndAdjust(const uint8x8x4_t table0,
+                                   const uint8x8x2_t table1,
+                                   const uint16x8_t index) {
+  const uint8x8_t idx = vqmovn_u16(index);
+  // All elements whose indices are outside the range [0, 47] are set to 0.
+  uint8x8_t val = vtbl4_u8(table0, idx);  // Range [0, 31].
+  // Subtract 32 to shuffle the next index range.
+  const uint8x8_t sub_idx = vsub_u8(idx, vdup_n_u8(32));
+  const uint8x8_t res = vtbl2_u8(table1, sub_idx);  // Range [32, 47].
+  // Use an OR to combine the two shuffle results.
+  val = vorr_u8(val, res);
+
+  // For elements whose indices are larger than 47, the table values change
+  // only rarely as the index increases, so comparison and arithmetic
+  // operations are enough to reconstruct them.
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  val = vmax_u8(val, vdup_n_u8(5));
+  val = AdjustValue(val, idx, 55);   // 55 is the last index whose value is 5.
+  val = AdjustValue(val, idx, 72);   // 72 is the last index whose value is 4.
+  val = AdjustValue(val, idx, 101);  // 101 is the last index whose value is 3.
+  val = AdjustValue(val, idx, 169);  // 169 is the last index whose value is 2.
+  val = AdjustValue(val, idx, 254);  // 254 is the last index whose value is 1.
+  return val;
+}
+
+inline void CalculateIntermediate(const uint16x8_t sum[2],
+                                  const uint16x8_t index[2],
+                                  uint8x16_t* const ma, uint32x4_t b0[2],
+                                  uint32x4_t b1[2]) {
+  // Use table lookup to read elements whose indices are less than 48.
+  // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than
+  // using two uint8x8x3_t vectors.
+  uint8x8x4_t table0;
+  uint8x8x2_t table1;
+  table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8);
+  table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8);
+  table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8);
+  table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8);
+  table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8);
+  table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8);
+  const uint8x8_t ma_lo = MaLookupAndAdjust(table0, table1, index[0]);
+  const uint8x8_t ma_hi = MaLookupAndAdjust(table0, table1, index[1]);
+  *ma = vcombine_u8(ma_lo, ma_hi);
+  // b = ma * sum * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const uint16x8_t maq0 = vmovl_u8(vget_low_u8(*ma));
+  CalculateB3(sum[0], maq0, b0);
+  const uint16x8_t maq1 = vmovl_u8(vget_high_u8(*ma));
+  CalculateB3(sum[1], maq1, b1);
+}
+
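+// Repacks the 16 fresh |ma| values for the caller's sliding window: the low
+// half of |mas| becomes the high half of ma[0], and the high half of |mas|
+// becomes the low half of ma[1] (upper lanes zeroed by the vext).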
+inline void CalculateIntermediate(const uint16x8_t sum[2],
+                                  const uint16x8_t index[2], uint8x16_t ma[2],
+                                  uint32x4_t b[4]) {
+  uint8x16_t mas;
+  CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
+  ma[0] = vcombine_u8(vget_low_u8(ma[0]), vget_low_u8(mas));
+  ma[1] = vextq_u8(mas, vdupq_n_u8(0), 8);
+}
+
+template <int offset>
+inline void CalculateIntermediate5(const uint16x8_t s5[5],
+                                   const uint32x4_t sq5[5][2],
+                                   const uint32_t scale, uint8x16_t* const ma,
+                                   uint32x4_t b[2]) {
+  static_assert(offset == 0 || offset == 8, "");
+  uint16x8_t sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const uint16x8_t s3[3],
+                                   const uint32x4_t sq3[3][2],
+                                   const uint32_t scale, uint8x16_t* const ma,
+                                   uint32x4_t b[2]) {
+  uint16x8_t sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
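+// Pass 2 spreads each 3x3 row sum over output rows with 3-4-3 / 4-4-4
+// weights. With sum_b111 = b[0] + b[1] + b[2], the stores below follow from
+//   sum_b444 = 4 * sum_b111
+//   sum_b343 = 3 * sum_b111 + b[1]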
+inline void Store343_444(const uint32x4_t b3[3], const ptrdiff_t x,
+                         uint32x4_t sum_b343[2], uint32x4_t sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  uint32x4_t b[3], sum_b111[2];
+  Prepare3_32(b3 + 0, b);
+  sum_b111[0] = Sum3_32(b);
+  sum_b444[0] = vshlq_n_u32(sum_b111[0], 2);
+  sum_b343[0] = vsubq_u32(sum_b444[0], sum_b111[0]);
+  sum_b343[0] = vaddq_u32(sum_b343[0], b[1]);
+  Prepare3_32(b3 + 1, b);
+  sum_b111[1] = Sum3_32(b);
+  sum_b444[1] = vshlq_n_u32(sum_b111[1], 2);
+  sum_b343[1] = vsubq_u32(sum_b444[1], sum_b111[1]);
+  sum_b343[1] = vaddq_u32(sum_b343[1], b[1]);
+  StoreAligned32U32(b444 + x, sum_b444);
+  StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint16x8_t* const sum_ma444, uint32x4_t sum_b343[2],
+                           uint32x4_t sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const uint16x8_t sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+  vst1q_u16(ma444 + x, *sum_ma444);
+  const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  vst1q_u16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint16x8_t* const sum_ma444, uint32x4_t sum_b343[2],
+                           uint32x4_t sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const uint16x8_t sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+  vst1q_u16(ma444 + x, *sum_ma444);
+  const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  vst1q_u16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint32x4_t sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma444;
+  uint32x4_t sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint32x4_t sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma444;
+  uint32x4_t sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma343;
+  uint32x4_t sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma343;
+  uint32x4_t sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
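+// The *Lo variants prime the sliding window at x == 0; the unsuffixed
+// variants then emit 16 results per call, reusing the squares kept in |sq|
+// from the previous step.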
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const uint16x8_t s[2][4], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t* const ma,
+    uint32x4_t b[2]) {
+  uint16x8_t s5[2][5];
+  uint32x4_t sq5[5][2];
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  s5[0][3] = Sum5Horizontal16(s[0]);
+  vst1q_u16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal16(s[1]);
+  vst1q_u16(sum5[4], s5[0][4]);
+  Sum5Horizontal32(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5Horizontal32(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint16x8_t s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t ma[2],
+    uint32x4_t b[6]) {
+  uint16x8_t s5[2][5];
+  uint32x4_t sq5[5][2];
+  Square(s[0][2], sq[0] + 4);
+  Square(s[1][2], sq[1] + 4);
+  s5[0][3] = Sum5Horizontal16(s[0] + 1);
+  s5[1][3] = Sum5Horizontal16(s[0] + 2);
+  vst1q_u16(sum5[3] + x + 0, s5[0][3]);
+  vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+  s5[0][4] = Sum5Horizontal16(s[1] + 1);
+  s5[1][4] = Sum5Horizontal16(s[1] + 2);
+  vst1q_u16(sum5[4] + x + 0, s5[0][4]);
+  vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+  Sum5Horizontal32(sq[0] + 2, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  Sum5Horizontal32(sq[1] + 2, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+  Square(s[0][3], sq[0] + 6);
+  Square(s[1][3], sq[1] + 6);
+  Sum5Horizontal32(sq[0] + 4, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  Sum5Horizontal32(sq[1] + 4, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const uint16x8_t s[2], const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], uint32x4_t sq[4],
+    uint8x16_t* const ma, uint32x4_t b[2]) {
+  uint16x8_t s5[5];
+  uint32x4_t sq5[5][2];
+  Square(s[1], sq + 2);
+  s5[3] = s5[4] = Sum5Horizontal16(s);
+  Sum5Horizontal32(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const uint16x8_t s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], uint32x4_t sq[8], uint8x16_t ma[2],
+    uint32x4_t b[6]) {
+  uint16x8_t s5[2][5];
+  uint32x4_t sq5[5][2];
+  Square(s[2], sq + 4);
+  s5[0][3] = Sum5Horizontal16(s + 1);
+  s5[1][3] = Sum5Horizontal16(s + 2);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5Horizontal32(sq + 2, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+  Square(s[3], sq + 6);
+  Sum5Horizontal32(sq + 4, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const uint16x8_t s[2], const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], uint32x4_t sq[4], uint8x16_t* const ma,
+    uint32x4_t b[2]) {
+  uint16x8_t s3[3];
+  uint32x4_t sq3[3][2];
+  Square(s[1], sq + 2);
+  s3[2] = Sum3Horizontal16(s);
+  vst1q_u16(sum3[2], s3[2]);
+  Sum3Horizontal32(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint16x8_t s[4], const ptrdiff_t x, const ptrdiff_t sum_width,
+    const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], uint32x4_t sq[8], uint8x16_t ma[2],
+    uint32x4_t b[6]) {
+  uint16x8_t s3[4], sum[2], index[2];
+  uint32x4_t sq3[3][2];
+
+  Square(s[2], sq + 4);
+  s3[2] = Sum3Horizontal16(s + 1);
+  s3[3] = Sum3Horizontal16(s + 2);
+  StoreAligned32U16(sum3[2] + x, s3 + 2);
+  Sum3Horizontal32(sq + 2, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+  LoadAligned16x2U16(sum3, x, s3);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  Square(s[3], sq + 6);
+  Sum3Horizontal32(sq + 4, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma, b + 2);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const uint16x8_t s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t ma3[2][2],
+    uint32x4_t b3[2][6], uint8x16_t* const ma5, uint32x4_t b5[2]) {
+  uint16x8_t s3[4], s5[5], sum[2], index[2];
+  uint32x4_t sq3[4][2], sq5[5][2];
+
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  SumHorizontal16(s[0], &s3[2], &s5[3]);
+  SumHorizontal16(s[1], &s3[3], &s5[4]);
+  vst1q_u16(sum3[2], s3[2]);
+  vst1q_u16(sum3[3], s3[3]);
+  vst1q_u16(sum5[3], s5[3]);
+  vst1q_u16(sum5[4], s5[4]);
+  SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+  CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+  ma3[1][0] = vextq_u8(ma3[0][0], vdupq_n_u8(0), 8);
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const uint16x8_t s[2][4], const ptrdiff_t x, const uint16_t scales[2],
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint32x4_t sq[2][8], uint8x16_t ma3[2][2],
+    uint32x4_t b3[2][6], uint8x16_t ma5[2], uint32x4_t b5[6]) {
+  uint16x8_t s3[2][4], s5[2][5], sum[2][2], index[2][2];
+  uint32x4_t sq3[4][2], sq5[5][2];
+
+  SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  vst1q_u16(sum3[2] + x + 0, s3[0][2]);
+  vst1q_u16(sum3[2] + x + 8, s3[1][2]);
+  vst1q_u16(sum5[3] + x + 0, s5[0][3]);
+  vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+  SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+  vst1q_u16(sum3[3] + x + 0, s3[0][3]);
+  vst1q_u16(sum3[3] + x + 8, s3[1][3]);
+  vst1q_u16(sum5[4] + x + 0, s5[0][4]);
+  vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+  Square(s[0][2], sq[0] + 4);
+  Square(s[1][2], sq[1] + 4);
+  SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+                        &index[1][0]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2);
+
+  Square(s[0][3], sq[0] + 6);
+  Square(s[1][3], sq[1] + 6);
+  SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+                        &index[1][1]);
+  CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2);
+  CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const uint16x8_t s[2], const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    uint32x4_t sq[4], uint8x16_t* const ma3, uint8x16_t* const ma5,
+    uint32x4_t b3[2], uint32x4_t b5[2]) {
+  uint16x8_t s3[3], s5[5];
+  uint32x4_t sq3[3][2], sq5[5][2];
+
+  Square(s[1], sq + 2);
+  SumHorizontal16(s, &s3[2], &s5[3]);
+  SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const uint16x8_t s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], uint32x4_t sq[8], uint8x16_t ma3[2],
+    uint8x16_t ma5[2], uint32x4_t b3[6], uint32x4_t b5[6]) {
+  uint16x8_t s3[2][3], s5[2][5], sum[2], index[2];
+  uint32x4_t sq3[3][2], sq5[5][2];
+
+  Square(s[2], sq + 4);
+  SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+  Square(s[3], sq + 6);
+  SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma3, b3 + 2);
+}
+
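+// Each row is read in 16-byte vectors, so up to kOverreadInBytesPass1 bytes
+// past the last valid pixel may be touched; |overread_in_bytes| lets
+// Load1QMsanU16 account for that intentional overread under MemorySanitizer.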
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+                                    const uint16_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  uint16x8_t s[2][4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[2][8], bs[6];
+
+  s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    uint8x16_t ma5[3];
+    uint16x8_t ma[2];
+    uint32x4_t b[4];
+
+    s[0][2] = Load1QMsanU16(src0 + x + 16,
+                            overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = Load1QMsanU16(src0 + x + 24,
+                            overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = Load1QMsanU16(src1 + x + 16,
+                            overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = Load1QMsanU16(src1 + x + 24,
+                            overread_in_bytes + sizeof(*src1) * (x + 24));
+
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned32U16(ma565, ma);
+    Sum565(bs + 0, b + 0);
+    Sum565(bs + 2, b + 2);
+    StoreAligned64U32(b565, b);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
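+// |calculate444| distinguishes the very first row, which only seeds the 343
+// buffers, from later rows that also produce the 444 sums needed by the
+// neighboring output row (see the calls in BoxFilterProcessPass2).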
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint16_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass2 - sizeof(*src) * width;
+  uint16x8_t s[4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[8], bs[6];
+
+  s[0] = Load1QMsanU16(src + 0, overread_in_bytes + 0);
+  s[1] = Load1QMsanU16(src + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    s[2] = Load1QMsanU16(src + x + 16,
+                         overread_in_bytes + sizeof(*src) * (x + 16));
+    s[3] = Load1QMsanU16(src + x + 24,
+                         overread_in_bytes + sizeof(*src) * (x + 24));
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    uint8x16_t ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      uint16x8_t ma[2];
+      uint32x4_t b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned32U16(ma343, ma);
+      Sum343(bs + 0, b + 0);
+      Sum343(bs + 2, b + 2);
+      StoreAligned64U32(b343, b);
+    }
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint16_t* const src0, const uint16_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  uint16x8_t s[2][4];
+  uint8x16_t ma3[2][2], ma5[2];
+  uint32x4_t sq[2][8], b3[2][6], b5[6];
+
+  s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], b5);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint32x4_t b[4];
+    uint8x16_t ma3x[3], ma5x[3];
+
+    s[0][2] = Load1QMsanU16(src0 + x + 16,
+                            overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = Load1QMsanU16(src0 + x + 24,
+                            overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = Load1QMsanU16(src1 + x + 16,
+                            overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = Load1QMsanU16(src1 + x + 24,
+                            overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    Sum343(b3[0] + 0, b + 0);
+    Sum343(b3[0] + 2, b + 2);
+    StoreAligned64U32(b343[0] + x, b);
+    Sum565(b5 + 0, b + 0);
+    Sum565(b5 + 2, b + 2);
+    StoreAligned64U32(b565, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned32U16(ma565, ma);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][4];
+    b3[0][1] = b3[0][5];
+    b3[1][0] = b3[1][4];
+    b3[1][1] = b3[1][5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
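+// The 565 and 343/444 weights both total 32 when two row sums are combined
+// (a single 565 row totals 16), so the weighted average reduces to the
+// power-of-two |shift| of 5 or 4 below.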
+template <int shift>
+inline int16x4_t FilterOutput(const uint32x4_t ma_x_src, const uint32x4_t b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const int32x4_t v = vreinterpretq_s32_u32(vsubq_u32(b, ma_x_src));
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return vqrshrn_n_s32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline int16x8_t CalculateFilteredOutput(const uint16x8_t src,
+                                         const uint16x8_t ma,
+                                         const uint32x4_t b[2]) {
+  const uint32x4_t ma_x_src_lo = VmullLo16(ma, src);
+  const uint32x4_t ma_x_src_hi = VmullHi16(ma, src);
+  const int16x4_t dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const int16x4_t dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return vcombine_s16(dst_lo, dst_hi);  // 13 bits
+}
+
+inline int16x8_t CalculateFilteredOutputPass1(const uint16x8_t src,
+                                              const uint16x8_t ma[2],
+                                              const uint32x4_t b[2][2]) {
+  const uint16x8_t ma_sum = vaddq_u16(ma[0], ma[1]);
+  uint32x4_t b_sum[2];
+  b_sum[0] = vaddq_u32(b[0][0], b[1][0]);
+  b_sum[1] = vaddq_u32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline int16x8_t CalculateFilteredOutputPass2(const uint16x8_t src,
+                                              const uint16x8_t ma[3],
+                                              const uint32x4_t b[3][2]) {
+  const uint16x8_t ma_sum = Sum3_16(ma);
+  uint32x4_t b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
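+// Final combination of the per-pass residuals:
+//   dst = src + RightShiftWithRounding(w0 * filter[0] + w2 * filter[1],
+//       kSgrProjRestoreBits + kSgrProjPrecisionBits)
+// with the rounding shift provided by vqrshrn.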
+inline int16x8_t SelfGuidedFinal(const uint16x8_t src, const int32x4_t v[2]) {
+  const int16x4_t v_lo =
+      vqrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x4_t v_hi =
+      vqrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x8_t vv = vcombine_s16(v_lo, v_hi);
+  return vaddq_s16(vreinterpretq_s16_u16(src), vv);
+}
+
+inline int16x8_t SelfGuidedDoubleMultiplier(const uint16x8_t src,
+                                            const int16x8_t filter[2],
+                                            const int w0, const int w2) {
+  int32x4_t v[2];
+  v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
+  v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
+  v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
+  v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
+  return SelfGuidedFinal(src, v);
+}
+
+inline int16x8_t SelfGuidedSingleMultiplier(const uint16x8_t src,
+                                            const int16x8_t filter,
+                                            const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  int32x4_t v[2];
+  v[0] = vmull_n_s16(vget_low_s16(filter), w0);
+  v[1] = vmull_n_s16(vget_high_s16(filter), w0);
+  return SelfGuidedFinal(src, v);
+}
+
+inline void ClipAndStore(uint16_t* const dst, const int16x8_t val) {
+  const uint16x8_t val0 = vreinterpretq_u16_s16(vmaxq_s16(val, vdupq_n_s16(0)));
+  const uint16x8_t val1 = vminq_u16(val0, vdupq_n_u16((1 << kBitdepth10) - 1));
+  vst1q_u16(dst, val1);
+}
+
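+// Pass 1 filters two source rows per call: the first row blends the previous
+// and current 565 sums (total weight 32, shift 5), while the second reuses
+// only the current sums (weight 16, shift 4), halving the preprocessing work.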
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  uint16x8_t s[2][4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[2][8], bs[6];
+
+  s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint32x4_t b[2][2];
+    uint8x16_t ma5[3];
+    int16x8_t p[2];
+
+    s[0][2] = Load1QMsanU16(src0 + x + 16,
+                            overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = Load1QMsanU16(src0 + x + 24,
+                            overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = Load1QMsanU16(src1 + x + 16,
+                            overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = Load1QMsanU16(src1 + x + 24,
+                            overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    vst1q_u16(ma565[1] + x, ma[1]);
+    Sum565(bs, b[1]);
+    StoreAligned32U32(b565[1] + x, b[1]);
+    const uint16x8_t sr0_lo = vld1q_u16(src + x + 0);
+    const uint16x8_t sr1_lo = vld1q_u16(src + stride + x + 0);
+    ma[0] = vld1q_u16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+    const int16x8_t d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+    const int16x8_t d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+    ma[1] = Sum565Hi(ma5);
+    vst1q_u16(ma565[1] + x + 8, ma[1]);
+    Sum565(bs + 2, b[1]);
+    StoreAligned32U32(b565[1] + x + 8, b[1]);
+    const uint16x8_t sr0_hi = vld1q_u16(src + x + 8);
+    const uint16x8_t sr1_hi = vld1q_u16(src + stride + x + 8);
+    ma[0] = vld1q_u16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+    const int16x8_t d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+    ClipAndStore(dst + x + 0, d00);
+    ClipAndStore(dst + x + 8, d01);
+    const int16x8_t d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+    ClipAndStore(dst + stride + x + 0, d10);
+    ClipAndStore(dst + stride + x + 8, d11);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  uint16x8_t s[4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[8], bs[6];
+
+  s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint32x4_t b[2][2];
+    uint8x16_t ma5[3];
+
+    s[2] = Load1QMsanU16(src0 + x + 16,
+                         overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[3] = Load1QMsanU16(src0 + x + 24,
+                         overread_in_bytes + sizeof(*src0) * (x + 24));
+    BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+                                sq, mas, bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    Sum565(bs, b[1]);
+    ma[0] = vld1q_u16(ma565);
+    LoadAligned32U32(b565, b[0]);
+    const uint16x8_t sr_lo = vld1q_u16(src + x + 0);
+    int16x8_t p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    const int16x8_t d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+    ma[1] = Sum565Hi(ma5);
+    Sum565(bs + 2, b[1]);
+    ma[0] = vld1q_u16(ma565 + 8);
+    LoadAligned32U32(b565 + 8, b[0]);
+    const uint16x8_t sr_hi = vld1q_u16(src + x + 8);
+    p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    const int16x8_t d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
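+// Pass 2 advances one row at a time: each iteration stores the 343/444 sums
+// needed by upcoming rows while filtering the current row against the sums
+// produced by earlier calls.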
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass2 - sizeof(*src0) * width;
+  uint16x8_t s[4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[8], bs[6];
+
+  s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    s[2] = Load1QMsanU16(src0 + x + 16,
+                         overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[3] = Load1QMsanU16(src0 + x + 24,
+                         overread_in_bytes + sizeof(*src0) * (x + 24));
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    uint16x8_t ma[3];
+    uint32x4_t b[3][2];
+    uint8x16_t ma3[3];
+
+    Prepare3_8<0>(mas, ma3);
+    Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+                   b444[1]);
+    const uint16x8_t sr_lo = vld1q_u16(src + x + 0);
+    ma[0] = vld1q_u16(ma343[0] + x);
+    ma[1] = vld1q_u16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[0]);
+    LoadAligned32U32(b444[0] + x, b[1]);
+    const int16x8_t p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+    Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    const uint16x8_t sr_hi = vld1q_u16(src + x + 8);
+    ma[0] = vld1q_u16(ma343[0] + x + 8);
+    ma[1] = vld1q_u16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1]);
+    const int16x8_t p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const int16x8_t d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const int16x8_t d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  uint16x8_t s[2][4];
+  uint8x16_t ma3[2][2], ma5[2];
+  uint32x4_t sq[2][8], b3[2][6], b5[6];
+
+  s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], b5);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[3][3];
+    uint32x4_t b[3][3][2];
+    uint8x16_t ma3x[2][3], ma5x[3];
+    int16x8_t p[2][2];
+
+    s[0][2] = Load1QMsanU16(src0 + x + 16,
+                            overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = Load1QMsanU16(src0 + x + 24,
+                            overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = Load1QMsanU16(src1 + x + 16,
+                            overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = Load1QMsanU16(src1 + x + 24,
+                            overread_in_bytes + sizeof(*src1) * (x + 24));
+
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+    Prepare3_8<0>(ma3[0], ma3x[0]);
+    Prepare3_8<0>(ma3[1], ma3x[1]);
+    Prepare3_8<0>(ma5, ma5x);
+    Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+                   ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+                   b343[3], b444[2]);
+    ma[0][1] = Sum565Lo(ma5x);
+    vst1q_u16(ma565[1] + x, ma[0][1]);
+    Sum565(b5, b[0][1]);
+    StoreAligned32U32(b565[1] + x, b[0][1]);
+    const uint16x8_t sr0_lo = vld1q_u16(src + x);
+    const uint16x8_t sr1_lo = vld1q_u16(src + stride + x);
+    ma[0][0] = vld1q_u16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = vld1q_u16(ma343[0] + x);
+    ma[1][1] = vld1q_u16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[1][0]);
+    LoadAligned32U32(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    const int16x8_t d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ma[2][0] = vld1q_u16(ma343[1] + x);
+    LoadAligned32U32(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const int16x8_t d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+    Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+                   b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3],
+                   ma444[2], b343[3], b444[2]);
+    ma[0][1] = Sum565Hi(ma5x);
+    vst1q_u16(ma565[1] + x + 8, ma[0][1]);
+    Sum565(b5 + 2, b[0][1]);
+    StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+    const uint16x8_t sr0_hi = Load1QMsanU16(
+        src + x + 8, overread_in_bytes + 4 + sizeof(*src) * (x + 8));
+    const uint16x8_t sr1_hi = Load1QMsanU16(
+        src + stride + x + 8, overread_in_bytes + 4 + sizeof(*src) * (x + 8));
+    ma[0][0] = vld1q_u16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+    ma[1][0] = vld1q_u16(ma343[0] + x + 8);
+    ma[1][1] = vld1q_u16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+    const int16x8_t d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+    ClipAndStore(dst + x + 0, d00);
+    ClipAndStore(dst + x + 8, d01);
+    ma[2][0] = vld1q_u16(ma343[1] + x + 8);
+    LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+    const int16x8_t d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+    ClipAndStore(dst + stride + x + 0, d10);
+    ClipAndStore(dst + stride + x + 8, d11);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][4];
+    b3[0][1] = b3[0][5];
+    b3[1][0] = b3[1][4];
+    b3[1][1] = b3[1][5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  uint16x8_t s[4];
+  uint8x16_t ma3[2], ma5[2];
+  uint32x4_t sq[8], b3[6], b5[6];
+  uint16x8_t ma[3];
+  uint32x4_t b[3][2];
+
+  s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+                               sq, &ma3[0], &ma5[0], b3, b5);
+
+  int x = 0;
+  do {
+    uint8x16_t ma3x[3], ma5x[3];
+    int16x8_t p[2];
+
+    s[2] = Load1QMsanU16(src0 + x + 16,
+                         overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[3] = Load1QMsanU16(src0 + x + 24,
+                         overread_in_bytes + sizeof(*src0) * (x + 24));
+    BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+                               square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+    Prepare3_8<0>(ma3, ma3x);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[1] = Sum565Lo(ma5x);
+    Sum565(b5, b[1]);
+    ma[2] = Sum343Lo(ma3x);
+    Sum343(b3, b[2]);
+    const uint16x8_t sr_lo = vld1q_u16(src + x + 0);
+    ma[0] = vld1q_u16(ma565 + x);
+    LoadAligned32U32(b565 + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[0] = vld1q_u16(ma343 + x);
+    ma[1] = vld1q_u16(ma444 + x);
+    LoadAligned32U32(b343 + x, b[0]);
+    LoadAligned32U32(b444 + x, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    const int16x8_t d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+    ma[1] = Sum565Hi(ma5x);
+    Sum565(b5 + 2, b[1]);
+    ma[2] = Sum343Hi(ma3x);
+    Sum343(b3 + 2, b[2]);
+    const uint16x8_t sr_hi = Load1QMsanU16(
+        src + x + 8, overread_in_bytes + 4 + sizeof(*src) * (x + 8));
+    ma[0] = vld1q_u16(ma565 + x + 8);
+    LoadAligned32U32(b565 + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    ma[0] = vld1q_u16(ma343 + x + 8);
+    ma[1] = vld1q_u16(ma444 + x + 8);
+    LoadAligned32U32(b343 + x + 8, b[0]);
+    LoadAligned32U32(b444 + x + 8, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const int16x8_t d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    ma3[0] = ma3[1];
+    ma5[0] = ma5[1];
+    b3[0] = b3[4];
+    b3[1] = b3[5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    x += 16;
+  } while (x < width);
+}
+
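+// Drives the combined case where both radii are nonzero. The sum/ma/b arrays
+// are rings of row pointers; the Circulate*Pointers and swap calls rotate
+// them so each 2-row step recomputes only the newly entered rows.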
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint16_t* src,
+    const ptrdiff_t stride, const uint16_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
+                         b343, b444[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+                     b444[0], b565[0], dst);
+  }
+}
+
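+// Radius-2-only variant. |temp_stride| and |sum_stride| are rounded up to
+// multiples of 16 pixels so the StoreAligned*/LoadAligned* helpers always
+// operate on whole, padded rows.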
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
+                          b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint16_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
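+  // Main loop: one row per iteration; the 343/444 intermediates of the most
+  // recent rows are rotated rather than recomputed.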
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 extra pixels are written to
+// |dest| at the end of each row. The overwrite is safe, since those pixels
+// are not part of the visible frame.
+void SelfGuidedFilter_NEON(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* top = static_cast<const uint16_t*>(top_border);
+  const auto* bottom = static_cast<const uint16_t*>(bottom_border);
+  auto* const dst = static_cast<uint16_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->loop_restorations[0] = WienerFilter_NEON;
+  dsp->loop_restorations[1] = SelfGuidedFilter_NEON;
+}
+
+}  // namespace
+
+void LoopRestorationInit10bpp_NEON() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc
new file mode 100644
index 0000000..adb8f36
--- /dev/null
+++ b/src/dsp/arm/loop_restoration_neon.cc
@@ -0,0 +1,2420 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
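+// Shifts right across a pair of vectors: src[0] (low) and src[1] (high) are
+// concatenated and the result is extracted starting |bytes| bytes in.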
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8x2_t src) {
+  return vext_u8(src.val[0], src.val[1], bytes);
+}
+
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8_t src[2]) {
+  return vext_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint8x16_t VshrU128(const uint8x16_t src[2]) {
+  return vextq_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8x2_t src) {
+  return vextq_u16(src.val[0], src.val[1], bytes / 2);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8_t src[2]) {
+  return vextq_u16(src[0], src[1], bytes / 2);
+}
+
+// Wiener
+
+// A local copy of the coefficients must be made to help the compiler see that
+// they have no overlap with other buffers; using the 'const' keyword is not
+// enough. In practice the compiler does not emit the copy, since there are
+// enough registers in this case.
+inline void PopulateWienerCoefficients(
+    const RestorationUnitInfo& restoration_info, const int direction,
+    int16_t filter[4]) {
+  // In order to keep the horizontal pass intermediate values within 16 bits we
+  // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
+  for (int i = 0; i < 4; ++i) {
+    filter[i] = restoration_info.wiener_info.filter[direction][i];
+  }
+  if (direction == WienerInfo::kHorizontal) {
+    filter[3] -= 128;
+  }
+}
+
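+// The Wiener filter taps are symmetric, so the two samples sharing a
+// coefficient are added first and multiplied once.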
+inline int16x8_t WienerHorizontal2(const uint8x8_t s0, const uint8x8_t s1,
+                                   const int16_t filter, const int16x8_t sum) {
+  const int16x8_t ss = vreinterpretq_s16_u16(vaddl_u8(s0, s1));
+  return vmlaq_n_s16(sum, ss, filter);
+}
+
+inline int16x8x2_t WienerHorizontal2(const uint8x16_t s0, const uint8x16_t s1,
+                                     const int16_t filter,
+                                     const int16x8x2_t sum) {
+  int16x8x2_t d;
+  d.val[0] =
+      WienerHorizontal2(vget_low_u8(s0), vget_low_u8(s1), filter, sum.val[0]);
+  d.val[1] =
+      WienerHorizontal2(vget_high_u8(s0), vget_high_u8(s1), filter, sum.val[1]);
+  return d;
+}
+
+inline void WienerHorizontalSum(const uint8x8_t s[3], const int16_t filter[4],
+                                int16x8_t sum, int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
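+  // With kWienerFilterBits == 7 and kInterRoundBitsHorizontal == 3, |offset|
+  // is 1 << 11 == 2048 and |limit| is 8191.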
+  const int16x8_t s_0_2 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[2]));
+  const int16x8_t s_1 = ZeroExtend(s[1]);
+  sum = vmlaq_n_s16(sum, s_0_2, filter[2]);
+  sum = vmlaq_n_s16(sum, s_1, filter[3]);
+  // Calculate the scaled-down offset correction and add it to |sum| here to
+  // keep the signed 16-bit intermediate from overflowing.
+  sum = vrsraq_n_s16(vshlq_n_s16(s_1, 7 - kInterRoundBitsHorizontal), sum,
+                     kInterRoundBitsHorizontal);
+  sum = vmaxq_s16(sum, vdupq_n_s16(-offset));
+  sum = vminq_s16(sum, vdupq_n_s16(limit - offset));
+  vst1q_s16(wiener_buffer, sum);
+}
+
+inline void WienerHorizontalSum(const uint8x16_t src[3],
+                                const int16_t filter[4], int16x8x2_t sum,
+                                int16_t* const wiener_buffer) {
+  uint8x8_t s[3];
+  s[0] = vget_low_u8(src[0]);
+  s[1] = vget_low_u8(src[1]);
+  s[2] = vget_low_u8(src[2]);
+  WienerHorizontalSum(s, filter, sum.val[0], wiener_buffer);
+  s[0] = vget_high_u8(src[0]);
+  s[1] = vget_high_u8(src[1]);
+  s[2] = vget_high_u8(src[2]);
+  WienerHorizontalSum(s, filter, sum.val[1], wiener_buffer + 8);
+}
+
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int16_t filter[4],
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    const uint8_t* src_ptr = src;
+    uint8x16_t s[8];
+    s[0] = vld1q_u8(src_ptr);
+    ptrdiff_t x = width;
+    do {
+      src_ptr += 16;
+      s[7] = vld1q_u8(src_ptr);
+      s[1] = vextq_u8(s[0], s[7], 1);
+      s[2] = vextq_u8(s[0], s[7], 2);
+      s[3] = vextq_u8(s[0], s[7], 3);
+      s[4] = vextq_u8(s[0], s[7], 4);
+      s[5] = vextq_u8(s[0], s[7], 5);
+      s[6] = vextq_u8(s[0], s[7], 6);
+      int16x8x2_t sum;
+      sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+      sum = WienerHorizontal2(s[0], s[6], filter[0], sum);
+      sum = WienerHorizontal2(s[1], s[5], filter[1], sum);
+      WienerHorizontalSum(s + 2, filter, sum, *wiener_buffer);
+      s[0] = s[7];
+      *wiener_buffer += 16;
+      x -= 16;
+    } while (x != 0);
+    src += src_stride;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int16_t filter[4],
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    const uint8_t* src_ptr = src;
+    uint8x16_t s[6];
+    s[0] = vld1q_u8(src_ptr);
+    ptrdiff_t x = width;
+    do {
+      src_ptr += 16;
+      s[5] = vld1q_u8(src_ptr);
+      s[1] = vextq_u8(s[0], s[5], 1);
+      s[2] = vextq_u8(s[0], s[5], 2);
+      s[3] = vextq_u8(s[0], s[5], 3);
+      s[4] = vextq_u8(s[0], s[5], 4);
+      int16x8x2_t sum;
+      sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+      sum = WienerHorizontal2(s[0], s[4], filter[1], sum);
+      WienerHorizontalSum(s + 1, filter, sum, *wiener_buffer);
+      s[0] = s[5];
+      *wiener_buffer += 16;
+      x -= 16;
+    } while (x != 0);
+    src += src_stride;
+  }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int16_t filter[4],
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    const uint8_t* src_ptr = src;
+    uint8x16_t s[3];
+    ptrdiff_t x = width;
+    do {
+      // Slightly faster than using vextq_u8().
+      s[0] = vld1q_u8(src_ptr);
+      s[1] = vld1q_u8(src_ptr + 1);
+      s[2] = vld1q_u8(src_ptr + 2);
+      int16x8x2_t sum;
+      sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+      WienerHorizontalSum(s, filter, sum, *wiener_buffer);
+      src_ptr += 16;
+      *wiener_buffer += 16;
+      x -= 16;
+    } while (x != 0);
+    src += src_stride;
+  }
+}
+
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    const uint8_t* src_ptr = src;
+    ptrdiff_t x = width;
+    do {
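+      // With three leading zero coefficients only the center tap (128) is
+      // non-zero, so the filtered value is s * 128 >> 3 == s << 4.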
+      const uint8x16_t s = vld1q_u8(src_ptr);
+      const uint8x8_t s0 = vget_low_u8(s);
+      const uint8x8_t s1 = vget_high_u8(s);
+      const int16x8_t d0 = vreinterpretq_s16_u16(vshll_n_u8(s0, 4));
+      const int16x8_t d1 = vreinterpretq_s16_u16(vshll_n_u8(s1, 4));
+      vst1q_s16(*wiener_buffer + 0, d0);
+      vst1q_s16(*wiener_buffer + 8, d1);
+      src_ptr += 16;
+      *wiener_buffer += 16;
+      x -= 16;
+    } while (x != 0);
+    src += src_stride;
+  }
+}
+
+inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1,
+                                   const int16_t filter,
+                                   const int32x4x2_t sum) {
+  const int16x8_t a = vaddq_s16(a0, a1);
+  int32x4x2_t d;
+  d.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(a), filter);
+  d.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(a), filter);
+  return d;
+}
+
+inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[4],
+                                const int32x4x2_t sum) {
+  int32x4x2_t d = WienerVertical2(a[0], a[2], filter[2], sum);
+  d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a[1]), filter[3]);
+  d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a[1]), filter[3]);
+  const uint16x4_t sum_lo_16 = vqrshrun_n_s32(d.val[0], 11);
+  const uint16x4_t sum_hi_16 = vqrshrun_n_s32(d.val[1], 11);
+  return vqmovn_u16(vcombine_u16(sum_lo_16, sum_hi_16));
+}
+
+inline uint8x8_t WienerVerticalTap7Kernel(const int16_t* const wiener_buffer,
+                                          const ptrdiff_t wiener_stride,
+                                          const int16_t filter[4],
+                                          int16x8_t a[7]) {
+  int32x4x2_t sum;
+  a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+  a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+  a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+  a[6] = vld1q_s16(wiener_buffer + 6 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[0], a[6], filter[0], sum);
+  sum = WienerVertical2(a[1], a[5], filter[1], sum);
+  a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+  a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+  a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+  return WienerVertical(a + 2, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap7Kernel2(const int16_t* const wiener_buffer,
+                                             const ptrdiff_t wiener_stride,
+                                             const int16_t filter[4]) {
+  int16x8_t a[8];
+  int32x4x2_t sum;
+  uint8x8x2_t d;
+  d.val[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[7] = vld1q_s16(wiener_buffer + 7 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[1], a[7], filter[0], sum);
+  sum = WienerVertical2(a[2], a[6], filter[1], sum);
+  d.val[1] = WienerVertical(a + 3, filter, sum);
+  return d;
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t filter[4], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y != 0; --y) {
+    uint8_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      uint8x8x2_t d[2];
+      d[0] = WienerVerticalTap7Kernel2(wiener_buffer + 0, width, filter);
+      d[1] = WienerVerticalTap7Kernel2(wiener_buffer + 8, width, filter);
+      vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+      vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      int16x8_t a[7];
+      const uint8x8_t d0 =
+          WienerVerticalTap7Kernel(wiener_buffer + 0, width, filter, a);
+      const uint8x8_t d1 =
+          WienerVerticalTap7Kernel(wiener_buffer + 8, width, filter, a);
+      vst1q_u8(dst, vcombine_u8(d0, d1));
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+inline uint8x8_t WienerVerticalTap5Kernel(const int16_t* const wiener_buffer,
+                                          const ptrdiff_t wiener_stride,
+                                          const int16_t filter[4],
+                                          int16x8_t a[5]) {
+  a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+  a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+  a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+  a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+  a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+  int32x4x2_t sum;
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[0], a[4], filter[1], sum);
+  return WienerVertical(a + 1, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap5Kernel2(const int16_t* const wiener_buffer,
+                                             const ptrdiff_t wiener_stride,
+                                             const int16_t filter[4]) {
+  int16x8_t a[6];
+  int32x4x2_t sum;
+  uint8x8x2_t d;
+  d.val[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[1], a[5], filter[1], sum);
+  d.val[1] = WienerVertical(a + 2, filter, sum);
+  return d;
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t filter[4], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y != 0; --y) {
+    uint8_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      uint8x8x2_t d[2];
+      d[0] = WienerVerticalTap5Kernel2(wiener_buffer + 0, width, filter);
+      d[1] = WienerVerticalTap5Kernel2(wiener_buffer + 8, width, filter);
+      vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+      vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      int16x8_t a[5];
+      const uint8x8_t d0 =
+          WienerVerticalTap5Kernel(wiener_buffer + 0, width, filter, a);
+      const uint8x8_t d1 =
+          WienerVerticalTap5Kernel(wiener_buffer + 8, width, filter, a);
+      vst1q_u8(dst, vcombine_u8(d0, d1));
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+inline uint8x8_t WienerVerticalTap3Kernel(const int16_t* const wiener_buffer,
+                                          const ptrdiff_t wiener_stride,
+                                          const int16_t filter[4],
+                                          int16x8_t a[3]) {
+  a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+  a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+  a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+  int32x4x2_t sum;
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  return WienerVertical(a, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap3Kernel2(const int16_t* const wiener_buffer,
+                                             const ptrdiff_t wiener_stride,
+                                             const int16_t filter[4]) {
+  int16x8_t a[4];
+  int32x4x2_t sum;
+  uint8x8x2_t d;
+  d.val[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  d.val[1] = WienerVertical(a + 1, filter, sum);
+  return d;
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t filter[4], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y != 0; --y) {
+    uint8_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      uint8x8x2_t d[2];
+      d[0] = WienerVerticalTap3Kernel2(wiener_buffer + 0, width, filter);
+      d[1] = WienerVerticalTap3Kernel2(wiener_buffer + 8, width, filter);
+      vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+      vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      int16x8_t a[3];
+      const uint8x8_t d0 =
+          WienerVerticalTap3Kernel(wiener_buffer + 0, width, filter, a);
+      const uint8x8_t d1 =
+          WienerVerticalTap3Kernel(wiener_buffer + 8, width, filter, a);
+      vst1q_u8(dst, vcombine_u8(d0, d1));
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
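+// Only the center tap (128) is non-zero here; a * 128 >> 11 (the vertical
+// rounding shift) reduces to a rounded right shift by 4.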
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint8_t* const dst) {
+  const int16x8_t a0 = vld1q_s16(wiener_buffer + 0);
+  const int16x8_t a1 = vld1q_s16(wiener_buffer + 8);
+  const uint8x8_t d0 = vqrshrun_n_s16(a0, 4);
+  const uint8x8_t d1 = vqrshrun_n_s16(a1, 4);
+  vst1q_u8(dst, vcombine_u8(d0, d1));
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint8_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y != 0; --y) {
+    uint8_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer, dst_ptr);
+      WienerVerticalTap1Kernel(wiener_buffer + width, dst_ptr + dst_stride);
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer, dst);
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+// For width 16 and up, store the horizontal results, and then do the vertical
+// filter row by row. This is faster than doing it column by column because of
+// cache behavior.
+void WienerFilter_NEON(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+  int16_t filter_horizontal[(kWienerFilterTaps + 1) / 2];
+  int16_t filter_vertical[(kWienerFilterTaps + 1) / 2];
+  PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal,
+                             filter_horizontal);
+  PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical,
+                             filter_vertical);
+
+  // Horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* const top = static_cast<const uint8_t*>(top_border);
+  const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // Vertical filtering.
+  // Over-writes up to 15 values.
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, the top
+    // and bottom rows of |wiener_buffer| can be duplicated accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 2;
+constexpr int kOverreadInBytesPass2 = 4;
+
+// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kWideOverreadInBytesPass1 = 10;
+constexpr int kWideOverreadInBytesPass2 = 12;
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               uint16x8_t dst[2]) {
+  dst[0] = vld1q_u16(src[0] + x);
+  dst[1] = vld1q_u16(src[1] + x);
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               uint16x8_t dst[3]) {
+  dst[0] = vld1q_u16(src[0] + x);
+  dst[1] = vld1q_u16(src[1] + x);
+  dst[2] = vld1q_u16(src[2] + x);
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, uint32x4x2_t* dst) {
+  (*dst).val[0] = vld1q_u32(src + 0);
+  (*dst).val[1] = vld1q_u32(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               uint32x4x2_t dst[2]) {
+  LoadAligned32U32(src[0] + x, &dst[0]);
+  LoadAligned32U32(src[1] + x, &dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               uint32x4x2_t dst[3]) {
+  LoadAligned32U32(src[0] + x, &dst[0]);
+  LoadAligned32U32(src[1] + x, &dst[1]);
+  LoadAligned32U32(src[2] + x, &dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const uint16x8_t src[2]) {
+  vst1q_u16(dst + 0, src[0]);
+  vst1q_u16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const uint32x4x2_t src) {
+  vst1q_u32(dst + 0, src.val[0]);
+  vst1q_u32(dst + 4, src.val[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const uint32x4x2_t src[2]) {
+  vst1q_u32(dst + 0, src[0].val[0]);
+  vst1q_u32(dst + 4, src[0].val[1]);
+  vst1q_u32(dst + 8, src[1].val[0]);
+  vst1q_u32(dst + 12, src[1].val[1]);
+}
+
+inline uint16x8_t SquareLo8(const uint8x8_t src) { return vmull_u8(src, src); }
+
+inline uint16x8_t SquareLo8(const uint8x16_t src) {
+  return vmull_u8(vget_low_u8(src), vget_low_u8(src));
+}
+
+inline uint16x8_t SquareHi8(const uint8x16_t src) {
+  return vmull_u8(vget_high_u8(src), vget_high_u8(src));
+}
+
+inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) {
+  dst[0] = VshrU128<0>(src);
+  dst[1] = VshrU128<1>(src);
+  dst[2] = VshrU128<2>(src);
+}
+
+template <int offset>
+inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) {
+  dst[0] = VshrU128<offset + 0>(src);
+  dst[1] = VshrU128<offset + 1>(src);
+  dst[2] = VshrU128<offset + 2>(src);
+}
+
+inline void Prepare3_16(const uint16x8_t src[2], uint16x4_t low[3],
+                        uint16x4_t high[3]) {
+  uint16x8_t s[3];
+  s[0] = VshrU128<0>(src);
+  s[1] = VshrU128<2>(src);
+  s[2] = VshrU128<4>(src);
+  low[0] = vget_low_u16(s[0]);
+  low[1] = vget_low_u16(s[1]);
+  low[2] = vget_low_u16(s[2]);
+  high[0] = vget_high_u16(s[0]);
+  high[1] = vget_high_u16(s[1]);
+  high[2] = vget_high_u16(s[2]);
+}
+
+inline void Prepare5_8(const uint8x8_t src[2], uint8x8_t dst[5]) {
+  dst[0] = VshrU128<0>(src);
+  dst[1] = VshrU128<1>(src);
+  dst[2] = VshrU128<2>(src);
+  dst[3] = VshrU128<3>(src);
+  dst[4] = VshrU128<4>(src);
+}
+
+template <int offset>
+inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) {
+  dst[0] = VshrU128<offset + 0>(src);
+  dst[1] = VshrU128<offset + 1>(src);
+  dst[2] = VshrU128<offset + 2>(src);
+  dst[3] = VshrU128<offset + 3>(src);
+  dst[4] = VshrU128<offset + 4>(src);
+}
+
+inline void Prepare5_16(const uint16x8_t src[2], uint16x4_t low[5],
+                        uint16x4_t high[5]) {
+  Prepare3_16(src, low, high);
+  const uint16x8_t s3 = VshrU128<6>(src);
+  const uint16x8_t s4 = VshrU128<8>(src);
+  low[3] = vget_low_u16(s3);
+  low[4] = vget_low_u16(s4);
+  high[3] = vget_high_u16(s3);
+  high[4] = vget_high_u16(s4);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src0, const uint16x8_t src1,
+                          const uint16x8_t src2) {
+  const uint16x8_t sum = vaddq_u16(src0, src1);
+  return vaddq_u16(sum, src2);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src0, const uint32x4_t src1,
+                          const uint32x4_t src2) {
+  const uint32x4_t sum = vaddq_u32(src0, src1);
+  return vaddq_u32(sum, src2);
+}
+
+inline uint32x4x2_t Sum3_32(const uint32x4x2_t src[3]) {
+  uint32x4x2_t d;
+  d.val[0] = Sum3_32(src[0].val[0], src[1].val[0], src[2].val[0]);
+  d.val[1] = Sum3_32(src[0].val[1], src[1].val[1], src[2].val[1]);
+  return d;
+}
+
+inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) {
+  const uint16x8_t sum = vaddl_u8(src[0], src[1]);
+  return vaddw_u8(sum, src[2]);
+}
+
+inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) {
+  const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+  return vaddw_u8(sum, vget_low_u8(src[2]));
+}
+
+inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) {
+  const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+  return vaddw_u8(sum, vget_high_u8(src[2]));
+}
+
+inline uint16x8_t Sum5WLo16(const uint8x16_t src[5]) {
+  const uint16x8_t sum01 = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+  const uint16x8_t sum23 = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[3]));
+  const uint16x8_t sum = vaddq_u16(sum01, sum23);
+  return vaddw_u8(sum, vget_low_u8(src[4]));
+}
+
+inline uint16x8_t Sum5WHi16(const uint8x16_t src[5]) {
+  const uint16x8_t sum01 = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+  const uint16x8_t sum23 = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[3]));
+  const uint16x8_t sum = vaddq_u16(sum01, sum23);
+  return vaddw_u8(sum, vget_high_u8(src[4]));
+}
+
+inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) {
+  const uint32x4_t sum = vaddl_u16(src[0], src[1]);
+  return vaddw_u16(sum, src[2]);
+}
+
+inline uint16x8_t Sum5_16(const uint16x8_t src[5]) {
+  const uint16x8_t sum01 = vaddq_u16(src[0], src[1]);
+  const uint16x8_t sum23 = vaddq_u16(src[2], src[3]);
+  const uint16x8_t sum = vaddq_u16(sum01, sum23);
+  return vaddq_u16(sum, src[4]);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t src0, const uint32x4_t src1,
+                          const uint32x4_t src2, const uint32x4_t src3,
+                          const uint32x4_t src4) {
+  const uint32x4_t sum01 = vaddq_u32(src0, src1);
+  const uint32x4_t sum23 = vaddq_u32(src2, src3);
+  const uint32x4_t sum = vaddq_u32(sum01, sum23);
+  return vaddq_u32(sum, src4);
+}
+
+inline uint32x4x2_t Sum5_32(const uint32x4x2_t src[5]) {
+  uint32x4x2_t d;
+  d.val[0] = Sum5_32(src[0].val[0], src[1].val[0], src[2].val[0], src[3].val[0],
+                     src[4].val[0]);
+  d.val[1] = Sum5_32(src[0].val[1], src[1].val[1], src[2].val[1], src[3].val[1],
+                     src[4].val[1]);
+  return d;
+}
+
+inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) {
+  const uint32x4_t sum01 = vaddl_u16(src[0], src[1]);
+  const uint32x4_t sum23 = vaddl_u16(src[2], src[3]);
+  const uint32x4_t sum0123 = vaddq_u32(sum01, sum23);
+  return vaddw_u16(sum0123, src[4]);
+}
+
+inline uint16x8_t Sum3Horizontal(const uint8x8_t src[2]) {
+  uint8x8_t s[3];
+  Prepare3_8(src, s);
+  return Sum3W_16(s);
+}
+
+inline uint16x8_t Sum3Horizontal(const uint8x16_t src) {
+  uint8x8_t s[2];
+  s[0] = vget_low_u8(src);
+  s[1] = vget_high_u8(src);
+  return Sum3Horizontal(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const uint8x16_t src[2], uint16x8_t dst[2]) {
+  uint8x16_t s[3];
+  Prepare3_8<offset>(src, s);
+  dst[0] = Sum3WLo16(s);
+  dst[1] = Sum3WHi16(s);
+}
+
+inline uint32x4x2_t Sum3WHorizontal(const uint16x8_t src[2]) {
+  uint16x4_t low[3], high[3];
+  uint32x4x2_t sum;
+  Prepare3_16(src, low, high);
+  sum.val[0] = Sum3W_32(low);
+  sum.val[1] = Sum3W_32(high);
+  return sum;
+}
+
+inline uint16x8_t Sum5Horizontal(const uint8x8_t src[2]) {
+  uint8x8_t s[5];
+  Prepare5_8(src, s);
+  const uint16x8_t sum01 = vaddl_u8(s[0], s[1]);
+  const uint16x8_t sum23 = vaddl_u8(s[2], s[3]);
+  const uint16x8_t sum0123 = vaddq_u16(sum01, sum23);
+  return vaddw_u8(sum0123, s[4]);
+}
+
+inline uint16x8_t Sum5Horizontal(const uint8x16_t src) {
+  uint8x8_t s[2];
+  s[0] = vget_low_u8(src);
+  s[1] = vget_high_u8(src);
+  return Sum5Horizontal(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const uint8x16_t src[2], uint16x8_t* const dst0,
+                           uint16x8_t* const dst1) {
+  uint8x16_t s[5];
+  Prepare5_8<offset>(src, s);
+  *dst0 = Sum5WLo16(s);
+  *dst1 = Sum5WHi16(s);
+}
+
+inline uint32x4x2_t Sum5WHorizontal(const uint16x8_t src[2]) {
+  uint16x4_t low[5], high[5];
+  Prepare5_16(src, low, high);
+  uint32x4x2_t sum;
+  sum.val[0] = Sum5W_32(low);
+  sum.val[1] = Sum5W_32(high);
+  return sum;
+}
+
+template <int offset>
+void SumHorizontal(const uint8x16_t src[2], uint16x8_t* const row3_0,
+                   uint16x8_t* const row3_1, uint16x8_t* const row5_0,
+                   uint16x8_t* const row5_1) {
+  uint8x16_t s[5];
+  Prepare5_8<offset>(src, s);
+  const uint16x8_t sum04_lo = vaddl_u8(vget_low_u8(s[0]), vget_low_u8(s[4]));
+  const uint16x8_t sum04_hi = vaddl_u8(vget_high_u8(s[0]), vget_high_u8(s[4]));
+  *row3_0 = Sum3WLo16(s + 1);
+  *row3_1 = Sum3WHi16(s + 1);
+  *row5_0 = vaddq_u16(sum04_lo, *row3_0);
+  *row5_1 = vaddq_u16(sum04_hi, *row3_1);
+}
+
+void SumHorizontal(const uint8x8_t src[2], uint16x8_t* const row3,
+                   uint16x8_t* const row5) {
+  uint8x8_t s[5];
+  Prepare5_8(src, s);
+  const uint16x8_t sum04 = vaddl_u8(s[0], s[4]);
+  const uint16x8_t sum12 = vaddl_u8(s[1], s[2]);
+  *row3 = vaddw_u8(sum12, s[3]);
+  *row5 = vaddq_u16(sum04, *row3);
+}
+
+void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
+                   uint32x4_t* const row_sq5) {
+  const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
+  const uint32x4_t sum12 = vaddl_u16(src[1], src[2]);
+  *row_sq3 = vaddw_u16(sum12, src[3]);
+  *row_sq5 = vaddq_u32(sum04, *row_sq3);
+}
+
+void SumHorizontal(const uint16x8_t sq[2], uint32x4x2_t* const row_sq3,
+                   uint32x4x2_t* const row_sq5) {
+  uint16x4_t low[5], high[5];
+  Prepare5_16(sq, low, high);
+  SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]);
+  SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]);
+}
+
+void SumHorizontal(const uint8x8_t src[2], const uint16x8_t sq[2],
+                   uint16x8_t* const row3, uint16x8_t* const row5,
+                   uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+  SumHorizontal(src, row3, row5);
+  SumHorizontal(sq, row_sq3, row_sq5);
+}
+
+void SumHorizontal(const uint8x16_t src, const uint16x8_t sq[2],
+                   uint16x8_t* const row3, uint16x8_t* const row5,
+                   uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+  uint8x8_t s[2];
+  s[0] = vget_low_u8(src);
+  s[1] = vget_high_u8(src);
+  return SumHorizontal(s, sq, row3, row5, row_sq3, row_sq5);
+}
+
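+// Sum343 computes the 3-4-3 weighted sum 3*a + 4*b + 3*c as 3*(a+b+c) + b.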
+template <int offset>
+inline uint16x8_t Sum343(const uint8x16_t ma3[2]) {
+  const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
+  const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+  return vaddw_u8(sum3,
+                  (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
+}
+
+inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
+  const uint32x4_t sum = Sum3W_32(src);
+  const uint32x4_t sum3 = Sum3_32(sum, sum, sum);
+  return vaddw_u16(sum3, src[1]);
+}
+
+inline uint32x4x2_t Sum343W(const uint16x8_t src[2]) {
+  uint16x4_t low[3], high[3];
+  uint32x4x2_t d;
+  Prepare3_16(src, low, high);
+  d.val[0] = Sum343W(low);
+  d.val[1] = Sum343W(high);
+  return d;
+}
+
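+// Sum565 computes the 5-6-5 weighted sum 5*a + 6*b + 5*c as 5*(a+b+c) + b.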
+template <int offset>
+inline uint16x8_t Sum565(const uint8x16_t ma5[2]) {
+  const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma5) : Sum3WHi16(ma5);
+  const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+  const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+  return vaddw_u8(sum5,
+                  (offset == 0) ? vget_low_u8(ma5[1]) : vget_high_u8(ma5[1]));
+}
+
+inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
+  const uint32x4_t sum = Sum3W_32(src);
+  const uint32x4_t sum4 = vshlq_n_u32(sum, 2);
+  const uint32x4_t sum5 = vaddq_u32(sum4, sum);
+  return vaddw_u16(sum5, src[1]);
+}
+
+inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) {
+  uint16x4_t low[3], high[3];
+  uint32x4x2_t d;
+  Prepare3_16(src, low, high);
+  d.val[0] = Sum565W(low);
+  d.val[1] = Sum565W(high);
+  return d;
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  const ptrdiff_t overread_in_bytes = kOverreadInBytesPass1 - width;
+  int y = 2;
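+  // Only two rows are summed per call; later rows are accumulated one at a
+  // time by the BoxFilterPreProcess* functions.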
+  // Don't change the loop width to 16; it is even slower.
+  do {
+    uint8x8_t s[2];
+    uint16x8_t sq[2];
+    s[0] = Load1MsanU8(src, overread_in_bytes);
+    sq[0] = SquareLo8(s[0]);
+    ptrdiff_t x = sum_width;
+    do {
+      uint16x8_t row3, row5;
+      uint32x4x2_t row_sq3, row_sq5;
+      x -= 8;
+      src += 8;
+      s[1] = Load1MsanU8(src, sum_width - x + overread_in_bytes);
+      sq[1] = SquareLo8(s[1]);
+      SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
+      vst1q_u16(sum3, row3);
+      vst1q_u16(sum5, row5);
+      StoreAligned32U32(square_sum3 + 0, row_sq3);
+      StoreAligned32U32(square_sum5 + 0, row_sq5);
+      s[0] = s[1];
+      sq[0] = sq[1];
+      sum3 += 8;
+      sum5 += 8;
+      square_sum3 += 8;
+      square_sum5 += 8;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sum3 += sum_stride - sum_width;
+    sum5 += sum_stride - sum_width;
+    square_sum3 += sum_stride - sum_width;
+    square_sum5 += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  const ptrdiff_t overread_in_bytes =
+      ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+      sizeof(*src) * width;
+  int y = 2;
+  // Don't change the loop width to 16; it is even slower.
+  do {
+    uint8x8_t s[2];
+    uint16x8_t sq[2];
+    s[0] = Load1MsanU8(src, overread_in_bytes);
+    sq[0] = SquareLo8(s[0]);
+    ptrdiff_t x = sum_width;
+    do {
+      uint16x8_t row;
+      uint32x4x2_t row_sq;
+      x -= 8;
+      src += 8;
+      s[1] = Load1MsanU8(src, sum_width - x + overread_in_bytes);
+      sq[1] = SquareLo8(s[1]);
+      if (size == 3) {
+        row = Sum3Horizontal(s);
+        row_sq = Sum3WHorizontal(sq);
+      } else {
+        row = Sum5Horizontal(s);
+        row_sq = Sum5WHorizontal(sq);
+      }
+      vst1q_u16(sums, row);
+      StoreAligned32U32(square_sums, row_sq);
+      s[0] = s[1];
+      sq[0] = sq[1];
+      sums += 8;
+      square_sums += 8;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sums += sum_stride - sum_width;
+    square_sums += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
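+// Computes z = RightShiftWithRounding(p * scale, kSgrProjScaleBits), where
+// p = max(0, sum_sq * n - sum * sum). z, saturated to 8 bits by the caller,
+// then indexes kSgrMaLookup.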
+template <int n>
+inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
+                              const uint32_t scale) {
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const uint32x4_t dxd = vmull_u16(sum, sum);
+  const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
+  // Ensure |p| does not underflow by using saturating subtraction.
+  const uint32x4_t p = vqsubq_u32(axn, dxd);
+  const uint32x4_t pxs = vmulq_n_u32(p, scale);
+  // vrshrn_n_u32() (narrowing shift) can only shift by at most 16, while
+  // kSgrProjScaleBits is 20, so shift and narrow in separate steps.
+  const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
+  return vmovn_u32(shifted);
+}
+
+inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index,
+                             const int threshold) {
+  const uint8x8_t thresholds = vdup_n_u8(threshold);
+  const uint8x8_t offset = vcgt_u8(index, thresholds);
+  // Adding 255 is equivalent to subtracting 1 for 8-bit data.
+  return vadd_u8(value, offset);
+}
+
+template <int n, int offset>
+inline void CalculateIntermediate(const uint16x8_t sum,
+                                  const uint32x4x2_t sum_sq,
+                                  const uint32_t scale, uint8x16_t* const ma,
+                                  uint16x8_t* const b) {
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+  const uint16x4_t z0 = CalculateMa<n>(vget_low_u16(sum), sum_sq.val[0], scale);
+  const uint16x4_t z1 =
+      CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
+  const uint16x8_t z01 = vcombine_u16(z0, z1);
+  const uint8x8_t idx = vqmovn_u16(z01);
+  // Use table lookup to read elements whose indices are less than 48.
+  // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than
+  // using two uint8x8x3_t vectors.
+  uint8x8x4_t table0;
+  uint8x8x2_t table1;
+  table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8);
+  table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8);
+  table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8);
+  table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8);
+  table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8);
+  table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8);
+  // All elements whose indices are out of range [0, 47] are set to 0.
+  uint8x8_t val = vtbl4_u8(table0, idx);  // Range [0, 31].
+  // Subtract 32 to shuffle the next index range.
+  const uint8x8_t index = vsub_u8(idx, vdup_n_u8(32));
+  const uint8x8_t res = vtbl2_u8(table1, index);  // Range [32, 47].
+  // Use OR instruction to combine shuffle results together.
+  val = vorr_u8(val, res);
+
+  // For elements whose indices are larger than 47, the value changes only
+  // rarely as the index increases, so comparison and arithmetic operations
+  // are used to calculate it.
+  // Elements whose indices are larger than 47 (and thus still 0) are set to 5.
+  val = vmax_u8(val, vdup_n_u8(5));
+  val = AdjustValue(val, idx, 55);   // 55 is the last index whose value is 5.
+  val = AdjustValue(val, idx, 72);   // 72 is the last index whose value is 4.
+  val = AdjustValue(val, idx, 101);  // 101 is the last index whose value is 3.
+  val = AdjustValue(val, idx, 169);  // 169 is the last index whose value is 2.
+  val = AdjustValue(val, idx, 254);  // 254 is the last index whose value is 1.
+  // offset == 0 is assumed to be the first call to this function. Note that
+  // vget_high_u8(*ma) is not used in that case, to avoid a -Wuninitialized
+  // warning with some versions of gcc. vdup_n_u8(0) would work as well, but
+  // in most cases clang and gcc generate better code with this version.
+  *ma = (offset == 0) ? vcombine_u8(val, val)
+                      : vcombine_u8(vget_low_u8(*ma), val);
+
+  // b = ma * sum * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const uint16x8_t maq =
+      vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma));
+  const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum));
+  const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum));
+  const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
+  const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
+  const uint16x4_t b_lo = vrshrn_n_u32(m2, kSgrProjReciprocalBits);
+  const uint16x4_t b_hi = vrshrn_n_u32(m3, kSgrProjReciprocalBits);
+  *b = vcombine_u16(b_lo, b_hi);
+}
+
+template <int offset>
+inline void CalculateIntermediate5(const uint16x8_t s5[5],
+                                   const uint32x4x2_t sq5[5],
+                                   const uint32_t scale, uint8x16_t* const ma,
+                                   uint16x8_t* const b) {
+  const uint16x8_t sum = Sum5_16(s5);
+  const uint32x4x2_t sum_sq = Sum5_32(sq5);
+  CalculateIntermediate<25, offset>(sum, sum_sq, scale, ma, b);
+}
+
+template <int offset>
+inline void CalculateIntermediate3(const uint16x8_t s3[3],
+                                   const uint32x4x2_t sq3[3],
+                                   const uint32_t scale, uint8x16_t* const ma,
+                                   uint16x8_t* const b) {
+  const uint16x8_t sum = Sum3_16(s3);
+  const uint32x4x2_t sum_sq = Sum3_32(sq3);
+  CalculateIntermediate<9, offset>(sum, sum_sq, scale, ma, b);
+}
+
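+// Computes the "444" (4*(a+b+c)) and "343" (3*a+4*b+3*c) weighted sums of
+// |ma3| and |b3| and stores them for the cross-row blending of Pass 2.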
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
+                         const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                         uint16x8_t* const sum_ma444,
+                         uint32x4x2_t* const sum_b343,
+                         uint32x4x2_t* const sum_b444, uint16_t* const ma343,
+                         uint16_t* const ma444, uint32_t* const b343,
+                         uint32_t* const b444) {
+  const uint16x8_t sum_ma111 = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
+  *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+  const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+  *sum_ma343 = vaddw_u8(
+      sum333, (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
+  uint16x4_t low[3], high[3];
+  uint32x4x2_t sum_b111;
+  Prepare3_16(b3, low, high);
+  sum_b111.val[0] = Sum3W_32(low);
+  sum_b111.val[1] = Sum3W_32(high);
+  sum_b444->val[0] = vshlq_n_u32(sum_b111.val[0], 2);
+  sum_b444->val[1] = vshlq_n_u32(sum_b111.val[1], 2);
+  sum_b343->val[0] = vsubq_u32(sum_b444->val[0], sum_b111.val[0]);
+  sum_b343->val[1] = vsubq_u32(sum_b444->val[1], sum_b111.val[1]);
+  sum_b343->val[0] = vaddw_u16(sum_b343->val[0], low[1]);
+  sum_b343->val[1] = vaddw_u16(sum_b343->val[1], high[1]);
+  vst1q_u16(ma343 + x, *sum_ma343);
+  vst1q_u16(ma444 + x, *sum_ma444);
+  vst1q_u32(b343 + x + 0, sum_b343->val[0]);
+  vst1q_u32(b343 + x + 4, sum_b343->val[1]);
+  vst1q_u32(b444 + x + 0, sum_b444->val[0]);
+  vst1q_u32(b444 + x + 4, sum_b444->val[1]);
+}
+
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
+                         const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                         uint32x4x2_t* const sum_b343, uint16_t* const ma343,
+                         uint16_t* const ma444, uint32_t* const b343,
+                         uint32_t* const b444) {
+  uint16x8_t sum_ma444;
+  uint32x4x2_t sum_b444;
+  Store343_444<offset>(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444,
+                       ma343, ma444, b343, b444);
+}
+
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
+                         const ptrdiff_t x, uint16_t* const ma343,
+                         uint16_t* const ma444, uint32_t* const b343,
+                         uint32_t* const b444) {
+  uint16x8_t sum_ma343;
+  uint32x4x2_t sum_b343;
+  Store343_444<offset>(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343,
+                       b444);
+}
+
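+// Computes the 5x5 box sums of the first vector of the two incoming rows and
+// derives the initial |ma|/|b| intermediates for the row pair.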
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    uint8x16_t s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t* const ma,
+    uint16x8_t* const b) {
+  uint16x8_t s5[5];
+  uint32x4x2_t sq5[5];
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  sq[0][1] = SquareHi8(s[0][0]);
+  sq[1][1] = SquareHi8(s[1][0]);
+  s5[3] = Sum5Horizontal(s[0][0]);
+  s5[4] = Sum5Horizontal(s[1][0]);
+  sq5[3] = Sum5WHorizontal(sq[0]);
+  sq5[4] = Sum5WHorizontal(sq[1]);
+  vst1q_u16(sum5[3], s5[3]);
+  vst1q_u16(sum5[4], s5[4]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    uint8x16_t s[2][2], const ptrdiff_t x, const uint32_t scale,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5],
+    uint16x8_t sq[2][4], uint8x16_t ma[2], uint16x8_t b[2]) {
+  uint16x8_t s5[2][5];
+  uint32x4x2_t sq5[5];
+  sq[0][2] = SquareLo8(s[0][1]);
+  sq[1][2] = SquareLo8(s[1][1]);
+  Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+  Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+  sq5[3] = Sum5WHorizontal(sq[0] + 1);
+  sq5[4] = Sum5WHorizontal(sq[1] + 1);
+  vst1q_u16(sum5[3] + x, s5[0][3]);
+  vst1q_u16(sum5[4] + x, s5[0][4]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+  sq[0][3] = SquareHi8(s[0][1]);
+  sq[1][3] = SquareHi8(s[1][1]);
+  sq5[3] = Sum5WHorizontal(sq[0] + 2);
+  sq5[4] = Sum5WHorizontal(sq[1] + 2);
+  vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+  vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x3U16(sum5, x + 8, s5[1]);
+  LoadAligned32x3U32(square_sum5, x + 8, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    uint8x16_t* const s, const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], uint16x8_t sq[2],
+    uint8x16_t* const ma, uint16x8_t* const b) {
+  uint16x8_t s5[5];
+  uint32x4x2_t sq5[5];
+  sq[0] = SquareLo8(s[0]);
+  sq[1] = SquareHi8(s[0]);
+  s5[3] = s5[4] = Sum5Horizontal(*s);
+  sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    uint8x16_t s[2], const ptrdiff_t x, const uint32_t scale,
+    const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+    uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) {
+  uint16x8_t s5[2][5];
+  uint32x4x2_t sq5[5];
+  sq[1] = SquareLo8(s[1]);
+  Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+  sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+  sq[2] = SquareHi8(s[1]);
+  sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1);
+  LoadAligned16x3U16(sum5, x + 8, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned32x3U32(square_sum5, x + 8, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    uint8x16_t* const s, const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], uint16x8_t sq[2], uint8x16_t* const ma,
+    uint16x8_t* const b) {
+  uint16x8_t s3[3];
+  uint32x4x2_t sq3[3];
+  sq[0] = SquareLo8(*s);
+  sq[1] = SquareHi8(*s);
+  s3[2] = Sum3Horizontal(*s);
+  sq3[2] = Sum3WHorizontal(sq);
+  vst1q_u16(sum3[2], s3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3<0>(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    uint8x16_t s[2], const ptrdiff_t x, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[3],
+    uint8x16_t ma[2], uint16x8_t b[2]) {
+  uint16x8_t s3[4];
+  uint32x4x2_t sq3[3];
+  sq[1] = SquareLo8(s[1]);
+  Sum3Horizontal<8>(s, s3 + 2);
+  sq3[2] = Sum3WHorizontal(sq);
+  vst1q_u16(sum3[2] + x, s3[2]);
+  StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+  LoadAligned16x2U16(sum3, x, s3);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateIntermediate3<8>(s3, sq3, scale, &ma[0], &b[0]);
+
+  sq[2] = SquareHi8(s[1]);
+  sq3[2] = Sum3WHorizontal(sq + 1);
+  vst1q_u16(sum3[2] + x + 8, s3[3]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  LoadAligned16x2U16(sum3, x + 8, s3 + 1);
+  LoadAligned32x2U32(square_sum3, x + 8, sq3);
+  CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    uint8x16_t s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+    uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) {
+  uint16x8_t s3[4], s5[5];
+  uint32x4x2_t sq3[4], sq5[5];
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  sq[0][1] = SquareHi8(s[0][0]);
+  sq[1][1] = SquareHi8(s[1][0]);
+  SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
+  SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
+  vst1q_u16(sum3[2], s3[2]);
+  vst1q_u16(sum3[3], s3[3]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  vst1q_u16(sum5[3], s5[3]);
+  vst1q_u16(sum5[4], s5[4]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]);
+  CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]);
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const uint8x16_t s[2][2], const ptrdiff_t x, const uint16_t scales[2],
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16x8_t sq[2][4], uint8x16_t ma3[2][2], uint16x8_t b3[2][3],
+    uint8x16_t ma5[2], uint16x8_t b5[2]) {
+  uint16x8_t s3[2][4], s5[2][5];
+  uint32x4x2_t sq3[4], sq5[5];
+  sq[0][2] = SquareLo8(s[0][1]);
+  sq[1][2] = SquareLo8(s[1][1]);
+  SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+  SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]);
+  SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]);
+  vst1q_u16(sum3[2] + x, s3[0][2]);
+  vst1q_u16(sum3[3] + x, s3[0][3]);
+  StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+  StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+  vst1q_u16(sum5[3] + x, s5[0][3]);
+  vst1q_u16(sum5[4] + x, s5[0][4]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]);
+  CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0],
+                            &b3[1][1]);
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+
+  sq[0][3] = SquareHi8(s[0][1]);
+  sq[1][3] = SquareHi8(s[1][1]);
+  SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]);
+  SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]);
+  vst1q_u16(sum3[2] + x + 8, s3[1][2]);
+  vst1q_u16(sum3[3] + x + 8, s3[1][3]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+  vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+  vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x2U16(sum3, x + 8, s3[1]);
+  LoadAligned32x2U32(square_sum3, x + 8, sq3);
+  LoadAligned16x3U16(sum5, x + 8, s5[1]);
+  LoadAligned32x3U32(square_sum5, x + 8, sq5);
+  CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[0][1], &b3[0][2]);
+  CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1],
+                            &b3[1][2]);
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    uint8x16_t* const s, const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    uint16x8_t sq[2], uint8x16_t* const ma3, uint8x16_t* const ma5,
+    uint16x8_t* const b3, uint16x8_t* const b5) {
+  uint16x8_t s3[3], s5[5];
+  uint32x4x2_t sq3[3], sq5[5];
+  sq[0] = SquareLo8(s[0]);
+  sq[1] = SquareHi8(s[0]);
+  SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4] = sq5[3];
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    uint8x16_t s[2], const ptrdiff_t x, const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2], uint16x8_t b3[2],
+    uint16x8_t b5[2]) {
+  uint16x8_t s3[2][3], s5[2][5];
+  uint32x4x2_t sq3[3], sq5[5];
+  sq[1] = SquareLo8(s[1]);
+  SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal(sq, &sq3[2], &sq5[3]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  sq5[4] = sq5[3];
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]);
+
+  sq[2] = SquareHi8(s[1]);
+  SumHorizontal(sq + 1, &sq3[2], &sq5[3]);
+  LoadAligned16x3U16(sum5, x + 8, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned32x3U32(square_sum5, x + 8, sq5);
+  sq5[4] = sq5[3];
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+  LoadAligned16x2U16(sum3, x + 8, s3[1]);
+  LoadAligned32x2U32(square_sum3, x + 8, sq3);
+  CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]);
+}
+
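+// Seeds the |ma565| and |b565| buffers from the first two input rows (or the
+// single row plus the bottom border when height == 1) before BoxFilterPass1
+// starts producing output.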
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+                                    const uint8_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    uint16_t* ma565, uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
+  uint8x16_t s[2][2], mas[2];
+  uint16x8_t sq[2][4], bs[3];
+  s[0][0] = vld1q_u8(src0);
+  s[1][0] = vld1q_u8(src1);
+
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint8x16_t masx[3];
+    uint32x4x2_t b[2];
+    s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+    s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+    BoxFilterPreProcess5(s, x + 8, scale, sum5, square_sum5, sq, mas, bs + 1);
+    Prepare3_8<0>(mas, masx);
+    ma[0] = Sum565<0>(masx);
+    b[0] = Sum565W(bs);
+    vst1q_u16(ma565, ma[0]);
+    vst1q_u32(b565 + 0, b[0].val[0]);
+    vst1q_u32(b565 + 4, b[0].val[1]);
+
+    ma[1] = Sum565<8>(masx);
+    b[1] = Sum565W(bs + 1);
+    vst1q_u16(ma565 + 8, ma[1]);
+    vst1q_u32(b565 + 8, b[1].val[0]);
+    vst1q_u32(b565 + 12, b[1].val[1]);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint8_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
+    uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
+  const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width;
+  uint8x16_t s[2], mas[2];
+  uint16x8_t sq[4], bs[3];
+  s[0] = Load1QMsanU8(src, overread_in_bytes);
+  BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    uint8x16_t ma3x[3];
+    s[1] = Load1QMsanU8(src + x + 16, x + 16 + overread_in_bytes);
+    BoxFilterPreProcess3(s, x + 8, scale, sum3, square_sum3, sq + 1, mas,
+                         bs + 1);
+    Prepare3_8<0>(mas, ma3x);
+    if (calculate444) {
+      Store343_444<0>(ma3x, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444<8>(ma3x, bs + 1, 0, ma343 + 8, ma444 + 8, b343 + 8,
+                      b444 + 8);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      uint16x8_t ma[2];
+      uint32x4x2_t b[2];
+      ma[0] = Sum343<0>(ma3x);
+      b[0] = Sum343W(bs);
+      vst1q_u16(ma343, ma[0]);
+      vst1q_u32(b343 + 0, b[0].val[0]);
+      vst1q_u32(b343 + 4, b[0].val[1]);
+      ma[1] = Sum343<8>(ma3x);
+      b[1] = Sum343W(bs + 1);
+      vst1q_u16(ma343 + 8, ma[1]);
+      vst1q_u32(b343 + 8, b[1].val[0]);
+      vst1q_u32(b343 + 12, b[1].val[1]);
+    }
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565,
+    uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
+  uint8x16_t s[2][2], ma3[2][2], ma5[2];
+  uint16x8_t sq[2][4], b3[2][3], b5[3];
+  s[0][0] = vld1q_u8(src0);
+  s[1][0] = vld1q_u8(src1);
+
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], &b5[0]);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint8x16_t ma3x[3], ma5x[3];
+    uint32x4x2_t b[2];
+
+    s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+    s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sq, ma3, b3, ma5, b5 + 1);
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343<0>(ma3x);
+    ma[1] = Sum343<8>(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    b[0] = Sum343W(b3[0] + 0);
+    b[1] = Sum343W(b3[0] + 1);
+    StoreAligned64U32(b343[0] + x, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565<0>(ma5x);
+    ma[1] = Sum565<8>(ma5x);
+    StoreAligned32U16(ma565, ma);
+    b[0] = Sum565W(b5);
+    b[1] = Sum565W(b5 + 1);
+    StoreAligned64U32(b565, b);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    ma5[0] = ma5[1];
+    b5[0] = b5[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
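+// Computes one 4-lane slice of the box filter output, leaving
+// kSgrProjRestoreBits of fractional precision in place:
+//   p = RightShiftWithRounding(b - ma * src,
+//                              kSgrProjSgrBits + shift - kSgrProjRestoreBits)
+// SelfGuidedFinal() removes the remaining fractional bits after weighting.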
+template <int shift>
+inline int16x4_t FilterOutput(const uint16x4_t src, const uint16x4_t ma,
+                              const uint32x4_t b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const int32x4_t v = vreinterpretq_s32_u32(vmlsl_u16(b, ma, src));
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return vrshrn_n_s32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline int16x8_t CalculateFilteredOutput(const uint8x8_t src,
+                                         const uint16x8_t ma,
+                                         const uint32x4x2_t b) {
+  const uint16x8_t src_u16 = vmovl_u8(src);
+  const int16x4_t dst_lo =
+      FilterOutput<shift>(vget_low_u16(src_u16), vget_low_u16(ma), b.val[0]);
+  const int16x4_t dst_hi =
+      FilterOutput<shift>(vget_high_u16(src_u16), vget_high_u16(ma), b.val[1]);
+  return vcombine_s16(dst_lo, dst_hi);  // 13 bits
+}
+
+inline int16x8_t CalculateFilteredOutputPass1(const uint8x8_t s,
+                                              uint16x8_t ma[2],
+                                              uint32x4x2_t b[2]) {
+  const uint16x8_t ma_sum = vaddq_u16(ma[0], ma[1]);
+  uint32x4x2_t b_sum;
+  b_sum.val[0] = vaddq_u32(b[0].val[0], b[1].val[0]);
+  b_sum.val[1] = vaddq_u32(b[0].val[1], b[1].val[1]);
+  return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
+inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s,
+                                              uint16x8_t ma[3],
+                                              uint32x4x2_t b[3]) {
+  const uint16x8_t ma_sum = Sum3_16(ma);
+  const uint32x4x2_t b_sum = Sum3_32(b);
+  return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
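+// Rounds away the fractional bits of the weighted filter correction |v| and
+// adds the correction onto the source pixel with saturation:
+//   dst = Clip3(src + RightShiftWithRounding(
+//                         v, kSgrProjRestoreBits + kSgrProjPrecisionBits),
+//               0, 255)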
+inline uint8x8_t SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2]) {
+  const int16x4_t v_lo =
+      vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x4_t v_hi =
+      vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x8_t vv = vcombine_s16(v_lo, v_hi);
+  const int16x8_t d =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vv), src));
+  return vqmovun_s16(d);
+}
+
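+// Weights the pass 1 and pass 2 corrections by w0 and w2. Because each
+// correction is taken relative to |src|, adding the weighted sum back onto
+// the source in SelfGuidedFinal() effectively applies the remaining weight
+// w1 = (1 << kSgrProjPrecisionBits) - w0 - w2 to the source pixel itself.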
+inline uint8x8_t SelfGuidedDoubleMultiplier(const uint8x8_t src,
+                                            const int16x8_t filter[2],
+                                            const int w0, const int w2) {
+  int32x4_t v[2];
+  v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
+  v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
+  v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
+  v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
+  return SelfGuidedFinal(src, v);
+}
+
+inline uint8x8_t SelfGuidedSingleMultiplier(const uint8x8_t src,
+                                            const int16x8_t filter,
+                                            const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  int32x4_t v[2];
+  v[0] = vmull_n_s16(vget_low_s16(filter), w0);
+  v[1] = vmull_n_s16(vget_high_s16(filter), w0);
+  return SelfGuidedFinal(src, v);
+}
+
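+// Pass 1 (5x5 window) produces two output rows per iteration. The even row
+// blends the 565-weighted sums of the previous and current row pairs
+// (CalculateFilteredOutputPass1, shift 5), while the odd row reuses the
+// current pair alone at double weight (CalculateFilteredOutput<4>).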
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const uint32_t scale,
+    const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2],
+    uint8_t* const dst) {
+  const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
+  uint8x16_t s[2][2], mas[2];
+  uint16x8_t sq[2][4], bs[3];
+  s[0][0] = Load1QMsanU8(src0, overread_in_bytes);
+  s[1][0] = Load1QMsanU8(src1, overread_in_bytes);
+
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint8x16_t masx[3];
+    uint32x4x2_t b[2];
+    int16x8_t p0, p1;
+    s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+    s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+    BoxFilterPreProcess5(s, x + 8, scale, sum5, square_sum5, sq, mas, bs + 1);
+    Prepare3_8<0>(mas, masx);
+    ma[1] = Sum565<0>(masx);
+    b[1] = Sum565W(bs);
+    vst1q_u16(ma565[1] + x, ma[1]);
+    vst1q_u32(b565[1] + x + 0, b[1].val[0]);
+    vst1q_u32(b565[1] + x + 4, b[1].val[1]);
+    const uint8x16_t sr0 = vld1q_u8(src + x);
+    const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+    const uint8x8_t sr00 = vget_low_u8(sr0);
+    const uint8x8_t sr10 = vget_low_u8(sr1);
+    ma[0] = vld1q_u16(ma565[0] + x);
+    b[0].val[0] = vld1q_u32(b565[0] + x + 0);
+    b[0].val[1] = vld1q_u32(b565[0] + x + 4);
+    p0 = CalculateFilteredOutputPass1(sr00, ma, b);
+    p1 = CalculateFilteredOutput<4>(sr10, ma[1], b[1]);
+    const uint8x8_t d00 = SelfGuidedSingleMultiplier(sr00, p0, w0);
+    const uint8x8_t d10 = SelfGuidedSingleMultiplier(sr10, p1, w0);
+
+    ma[1] = Sum565<8>(masx);
+    b[1] = Sum565W(bs + 1);
+    vst1q_u16(ma565[1] + x + 8, ma[1]);
+    vst1q_u32(b565[1] + x + 8, b[1].val[0]);
+    vst1q_u32(b565[1] + x + 12, b[1].val[1]);
+    const uint8x8_t sr01 = vget_high_u8(sr0);
+    const uint8x8_t sr11 = vget_high_u8(sr1);
+    ma[0] = vld1q_u16(ma565[0] + x + 8);
+    b[0].val[0] = vld1q_u32(b565[0] + x + 8);
+    b[0].val[1] = vld1q_u32(b565[0] + x + 12);
+    p0 = CalculateFilteredOutputPass1(sr01, ma, b);
+    p1 = CalculateFilteredOutput<4>(sr11, ma[1], b[1]);
+    const uint8x8_t d01 = SelfGuidedSingleMultiplier(sr01, p0, w0);
+    const uint8x8_t d11 = SelfGuidedSingleMultiplier(sr11, p1, w0);
+    vst1q_u8(dst + x, vcombine_u8(d00, d01));
+    vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(const uint8_t* const src,
+                                  const uint8_t* const src0, const int width,
+                                  const uint32_t scale, const int16_t w0,
+                                  uint16_t* const sum5[5],
+                                  uint32_t* const square_sum5[5],
+                                  uint16_t* ma565, uint32_t* b565,
+                                  uint8_t* const dst) {
+  uint8x16_t s[2], mas[2];
+  uint16x8_t sq[4], bs[4];
+  s[0] = vld1q_u8(src0);
+
+  BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0],
+                                &bs[0]);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint8x16_t masx[3];
+    uint32x4x2_t b[2];
+    s[1] = vld1q_u8(src0 + x + 16);
+
+    BoxFilterPreProcess5LastRow(s, x + 8, scale, sum5, square_sum5, sq + 1, mas,
+                                bs + 1);
+    Prepare3_8<0>(mas, masx);
+    ma[1] = Sum565<0>(masx);
+    b[1] = Sum565W(bs);
+    ma[0] = vld1q_u16(ma565);
+    b[0].val[0] = vld1q_u32(b565 + 0);
+    b[0].val[1] = vld1q_u32(b565 + 4);
+    const uint8x16_t sr = vld1q_u8(src + x);
+    const uint8x8_t sr0 = vget_low_u8(sr);
+    const int16x8_t p0 = CalculateFilteredOutputPass1(sr0, ma, b);
+    const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+    ma[1] = Sum565<8>(masx);
+    b[1] = Sum565W(bs + 1);
+    bs[0] = bs[2];
+    const uint8x8_t sr1 = vget_high_u8(sr);
+    ma[0] = vld1q_u16(ma565 + 8);
+    b[0].val[0] = vld1q_u32(b565 + 8);
+    b[0].val[1] = vld1q_u32(b565 + 12);
+    const int16x8_t p1 = CalculateFilteredOutputPass1(sr1, ma, b);
+    const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+    vst1q_u8(dst + x, vcombine_u8(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
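+// Pass 2 (3x3 window) combines three buffered rows per output row: the
+// ma343/b343 row from two rows back, the ma444/b444 row from one row back,
+// and the 343-weighted row just written by Store343_444()
+// (CalculateFilteredOutputPass2, shift 5).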
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const uint32_t scale, const int16_t w0, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], uint16_t* const ma343[3],
+    uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2],
+    uint8_t* const dst) {
+  const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width;
+  uint8x16_t s[2], mas[2];
+  uint16x8_t sq[4], bs[3];
+  s[0] = vld1q_u8(src0);
+
+  BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[3];
+    uint8x16_t ma3x[3];
+    uint32x4x2_t b[3];
+    s[1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+    BoxFilterPreProcess3(s, x + 8, scale, sum3, square_sum3, sq + 1, mas,
+                         bs + 1);
+    Prepare3_8<0>(mas, ma3x);
+    Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
+                    b444[1]);
+    const uint8x16_t sr = vld1q_u8(src + x);
+    const uint8x8_t sr0 = vget_low_u8(sr);
+    ma[0] = vld1q_u16(ma343[0] + x);
+    ma[1] = vld1q_u16(ma444[0] + x);
+    b[0].val[0] = vld1q_u32(b343[0] + x + 0);
+    b[0].val[1] = vld1q_u32(b343[0] + x + 4);
+    b[1].val[0] = vld1q_u32(b444[0] + x + 0);
+    b[1].val[1] = vld1q_u32(b444[0] + x + 4);
+    const int16x8_t p0 = CalculateFilteredOutputPass2(sr0, ma, b);
+    const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+    Store343_444<8>(ma3x, bs + 1, x + 8, &ma[2], &b[2], ma343[2], ma444[1],
+                    b343[2], b444[1]);
+    const uint8x8_t sr1 = vget_high_u8(sr);
+    ma[0] = vld1q_u16(ma343[0] + x + 8);
+    ma[1] = vld1q_u16(ma444[0] + x + 8);
+    b[0].val[0] = vld1q_u32(b343[0] + x + 8);
+    b[0].val[1] = vld1q_u32(b343[0] + x + 12);
+    b[1].val[0] = vld1q_u32(b444[0] + x + 8);
+    b[1].val[1] = vld1q_u32(b444[0] + x + 12);
+    const int16x8_t p1 = CalculateFilteredOutputPass2(sr1, ma, b);
+    const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+    vst1q_u8(dst + x, vcombine_u8(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343[4], uint16_t* const ma444[3],
+    uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+    uint32_t* const b565[2], uint8_t* const dst) {
+  const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
+  uint8x16_t s[2][2], ma3[2][2], ma5[2];
+  uint16x8_t sq[2][4], b3[2][3], b5[3];
+  s[0][0] = vld1q_u8(src0);
+  s[1][0] = vld1q_u8(src1);
+
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], &b5[0]);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[3][3];
+    uint8x16_t ma3x[2][3], ma5x[3];
+    uint32x4x2_t b[3][3];
+    int16x8_t p[2][2];
+    s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+    s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sq, ma3, b3, ma5, b5 + 1);
+    Prepare3_8<0>(ma3[0], ma3x[0]);
+    Prepare3_8<0>(ma3[1], ma3x[1]);
+    Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
+                    ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444<0>(ma3x[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
+                    b343[3], b444[2]);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0][1] = Sum565<0>(ma5x);
+    b[0][1] = Sum565W(b5);
+    vst1q_u16(ma565[1] + x, ma[0][1]);
+    vst1q_u32(b565[1] + x, b[0][1].val[0]);
+    vst1q_u32(b565[1] + x + 4, b[0][1].val[1]);
+    const uint8x16_t sr0 = vld1q_u8(src + x);
+    const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+    const uint8x8_t sr00 = vget_low_u8(sr0);
+    const uint8x8_t sr10 = vget_low_u8(sr1);
+    ma[0][0] = vld1q_u16(ma565[0] + x);
+    b[0][0].val[0] = vld1q_u32(b565[0] + x);
+    b[0][0].val[1] = vld1q_u32(b565[0] + x + 4);
+    p[0][0] = CalculateFilteredOutputPass1(sr00, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr10, ma[0][1], b[0][1]);
+    ma[1][0] = vld1q_u16(ma343[0] + x);
+    ma[1][1] = vld1q_u16(ma444[0] + x);
+    b[1][0].val[0] = vld1q_u32(b343[0] + x);
+    b[1][0].val[1] = vld1q_u32(b343[0] + x + 4);
+    b[1][1].val[0] = vld1q_u32(b444[0] + x);
+    b[1][1].val[1] = vld1q_u32(b444[0] + x + 4);
+    p[0][1] = CalculateFilteredOutputPass2(sr00, ma[1], b[1]);
+    ma[2][0] = vld1q_u16(ma343[1] + x);
+    b[2][0].val[0] = vld1q_u32(b343[1] + x);
+    b[2][0].val[1] = vld1q_u32(b343[1] + x + 4);
+    p[1][1] = CalculateFilteredOutputPass2(sr10, ma[2], b[2]);
+    const uint8x8_t d00 = SelfGuidedDoubleMultiplier(sr00, p[0], w0, w2);
+    const uint8x8_t d10 = SelfGuidedDoubleMultiplier(sr10, p[1], w0, w2);
+
+    Store343_444<8>(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], &b[1][2],
+                    &b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444<8>(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], &b[2][2], ma343[3],
+                    ma444[2], b343[3], b444[2]);
+    ma[0][1] = Sum565<8>(ma5x);
+    b[0][1] = Sum565W(b5 + 1);
+    vst1q_u16(ma565[1] + x + 8, ma[0][1]);
+    vst1q_u32(b565[1] + x + 8, b[0][1].val[0]);
+    vst1q_u32(b565[1] + x + 12, b[0][1].val[1]);
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    const uint8x8_t sr01 = vget_high_u8(sr0);
+    const uint8x8_t sr11 = vget_high_u8(sr1);
+    ma[0][0] = vld1q_u16(ma565[0] + x + 8);
+    b[0][0].val[0] = vld1q_u32(b565[0] + x + 8);
+    b[0][0].val[1] = vld1q_u32(b565[0] + x + 12);
+    p[0][0] = CalculateFilteredOutputPass1(sr01, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr11, ma[0][1], b[0][1]);
+    ma[1][0] = vld1q_u16(ma343[0] + x + 8);
+    ma[1][1] = vld1q_u16(ma444[0] + x + 8);
+    b[1][0].val[0] = vld1q_u32(b343[0] + x + 8);
+    b[1][0].val[1] = vld1q_u32(b343[0] + x + 12);
+    b[1][1].val[0] = vld1q_u32(b444[0] + x + 8);
+    b[1][1].val[1] = vld1q_u32(b444[0] + x + 12);
+    p[0][1] = CalculateFilteredOutputPass2(sr01, ma[1], b[1]);
+    ma[2][0] = vld1q_u16(ma343[1] + x + 8);
+    b[2][0].val[0] = vld1q_u32(b343[1] + x + 8);
+    b[2][0].val[1] = vld1q_u32(b343[1] + x + 12);
+    p[1][1] = CalculateFilteredOutputPass2(sr11, ma[2], b[2]);
+    const uint8x8_t d01 = SelfGuidedDoubleMultiplier(sr01, p[0], w0, w2);
+    const uint8x8_t d11 = SelfGuidedDoubleMultiplier(sr11, p[1], w0, w2);
+    vst1q_u8(dst + x, vcombine_u8(d00, d01));
+    vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint8_t* const dst) {
+  uint8x16_t s[2], ma3[2], ma5[2];
+  uint16x8_t sq[4], ma[3], b3[3], b5[3];
+  uint32x4x2_t b[3];
+  s[0] = vld1q_u8(src0);
+
+  BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+                               sq, &ma3[0], &ma5[0], &b3[0], &b5[0]);
+
+  int x = 0;
+  do {
+    uint8x16_t ma3x[3], ma5x[3];
+    int16x8_t p[2];
+    s[1] = vld1q_u8(src0 + x + 16);
+
+    BoxFilterPreProcessLastRow(s, x + 8, scales, sum3, sum5, square_sum3,
+                               square_sum5, sq + 1, ma3, ma5, &b3[1], &b5[1]);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[1] = Sum565<0>(ma5x);
+    b[1] = Sum565W(b5);
+    Prepare3_8<0>(ma3, ma3x);
+    ma[2] = Sum343<0>(ma3x);
+    b[2] = Sum343W(b3);
+    const uint8x16_t sr = vld1q_u8(src + x);
+    const uint8x8_t sr0 = vget_low_u8(sr);
+    ma[0] = vld1q_u16(ma565 + x);
+    b[0].val[0] = vld1q_u32(b565 + x + 0);
+    b[0].val[1] = vld1q_u32(b565 + x + 4);
+    p[0] = CalculateFilteredOutputPass1(sr0, ma, b);
+    ma[0] = vld1q_u16(ma343 + x);
+    ma[1] = vld1q_u16(ma444 + x);
+    b[0].val[0] = vld1q_u32(b343 + x + 0);
+    b[0].val[1] = vld1q_u32(b343 + x + 4);
+    b[1].val[0] = vld1q_u32(b444 + x + 0);
+    b[1].val[1] = vld1q_u32(b444 + x + 4);
+    p[1] = CalculateFilteredOutputPass2(sr0, ma, b);
+    const uint8x8_t d0 = SelfGuidedDoubleMultiplier(sr0, p, w0, w2);
+
+    ma[1] = Sum565<8>(ma5x);
+    b[1] = Sum565W(b5 + 1);
+    b5[0] = b5[2];
+    ma[2] = Sum343<8>(ma3x);
+    b[2] = Sum343W(b3 + 1);
+    b3[0] = b3[2];
+    const uint8x8_t sr1 = vget_high_u8(sr);
+    ma[0] = vld1q_u16(ma565 + x + 8);
+    b[0].val[0] = vld1q_u32(b565 + x + 8);
+    b[0].val[1] = vld1q_u32(b565 + x + 12);
+    p[0] = CalculateFilteredOutputPass1(sr1, ma, b);
+    ma[0] = vld1q_u16(ma343 + x + 8);
+    ma[1] = vld1q_u16(ma444 + x + 8);
+    b[0].val[0] = vld1q_u32(b343 + x + 8);
+    b[0].val[1] = vld1q_u32(b343 + x + 12);
+    b[1].val[0] = vld1q_u32(b444 + x + 8);
+    b[1].val[1] = vld1q_u32(b444 + x + 12);
+    p[1] = CalculateFilteredOutputPass2(sr1, ma, b);
+    const uint8x8_t d1 = SelfGuidedDoubleMultiplier(sr1, p, w0, w2);
+    vst1q_u8(dst + x, vcombine_u8(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    ma3[0] = ma3[1];
+    ma5[0] = ma5[1];
+    x += 16;
+  } while (x < width);
+}
+
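+// Runs both SGR passes in a single traversal when both radii are non-zero.
+// The 5x5 sums drive the w0-weighted pass 1 output and the 3x3 sums drive
+// the w2-weighted pass 2 output; the per-row pointer arrays (sum3/sum5,
+// ma*/b*) are rotated between iterations instead of copying row data.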
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint8_t* src,
+    const ptrdiff_t stride, const uint8_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
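+  // Alias the topmost window row to the border row's sums; this effectively
+  // repeats the border row to fill the 5-row window (edge extension).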
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, ma343, ma444[0], ma565[0], b343, b444[0],
+                         b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, ma343,
+              ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, ma343, ma444, ma565, b343, b444, b565,
+              dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     scales, w0, w2, sum3, sum5, square_sum3, square_sum5,
+                     ma343[0], ma444[0], ma565[0], b343[0], b444[0], b565[0],
+                     dst);
+  }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum5[1], square_sum5[1]);
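+  // sum5[0] aliasing sum5[1] repeats the border row's sums for the row above
+  // it, extending the 5-row window past the top border.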
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, ma565[0],
+                          b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src + 3, bottom_border + bottom_border_stride, width,
+                          scale, w0, sum5, square_sum5, ma565[0], b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0],
+                                 nullptr, b343[0], nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint8_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1],
+                                ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  src += 2;
+  int y = std::min(height, 2);
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, scale, w0, sum3, square_sum3,
+                   ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_NEON(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* top = static_cast<const uint8_t*>(top_border);
+  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+  auto* const dst = static_cast<uint8_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+
+#if LIBGAV1_MSAN
+  // Initialize to prevent msan warnings when intermediate overreads occur.
+  memset(sgr_buffer, 0, sizeof(SgrBuffer));
+#endif
+
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->loop_restorations[0] = WienerFilter_NEON;
+  dsp->loop_restorations[1] = SelfGuidedFilter_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/loop_restoration_neon.h b/src/dsp/arm/loop_restoration_neon.h
new file mode 100644 (file)
index 0000000..b9a4803
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations; see the defines below for specifics.
+// This function is not thread-safe.
+void LoopRestorationInit_NEON();
+void LoopRestorationInit10bpp_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc
new file mode 100644 (file)
index 0000000..ecc67f8
--- /dev/null
@@ -0,0 +1,789 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
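+// Loads two 4-wide rows of mask values into a single uint8x8_t, assuming
+// subsampling_x == 1. With subsampling_y == 1 each lane averages a 2x2 mask
+// block; the non-rounding vhadd_u8 keeps the intermediate sum within 8 bits
+// when all |mask| values are 64.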
+template <int subsampling_y>
+inline uint8x8_t GetMask4x2(const uint8_t* mask) {
+  if (subsampling_y == 1) {
+    const uint8x16x2_t mask_val = vld2q_u8(mask);
+    const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]);
+    const uint32x2_t row_01 = vreinterpret_u32_u8(vget_low_u8(combined_horz));
+    const uint32x2_t row_23 = vreinterpret_u32_u8(vget_high_u8(combined_horz));
+
+    const uint32x2x2_t row_02_13 = vtrn_u32(row_01, row_23);
+    // Use a halving add to work around the case where all |mask| values are 64.
+    return vrshr_n_u8(vhadd_u8(vreinterpret_u8_u32(row_02_13.val[0]),
+                               vreinterpret_u8_u32(row_02_13.val[1])),
+                      1);
+  }
+  // subsampling_x == 1
+  const uint8x8x2_t mask_val = vld2_u8(mask);
+  return vrhadd_u8(mask_val.val[0], mask_val.val[1]);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetMask8(const uint8_t* mask) {
+  if (subsampling_x == 1 && subsampling_y == 1) {
+    const uint8x16x2_t mask_val = vld2q_u8(mask);
+    const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]);
+    // Use a halving add to work around the case where all |mask| values are 64.
+    return vrshr_n_u8(
+        vhadd_u8(vget_low_u8(combined_horz), vget_high_u8(combined_horz)), 1);
+  }
+  if (subsampling_x == 1) {
+    const uint8x8x2_t mask_val = vld2_u8(mask);
+    return vrhadd_u8(mask_val.val[0], mask_val.val[1]);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  return vld1_u8(mask);
+}
+
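+// Blends two 4-pixel prediction rows packed into one 8-lane vector.
+// |pred_mask_0| holds the mask value m and |pred_mask_1| holds 64 - m, so
+// m == 64 selects pred_0 and m == 0 selects pred_1. The 32-bit products are
+// narrowed by the mask precision (>> 6) and then by inter_post_round_bits
+// (>> 4) with saturation.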
+inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
+                                  const int16_t* LIBGAV1_RESTRICT const pred_1,
+                                  const int16x8_t pred_mask_0,
+                                  const int16x8_t pred_mask_1,
+                                  uint8_t* LIBGAV1_RESTRICT dst,
+                                  const ptrdiff_t dst_stride) {
+  const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+  const int16x8_t pred_val_1 = vld1q_s16(pred_1);
+  // int res = (mask_value * prediction_0[x] +
+  //      (64 - mask_value) * prediction_1[x]) >> 6;
+  const int32x4_t weighted_pred_0_lo =
+      vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+  const int32x4_t weighted_pred_0_hi =
+      vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+  const int32x4_t weighted_combo_lo = vmlal_s16(
+      weighted_pred_0_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
+  const int32x4_t weighted_combo_hi =
+      vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
+                vget_high_s16(pred_val_1));
+  // dst[x] = static_cast<Pixel>(
+  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //         (1 << kBitdepth8) - 1));
+  const uint8x8_t result =
+      vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+                                  vshrn_n_s32(weighted_combo_hi, 6)),
+                     4);
+  StoreLo4(dst, result);
+  StoreHi4(dst + dst_stride, result);
+}
+
+template <int subsampling_y>
+inline void MaskBlending4x4_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                 const int16_t* LIBGAV1_RESTRICT pred_1,
+                                 const uint8_t* LIBGAV1_RESTRICT mask,
+                                 uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t dst_stride) {
+  constexpr int subsampling_x = 1;
+  constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
+  const int16x8_t mask_inverter = vdupq_n_s16(64);
+  // Compound predictors use int16_t values and need a long multiply because
+  // the Convolve range * 64 is 20 bits. Unfortunately there is no instruction
+  // that multiplies int16_t by int8_t and accumulates into int32_t.
+  int16x8_t pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
+  int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+  pred_0 += 4 << subsampling_x;
+  pred_1 += 4 << subsampling_x;
+  mask += mask_stride << (subsampling_x + subsampling_y);
+  dst += dst_stride << subsampling_x;
+
+  pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
+  pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+}
+
+template <int subsampling_y>
+inline void MaskBlending4xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                 const int16_t* LIBGAV1_RESTRICT pred_1,
+                                 const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                                 const int height,
+                                 uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    MaskBlending4x4_NEON<subsampling_y>(pred_0, pred_1, mask, dst, dst_stride);
+    return;
+  }
+  constexpr int subsampling_x = 1;
+  constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
+  const int16x8_t mask_inverter = vdupq_n_s16(64);
+  int y = 0;
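+  // Each iteration blends eight rows (four 4x2 blocks); 4xH blocks with
+  // height > 4 have heights that are multiples of 8.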
+  do {
+    int16x8_t pred_mask_0 =
+        vreinterpretq_s16_u16(vmovl_u8(GetMask4x2<subsampling_y>(mask)));
+    int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << subsampling_x;
+    pred_1 += 4 << subsampling_x;
+    mask += mask_stride << (subsampling_x + subsampling_y);
+    dst += dst_stride << subsampling_x;
+
+    pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
+    pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << subsampling_x;
+    pred_1 += 4 << subsampling_x;
+    mask += mask_stride << (subsampling_x + subsampling_y);
+    dst += dst_stride << subsampling_x;
+
+    pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
+    pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << subsampling_x;
+    pred_1 += 4 << subsampling_x;
+    mask += mask_stride << (subsampling_x + subsampling_y);
+    dst += dst_stride << subsampling_x;
+
+    pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
+    pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << subsampling_x;
+    pred_1 += 4 << subsampling_x;
+    mask += mask_stride << (subsampling_x + subsampling_y);
+    dst += dst_stride << subsampling_x;
+    y += 8;
+  } while (y < height);
+}
+
+inline uint8x8_t CombinePred8(const int16_t* LIBGAV1_RESTRICT pred_0,
+                              const int16_t* LIBGAV1_RESTRICT pred_1,
+                              const int16x8_t pred_mask_0,
+                              const int16x8_t pred_mask_1) {
+  // Blend one group of 8 values.
+  const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+  const int16x8_t pred_val_1 = vld1q_s16(pred_1);
+  // int res = (mask_value * prediction_0[x] +
+  //      (64 - mask_value) * prediction_1[x]) >> 6;
+  const int32x4_t weighted_pred_lo =
+      vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+  const int32x4_t weighted_pred_hi =
+      vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+  const int32x4_t weighted_combo_lo = vmlal_s16(
+      weighted_pred_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
+  const int32x4_t weighted_combo_hi = vmlal_s16(
+      weighted_pred_hi, vget_high_s16(pred_mask_1), vget_high_s16(pred_val_1));
+
+  // dst[x] = static_cast<Pixel>(
+  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //           (1 << kBitdepth8) - 1));
+  return vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+                                     vshrn_n_s32(weighted_combo_hi, 6)),
+                        4);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending8xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                 const int16_t* LIBGAV1_RESTRICT pred_1,
+                                 const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                                 const int height,
+                                 uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  const int16x8_t mask_inverter = vdupq_n_s16(64);
+  int y = height;
+  do {
+    const int16x8_t pred_mask_0 =
+        ZeroExtend(GetMask8<subsampling_x, subsampling_y>(mask));
+    // 64 - mask
+    const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+    const uint8x8_t result =
+        CombinePred8(pred_0, pred_1, pred_mask_0, pred_mask_1);
+    vst1_u8(dst, result);
+    dst += dst_stride;
+    mask += 8 << (subsampling_x + subsampling_y);
+    pred_0 += 8;
+    pred_1 += 8;
+  } while (--y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x16_t GetMask16(const uint8_t* mask, const ptrdiff_t mask_stride) {
+  if (subsampling_x == 1 && subsampling_y == 1) {
+    const uint8x16x2_t mask_val0 = vld2q_u8(mask);
+    const uint8x16x2_t mask_val1 = vld2q_u8(mask + mask_stride);
+    const uint8x16_t combined_horz0 =
+        vaddq_u8(mask_val0.val[0], mask_val0.val[1]);
+    const uint8x16_t combined_horz1 =
+        vaddq_u8(mask_val1.val[0], mask_val1.val[1]);
+    // Use a halving add to work around the case where all |mask| values are 64.
+    return vrshrq_n_u8(vhaddq_u8(combined_horz0, combined_horz1), 1);
+  }
+  if (subsampling_x == 1) {
+    const uint8x16x2_t mask_val = vld2q_u8(mask);
+    return vrhaddq_u8(mask_val.val[0], mask_val.val[1]);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  return vld1q_u8(mask);
+}
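+
+// For reference, a scalar sketch of the 4:2:0 path above:
+//   m = (mask[2 * x] + mask[2 * x + 1] +
+//        mask[mask_stride + 2 * x] + mask[mask_stride + 2 * x + 1] + 2) >> 2;
+// Each summed row pair can reach 128, so adding the two rows directly could
+// reach 256 and overflow uint8_t; the halving add keeps the intermediate in
+// range before the final rounding shift by 1.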
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           const ptrdiff_t /*prediction_stride_1*/,
+                           const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                           const ptrdiff_t mask_stride, const int width,
+                           const int height, void* LIBGAV1_RESTRICT dest,
+                           const ptrdiff_t dst_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  if (width == 4) {
+    MaskBlending4xH_NEON<subsampling_y>(pred_0, pred_1, mask_ptr, height, dst,
+                                        dst_stride);
+    return;
+  }
+  if (width == 8) {
+    MaskBlending8xH_NEON<subsampling_x, subsampling_y>(pred_0, pred_1, mask_ptr,
+                                                       height, dst, dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const int16x8_t mask_inverter = vdupq_n_s16(64);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const uint8x16_t pred_mask_0 = GetMask16<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride);
+      const int16x8_t pred_mask_0_lo = ZeroExtend(vget_low_u8(pred_mask_0));
+      const int16x8_t pred_mask_0_hi = ZeroExtend(vget_high_u8(pred_mask_0));
+      // 64 - mask
+      const int16x8_t pred_mask_1_lo = vsubq_s16(mask_inverter, pred_mask_0_lo);
+      const int16x8_t pred_mask_1_hi = vsubq_s16(mask_inverter, pred_mask_0_hi);
+
+      uint8x8_t result;
+      result =
+          CombinePred8(pred_0 + x, pred_1 + x, pred_mask_0_lo, pred_mask_1_lo);
+      vst1_u8(dst + x, result);
+
+      result = CombinePred8(pred_0 + x + 8, pred_1 + x + 8, pred_mask_0_hi,
+                            pred_mask_1_hi);
+      vst1_u8(dst + x + 8, result);
+
+      x += 16;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+    mask += mask_stride << subsampling_y;
+  } while (++y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask,
+                                      ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    return GetMask4x2<subsampling_y>(mask);
+  }
+  // When using intra or difference weighted masks, the function doesn't use
+  // subsampling, so |mask_stride| may be 4 or 8.
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const uint8x8_t mask_val0 = Load4(mask);
+  return Load4<1>(mask + mask_stride, mask_val0);
+}
+
+inline void InterIntraWriteMaskBlendLine8bpp4x2(
+    const uint8_t* LIBGAV1_RESTRICT const pred_0,
+    uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1,
+    const uint8x8_t pred_mask_0, const uint8x8_t pred_mask_1) {
+  const uint8x8_t pred_val_0 = vld1_u8(pred_0);
+  uint8x8_t pred_val_1 = Load4(pred_1);
+  pred_val_1 = Load4<1>(pred_1 + pred_stride_1, pred_val_1);
+
+  const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+  const uint16x8_t weighted_combo =
+      vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+  const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+  StoreLo4(pred_1, result);
+  StoreHi4(pred_1 + pred_stride_1, result);
+}
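+
+// Scalar sketch of the blend above, with mask_value = |pred_mask_1| (the
+// naming here is illustrative): two rows of 4 are blended and written back
+// into the intra prediction buffer:
+//   pred_1[x] = RightShiftWithRounding(
+//       mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6);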
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4x4_NEON(
+    const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+    const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+    const ptrdiff_t mask_stride) {
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  uint8x8_t pred_mask_1 =
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1);
+  pred_0 += 4 << 1;
+  pred_1 += pred_stride_1 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+
+  pred_mask_1 =
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4xH_NEON(
+    const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+    const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+    const ptrdiff_t mask_stride, const int height) {
+  if (height == 4) {
+    InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    return;
+  }
+  int y = 0;
+  do {
+    InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    pred_0 += 4 << 2;
+    pred_1 += pred_stride_1 << 2;
+    mask += mask_stride << (2 + subsampling_y);
+
+    InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    pred_0 += 4 << 2;
+    pred_1 += pred_stride_1 << 2;
+    mask += mask_stride << (2 + subsampling_y);
+    y += 8;
+  } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp8xH_NEON(
+    const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+    const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+    const ptrdiff_t mask_stride, const int height) {
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  int y = height;
+  do {
+    const uint8x8_t pred_mask_1 = GetMask8<subsampling_x, subsampling_y>(mask);
+    // 64 - mask
+    const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+    const uint8x8_t pred_val_0 = vld1_u8(pred_0);
+    const uint8x8_t pred_val_1 = vld1_u8(pred_1);
+    const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+    // weighted_pred0 + weighted_pred1
+    const uint16x8_t weighted_combo =
+        vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+    const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+    vst1_u8(pred_1, result);
+
+    pred_0 += 8;
+    pred_1 += pred_stride_1;
+    mask += mask_stride << subsampling_y;
+  } while (--y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend8bpp_NEON(
+    const uint8_t* LIBGAV1_RESTRICT prediction_0,
+    uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
+    const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+    const int width, const int height) {
+  if (width == 4) {
+    InterIntraMaskBlending8bpp4xH_NEON<subsampling_x, subsampling_y>(
+        prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+        height);
+    return;
+  }
+  if (width == 8) {
+    InterIntraMaskBlending8bpp8xH_NEON<subsampling_x, subsampling_y>(
+        prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+        height);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const uint8x16_t mask_inverter = vdupq_n_u8(64);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const uint8x16_t pred_mask_1 = GetMask16<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride);
+      // 64 - mask
+      const uint8x16_t pred_mask_0 = vsubq_u8(mask_inverter, pred_mask_1);
+      const uint8x8_t pred_val_0_lo = vld1_u8(prediction_0);
+      prediction_0 += 8;
+      const uint8x8_t pred_val_0_hi = vld1_u8(prediction_0);
+      prediction_0 += 8;
+      // Load 16 bytes at once so the armv7 build combines the two loads.
+      const uint8x16_t pred_val_1 = vld1q_u8(prediction_1 + x);
+      const uint8x8_t pred_val_1_lo = vget_low_u8(pred_val_1);
+      const uint8x8_t pred_val_1_hi = vget_high_u8(pred_val_1);
+      const uint16x8_t weighted_pred_0_lo =
+          vmull_u8(vget_low_u8(pred_mask_0), pred_val_0_lo);
+      // weighted_pred0 + weighted_pred1
+      const uint16x8_t weighted_combo_lo =
+          vmlal_u8(weighted_pred_0_lo, vget_low_u8(pred_mask_1), pred_val_1_lo);
+      const uint8x8_t result_lo = vrshrn_n_u16(weighted_combo_lo, 6);
+      vst1_u8(prediction_1 + x, result_lo);
+      const uint16x8_t weighted_pred_0_hi =
+          vmull_u8(vget_high_u8(pred_mask_0), pred_val_0_hi);
+      // weighted_pred0 + weighted_pred1
+      const uint16x8_t weighted_combo_hi = vmlal_u8(
+          weighted_pred_0_hi, vget_high_u8(pred_mask_1), pred_val_1_hi);
+      const uint8x8_t result_hi = vrshrn_n_u16(weighted_combo_hi, 6);
+      vst1_u8(prediction_1 + x + 8, result_hi);
+
+      x += 16;
+    } while (x < width);
+    prediction_1 += prediction_stride_1;
+    mask += mask_stride << subsampling_y;
+  } while (++y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->mask_blend[0][0] = MaskBlend_NEON<0, 0>;
+  dsp->mask_blend[1][0] = MaskBlend_NEON<1, 0>;
+  dsp->mask_blend[2][0] = MaskBlend_NEON<1, 1>;
+  // In 8-bit, the is_inter_intra index of mask_blend[][] is replaced by
+  // inter_intra_mask_blend_8bpp[].
+  dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_NEON<0, 0>;
+  dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_NEON<1, 0>;
+  dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_NEON<1, 1>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+template <int subsampling_x, int subsampling_y>
+inline uint16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    const uint8x8_t mask_val0 = vld1_u8(mask);
+    const uint8x8_t mask_val1 = vld1_u8(mask + (mask_stride << subsampling_y));
+    uint16x8_t final_val = vpaddlq_u8(vcombine_u8(mask_val0, mask_val1));
+    if (subsampling_y == 1) {
+      const uint8x8_t next_mask_val0 = vld1_u8(mask + mask_stride);
+      const uint8x8_t next_mask_val1 = vld1_u8(mask + mask_stride * 3);
+      final_val = vaddq_u16(
+          final_val, vpaddlq_u8(vcombine_u8(next_mask_val0, next_mask_val1)));
+    }
+    return vrshrq_n_u16(final_val, subsampling_y + 1);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const uint8x8_t mask_val0 = Load4(mask);
+  const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0);
+  return vmovl_u8(mask_val);
+}
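+
+// For reference: in the subsampled path above, vpaddlq_u8 widens and sums
+// adjacent horizontal pairs, the second row is accumulated when
+// subsampling_y == 1, and the rounding shift by subsampling_y + 1 turns the
+// 2- or 4-tap sum into an average. Working in 16 bits gives the headroom that
+// uint8_t lacks for these sums.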
+
+template <int subsampling_x, int subsampling_y>
+inline uint16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    uint16x8_t mask_val = vpaddlq_u8(vld1q_u8(mask));
+    if (subsampling_y == 1) {
+      const uint16x8_t next_mask_val = vpaddlq_u8(vld1q_u8(mask + mask_stride));
+      mask_val = vaddq_u16(mask_val, next_mask_val);
+    }
+    return vrshrq_n_u16(mask_val, 1 + subsampling_y);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const uint8x8_t mask_val = vld1_u8(mask);
+  return vmovl_u8(mask_val);
+}
+
+template <bool is_inter_intra>
+uint16x8_t SumWeightedPred(const uint16x8_t pred_mask_0,
+                           const uint16x8_t pred_mask_1,
+                           const uint16x8_t pred_val_0,
+                           const uint16x8_t pred_val_1) {
+  if (is_inter_intra) {
+    // dst[x] = static_cast<Pixel>(RightShiftWithRounding(
+    //     mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6));
+    uint16x8_t sum = vmulq_u16(pred_mask_1, pred_val_0);
+    sum = vmlaq_u16(sum, pred_mask_0, pred_val_1);
+    return vrshrq_n_u16(sum, 6);
+  } else {
+    // int res = (mask_value * prediction_0[x] +
+    //      (64 - mask_value) * prediction_1[x]) >> 6;
+    const uint32x4_t weighted_pred_0_lo =
+        vmull_u16(vget_low_u16(pred_mask_0), vget_low_u16(pred_val_0));
+    const uint32x4_t weighted_pred_0_hi = VMullHighU16(pred_mask_0, pred_val_0);
+    uint32x4x2_t sum;
+    sum.val[0] = vmlal_u16(weighted_pred_0_lo, vget_low_u16(pred_mask_1),
+                           vget_low_u16(pred_val_1));
+    sum.val[1] = VMlalHighU16(weighted_pred_0_hi, pred_mask_1, pred_val_1);
+    return vcombine_u16(vshrn_n_u32(sum.val[0], 6), vshrn_n_u32(sum.val[1], 6));
+  }
+}
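+
+// For reference: in the compound branch the prediction values carry
+// kCompoundOffset and do not fit a 16-bit product with a 6-bit mask, so the
+// products are widened to 32 bits before the >> 6 narrows them back. The
+// inter-intra branch blends plain pixel values, where 64 * 1023 still fits
+// in uint16_t.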
+
+template <bool is_inter_intra, int width, int bitdepth = 10>
+inline void StoreShiftedResult(uint8_t* dst, const uint16x8_t result,
+                               const ptrdiff_t dst_stride = 0) {
+  if (is_inter_intra) {
+    if (width == 4) {
+      // Store 2 lines of width 4.
+      assert(dst_stride != 0);
+      vst1_u16(reinterpret_cast<uint16_t*>(dst), vget_low_u16(result));
+      vst1_u16(reinterpret_cast<uint16_t*>(dst + dst_stride),
+               vget_high_u16(result));
+    } else {
+      // Store 1 line of width 8.
+      vst1q_u16(reinterpret_cast<uint16_t*>(dst), result);
+    }
+  } else {
+    // res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+    // dst[x] = static_cast<Pixel>(
+    //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+    //           (1 << kBitdepth8) - 1));
+    constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+    const uint16x8_t compound_result =
+        vminq_u16(vrshrq_n_u16(vqsubq_u16(result, vdupq_n_u16(kCompoundOffset)),
+                               inter_post_round_bits),
+                  vdupq_n_u16((1 << bitdepth) - 1));
+    if (width == 4) {
+      // Store 2 lines of width 4.
+      assert(dst_stride != 0);
+      vst1_u16(reinterpret_cast<uint16_t*>(dst), vget_low_u16(compound_result));
+      vst1_u16(reinterpret_cast<uint16_t*>(dst + dst_stride),
+               vget_high_u16(compound_result));
+    } else {
+      // Store 1 line of width 8.
+      vst1q_u16(reinterpret_cast<uint16_t*>(dst), compound_result);
+    }
+  }
+}
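+
+// Scalar sketch of the compound branch above (illustrative only):
+//   res = std::max(res - kCompoundOffset, 0);  // the saturating subtract
+//   dst[x] = std::min(RightShiftWithRounding(res, inter_post_round_bits),
+//                     (1 << bitdepth) - 1);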
+
+template <int subsampling_x, int subsampling_y, bool is_inter_intra>
+inline void MaskBlend4x2_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0,
+                              const uint16_t* LIBGAV1_RESTRICT pred_1,
+                              const ptrdiff_t pred_stride_1,
+                              const uint8_t* LIBGAV1_RESTRICT mask,
+                              const uint16x8_t mask_inverter,
+                              const ptrdiff_t mask_stride,
+                              uint8_t* LIBGAV1_RESTRICT dst,
+                              const ptrdiff_t dst_stride) {
+  // This works because stride == width == 4.
+  const uint16x8_t pred_val_0 = vld1q_u16(pred_0);
+  const uint16x8_t pred_val_1 =
+      is_inter_intra
+          ? vcombine_u16(vld1_u16(pred_1), vld1_u16(pred_1 + pred_stride_1))
+          : vld1q_u16(pred_1);
+  const uint16x8_t pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  const uint16x8_t pred_mask_1 = vsubq_u16(mask_inverter, pred_mask_0);
+  const uint16x8_t weighted_pred_sum = SumWeightedPred<is_inter_intra>(
+      pred_mask_0, pred_mask_1, pred_val_0, pred_val_1);
+
+  StoreShiftedResult<is_inter_intra, 4>(dst, weighted_pred_sum, dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y, bool is_inter_intra>
+inline void MaskBlending4x4_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0,
+                                 const uint16_t* LIBGAV1_RESTRICT pred_1,
+                                 const ptrdiff_t pred_stride_1,
+                                 const uint8_t* LIBGAV1_RESTRICT mask,
+                                 const ptrdiff_t mask_stride,
+                                 uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t dst_stride) {
+  // Double stride because the function works on 2 lines at a time.
+  const ptrdiff_t mask_stride_y = mask_stride << (subsampling_y + 1);
+  const ptrdiff_t dst_stride_y = dst_stride << 1;
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+
+  MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+      pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+      dst_stride);
+
+  pred_0 += 4 << 1;
+  pred_1 += pred_stride_1 << 1;
+  mask += mask_stride_y;
+  dst += dst_stride_y;
+
+  MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+      pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+      dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y, bool is_inter_intra>
+inline void MaskBlending4xH_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0,
+                                 const uint16_t* LIBGAV1_RESTRICT pred_1,
+                                 const ptrdiff_t pred_stride_1,
+                                 const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                                 const ptrdiff_t mask_stride, const int height,
+                                 uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    MaskBlending4x4_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+    return;
+  }
+  // Double stride because the function works on 2 lines at a time.
+  const ptrdiff_t mask_stride_y = mask_stride << (subsampling_y + 1);
+  const ptrdiff_t dst_stride_y = dst_stride << 1;
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  int y = 0;
+  do {
+    MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+        pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+        dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += pred_stride_1 << 1;
+    mask += mask_stride_y;
+    dst += dst_stride_y;
+
+    MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+        pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+        dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += pred_stride_1 << 1;
+    mask += mask_stride_y;
+    dst += dst_stride_y;
+
+    MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+        pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+        dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += pred_stride_1 << 1;
+    mask += mask_stride_y;
+    dst += dst_stride_y;
+
+    MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+        pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+        dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += pred_stride_1 << 1;
+    mask += mask_stride_y;
+    dst += dst_stride_y;
+    y += 8;
+  } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y, bool is_inter_intra>
+void MaskBlend8_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0,
+                     const uint16_t* LIBGAV1_RESTRICT pred_1,
+                     const uint8_t* LIBGAV1_RESTRICT mask,
+                     const uint16x8_t mask_inverter,
+                     const ptrdiff_t mask_stride,
+                     uint8_t* LIBGAV1_RESTRICT dst) {
+  const uint16x8_t pred_val_0 = vld1q_u16(pred_0);
+  const uint16x8_t pred_val_1 = vld1q_u16(pred_1);
+  const uint16x8_t pred_mask_0 =
+      GetMask8<subsampling_x, subsampling_y>(mask, mask_stride);
+  const uint16x8_t pred_mask_1 = vsubq_u16(mask_inverter, pred_mask_0);
+  const uint16x8_t weighted_pred_sum = SumWeightedPred<is_inter_intra>(
+      pred_mask_0, pred_mask_1, pred_val_0, pred_val_1);
+
+  StoreShiftedResult<is_inter_intra, 8>(dst, weighted_pred_sum);
+}
+
+template <int subsampling_x, int subsampling_y, bool is_inter_intra>
+inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           const ptrdiff_t prediction_stride_1,
+                           const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                           const ptrdiff_t mask_stride, const int width,
+                           const int height, void* LIBGAV1_RESTRICT dest,
+                           const ptrdiff_t dst_stride) {
+  if (!is_inter_intra) {
+    assert(prediction_stride_1 == width);
+  }
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  if (width == 4) {
+    MaskBlending4xH_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+        pred_0, pred_1, prediction_stride_1, mask_ptr, mask_stride, height, dst,
+        dst_stride);
+    return;
+  }
+  const ptrdiff_t mask_stride_y = mask_stride << subsampling_y;
+  const uint8_t* mask = mask_ptr;
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      // For 10bpp output, |dst| advances in bytes while pixels are 2 bytes
+      // wide, so step |x| pixels as uint16_t and convert back for the helper.
+      MaskBlend8_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+          pred_0 + x, pred_1 + x, mask + (x << subsampling_x), mask_inverter,
+          mask_stride,
+          reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(dst) + x));
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += prediction_stride_1;
+    mask += mask_stride_y;
+  } while (++y < height);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->mask_blend[0][0] = MaskBlend_NEON<0, 0, false>;
+  dsp->mask_blend[1][0] = MaskBlend_NEON<1, 0, false>;
+  dsp->mask_blend[2][0] = MaskBlend_NEON<1, 1, false>;
+
+  dsp->mask_blend[0][1] = MaskBlend_NEON<0, 0, true>;
+  dsp->mask_blend[1][1] = MaskBlend_NEON<1, 0, true>;
+  dsp->mask_blend[2][1] = MaskBlend_NEON<1, 1, true>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void MaskBlendInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void MaskBlendInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/mask_blend_neon.h b/src/dsp/arm/mask_blend_neon.h
new file mode 100644 (file)
index 0000000..c24f2f8
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend. This function is not thread-safe.
+void MaskBlendInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlend422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlend420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
diff --git a/src/dsp/arm/motion_field_projection_neon.cc b/src/dsp/arm/motion_field_projection_neon.cc
new file mode 100644 (file)
index 0000000..144adf7
--- /dev/null
@@ -0,0 +1,378 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x8_t LoadDivision(const int8x8x2_t division_table,
+                              const int8x8_t reference_offset) {
+  const int8x8_t kOne = vcreate_s8(0x0100010001000100);
+  const int8x16_t kOneQ = vcombine_s8(kOne, kOne);
+  const int8x8_t t = vadd_s8(reference_offset, reference_offset);
+  const int8x8x2_t tt = vzip_s8(t, t);
+  const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]);
+  const int8x16_t idx = vaddq_s8(t1, kOneQ);
+  const int8x8_t idx_low = vget_low_s8(idx);
+  const int8x8_t idx_high = vget_high_s8(idx);
+  const int16x4_t d0 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_low));
+  const int16x4_t d1 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_high));
+  return vcombine_s16(d0, d1);
+}
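+
+// For reference: |division_table| holds eight int16_t division factors viewed
+// as 16 bytes. The doubling add, the zip, and the +kOne byte pattern build the
+// index pairs {2 * r, 2 * r + 1} for each |reference_offset| lane r, so the
+// two vtbl2_s8 lookups reassemble the 16-bit factors, i.e.
+//   denominator[i] = division_table_as_int16[reference_offset[i]];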
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+                              const int numerator) {
+  const int32x4_t m0 = vmull_s16(mv, denominator);
+  const int32x4_t m = vmulq_n_s32(m0, numerator);
+  // Add the sign (0 or -1) to round towards zero.
+  const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+  return vqrshrn_n_s32(add_sign, 14);
+}
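+
+// Scalar model of MvProjection() (a sketch; saturate_to_int16 is shorthand):
+//   m = mv * denominator * numerator;
+//   m += (m >> 31);  // bias negative values so the result rounds towards zero
+//   projection = saturate_to_int16(RightShiftWithRounding(m, 14));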
+
+inline int16x8_t MvProjectionClip(const int16x8_t mv,
+                                  const int16x8_t denominator,
+                                  const int numerator) {
+  const int16x4_t mv0 = vget_low_s16(mv);
+  const int16x4_t mv1 = vget_high_s16(mv);
+  const int16x4_t s0 = MvProjection(mv0, vget_low_s16(denominator), numerator);
+  const int16x4_t s1 = MvProjection(mv1, vget_high_s16(denominator), numerator);
+  const int16x8_t projection = vcombine_s16(s0, s1);
+  const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+  const int16x8_t clamp = vminq_s16(projection, projection_mv_clamp);
+  return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) {
+  // Add 63 to negative delta so that it shifts towards zero.
+  const int16x8_t delta_sign = vshrq_n_s16(delta, 15);
+  const uint16x8_t delta_u = vreinterpretq_u16_s16(delta);
+  const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign);
+  const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10);
+  const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u);
+  const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6);
+  const int16x8_t offset1 = veorq_s16(offset0, dst_sign);
+  const int16x8_t offset2 = vsubq_s16(offset1, dst_sign);
+  return vqmovn_s16(offset2);
+}
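+
+// Scalar model of Project_NEON() (a sketch; saturate_to_int8 is shorthand):
+//   adjusted = delta + ((delta >> 15) & 63);  // add 63 to negative values
+//   offset = adjusted >> 6;
+//   // XOR and subtract with the 0/-1 |dst_sign| conditionally negate.
+//   return saturate_to_int8((offset ^ dst_sign) - dst_sign);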
+
+inline void GetPosition(
+    const int8x8x2_t division_table, const MotionVector* const mv,
+    const int numerator, const int x8_start, const int x8_end, const int x8,
+    const int8x8_t r_offsets, const int8x8_t source_reference_type8,
+    const int8x8_t skip_r, const int8x8_t y8_floor8, const int8x8_t y8_ceiling8,
+    const int16x8_t d_sign, const int delta, int8x8_t* const r,
+    int8x8_t* const position_y8, int8x8_t* const position_x8,
+    int64_t* const skip_64, int32x4_t mvs[2]) {
+  const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+  *r = vtbl1_s8(r_offsets, source_reference_type8);
+  const int16x8_t denorm = LoadDivision(division_table, source_reference_type8);
+  int16x8_t projection_mv[2];
+  mvs[0] = vld1q_s32(mv_int + 0);
+  mvs[1] = vld1q_s32(mv_int + 4);
+  // Deinterleave the x and y components.
+  const int16x8_t mv0 = vreinterpretq_s16_s32(mvs[0]);
+  const int16x8_t mv1 = vreinterpretq_s16_s32(mvs[1]);
+  const int16x8x2_t mv_yx = vuzpq_s16(mv0, mv1);
+  // numerator could be 0.
+  projection_mv[0] = MvProjectionClip(mv_yx.val[0], denorm, numerator);
+  projection_mv[1] = MvProjectionClip(mv_yx.val[1], denorm, numerator);
+  // Do not update the motion vector if the block position is not valid or
+  // if position_x8 is outside the current range of x8_start and x8_end.
+  // Note that position_y8 will always be within the range of y8_start and
+  // y8_end.
+  // After subtracting the base, valid projections are within 8-bit.
+  *position_y8 = Project_NEON(projection_mv[0], d_sign);
+  const int8x8_t position_x = Project_NEON(projection_mv[1], d_sign);
+  const int8x8_t k01234567 = vcreate_s8(uint64_t{0x0706050403020100});
+  *position_x8 = vqadd_s8(position_x, k01234567);
+  const int8x16_t position_xy = vcombine_s8(*position_x8, *position_y8);
+  const int x8_floor = std::max(
+      x8_start - x8, delta - kProjectionMvMaxHorizontalOffset);  // [-8, 8]
+  const int x8_ceiling = std::min(
+      x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset);  // [0, 16]
+  const int8x8_t x8_floor8 = vdup_n_s8(x8_floor);
+  const int8x8_t x8_ceiling8 = vdup_n_s8(x8_ceiling);
+  const int8x16_t floor_xy = vcombine_s8(x8_floor8, y8_floor8);
+  const int8x16_t ceiling_xy = vcombine_s8(x8_ceiling8, y8_ceiling8);
+  const uint8x16_t underflow = vcltq_s8(position_xy, floor_xy);
+  const uint8x16_t overflow = vcgeq_s8(position_xy, ceiling_xy);
+  const int8x16_t out = vreinterpretq_s8_u8(vorrq_u8(underflow, overflow));
+  const int8x8_t skip_low = vorr_s8(skip_r, vget_low_s8(out));
+  const int8x8_t skip = vorr_s8(skip_low, vget_high_s8(out));
+  *skip_64 = vget_lane_s64(vreinterpret_s64_s8(skip), 0);
+}
+
+// Scatters one projected motion vector and its reference offset to the
+// destination position held in lane |idx|. |idx & 3| selects the lane of |mv|
+// because callers pass mvs[0] for lanes 0-3 and mvs[1] for lanes 4-7.
+template <int idx>
+inline void Store(const int16x8_t position, const int8x8_t reference_offset,
+                  const int32x4_t mv, int8_t* dst_reference_offset,
+                  MotionVector* dst_mv) {
+  const ptrdiff_t offset = vgetq_lane_s16(position, idx);
+  auto* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
+  vst1q_lane_s32(d_mv, mv, idx & 3);
+  vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const int16x8_t position,
+                       const int8x8_t reference_offset, const int32x4_t mv,
+                       int8_t* dst_reference_offset, MotionVector* dst_mv) {
+  if (skips[idx] == 0) {
+    Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+  }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_NEON(const ReferenceInfo& reference_info,
+                                      const int reference_to_current_with_sign,
+                                      const int dst_sign, const int y8_start,
+                                      const int y8_end, const int x8_start,
+                                      const int x8_end,
+                                      TemporalMotionField* const motion_field) {
+  const ptrdiff_t stride = motion_field->mv.columns();
+  // The column range has to be offset by kProjectionMvMaxHorizontalOffset since
+  // coordinates in that range could end up being position_x8 because of
+  // projection.
+  const int adjusted_x8_start =
+      std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+  const int adjusted_x8_end = std::min(
+      x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+  const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+  const int leftover = adjusted_x8_end - adjusted_x8_end8;
+  const int8_t* const reference_offsets =
+      reference_info.relative_distance_to.data();
+  const bool* const skip_references = reference_info.skip_references.data();
+  const int16_t* const projection_divisions =
+      reference_info.projection_divisions.data();
+  const ReferenceFrameType* source_reference_types =
+      &reference_info.motion_field_reference_frame[y8_start][0];
+  const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+  int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+  MotionVector* dst_mv = motion_field->mv[y8_start];
+  const int16x8_t d_sign = vdupq_n_s16(dst_sign);
+
+  static_assert(sizeof(int8_t) == sizeof(bool), "");
+  static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+  static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+  assert(dst_sign == 0 || dst_sign == -1);
+  assert(stride == motion_field->reference_offset.columns());
+  assert((y8_start & 7) == 0);
+  assert((adjusted_x8_start & 7) == 0);
+  // The final position calculation is represented with int16_t. The valid
+  // |position_y8| offset from its base is at most 7. After adding the
+  // horizontal offset, which is at most |stride - 1|, we get the following
+  // assertion, which means this optimization works for frame widths up to 32K
+  // (each position is an 8x8 block).
+  assert(8 * stride <= 32768);
+  const int8x8_t skip_reference =
+      vld1_s8(reinterpret_cast<const int8_t*>(skip_references));
+  const int8x8_t r_offsets = vld1_s8(reference_offsets);
+  const int8x16_t table = vreinterpretq_s8_s16(vld1q_s16(projection_divisions));
+  int8x8x2_t division_table;
+  division_table.val[0] = vget_low_s8(table);
+  division_table.val[1] = vget_high_s8(table);
+
+  int y8 = y8_start;
+  do {
+    const int y8_floor = (y8 & ~7) - y8;                         // [-7, 0]
+    const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8);  // [1, 8]
+    const int8x8_t y8_floor8 = vdup_n_s8(y8_floor);
+    const int8x8_t y8_ceiling8 = vdup_n_s8(y8_ceiling);
+    int x8;
+
+    for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+      const int8x8_t source_reference_type8 =
+          vld1_s8(reinterpret_cast<const int8_t*>(source_reference_types + x8));
+      const int8x8_t skip_r = vtbl1_s8(skip_reference, source_reference_type8);
+      const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
+      // Early termination #1 if all are skips. Chance is typically ~30-40%.
+      if (early_skip == -1) continue;
+      int64_t skip_64;
+      int8x8_t r, position_x8, position_y8;
+      int32x4_t mvs[2];
+      GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+                  x8_end, x8, r_offsets, source_reference_type8, skip_r,
+                  y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_y8,
+                  &position_x8, &skip_64, mvs);
+      // Early termination #2 if all are skips.
+      // Chance is typically ~15-25% after Early termination #1.
+      if (skip_64 == -1) continue;
+      const int16x8_t p_y = vmovl_s8(position_y8);
+      const int16x8_t p_x = vmovl_s8(position_x8);
+      const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+      const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
+      if (skip_64 == 0) {
+        // Store all. Chance is typically ~70-85% after Early termination #2.
+        Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+      } else {
+        // Check and store each.
+        // Chance is typically ~15-30% after Early termination #2.
+        // The compiler is smart enough to not create the local buffer skips[].
+        int8_t skips[8];
+        memcpy(skips, &skip_64, sizeof(skips));
+        CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+      }
+    }
+
+    // The following leftover processing cannot be moved out of the do...while
+    // loop. Doing so may change the order in which results are stored to the
+    // same position.
+    if (leftover > 0) {
+      // Use SIMD only when leftover is at least 4, and there are at least 8
+      // elements in a row.
+      if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+        // Process the last 8 elements to avoid loading invalid memory. Some
+        // elements may have been processed in the above loop, which is OK.
+        const int delta = 8 - leftover;
+        x8 = adjusted_x8_end - 8;
+        const int8x8_t source_reference_type8 = vld1_s8(
+            reinterpret_cast<const int8_t*>(source_reference_types + x8));
+        const int8x8_t skip_r =
+            vtbl1_s8(skip_reference, source_reference_type8);
+        const int64_t early_skip =
+            vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
+        // Early termination #1 if all are skips.
+        if (early_skip != -1) {
+          int64_t skip_64;
+          int8x8_t r, position_x8, position_y8;
+          int32x4_t mvs[2];
+          GetPosition(division_table, mv, reference_to_current_with_sign,
+                      x8_start, x8_end, x8, r_offsets, source_reference_type8,
+                      skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+                      &position_y8, &position_x8, &skip_64, mvs);
+          // Early termination #2 if all are skips.
+          if (skip_64 != -1) {
+            const int16x8_t p_y = vmovl_s8(position_y8);
+            const int16x8_t p_x = vmovl_s8(position_x8);
+            const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+            const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
+            // Store up to 7 elements since leftover is at most 7.
+            if (skip_64 == 0) {
+              // Store all.
+              Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+            } else {
+              // Check and store each.
+              // The compiler is smart enough to not create the local buffer
+              // skips[].
+              int8_t skips[8];
+              memcpy(skips, &skip_64, sizeof(skips));
+              CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
+                            dst_mv);
+              CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
+                            dst_mv);
+              CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
+                            dst_mv);
+              CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
+                            dst_mv);
+              CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
+                            dst_mv);
+              CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
+                            dst_mv);
+              CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
+                            dst_mv);
+            }
+          }
+        }
+      } else {
+        for (; x8 < adjusted_x8_end; ++x8) {
+          const int source_reference_type = source_reference_types[x8];
+          if (skip_references[source_reference_type]) continue;
+          MotionVector projection_mv;
+          // reference_to_current_with_sign could be 0.
+          GetMvProjection(mv[x8], reference_to_current_with_sign,
+                          projection_divisions[source_reference_type],
+                          &projection_mv);
+          // Do not update the motion vector if the block position is not valid
+          // or if position_x8 is outside the current range of x8_start and
+          // x8_end. Note that position_y8 will always be within the range of
+          // y8_start and y8_end.
+          const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+          if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue;
+          const int x8_base = x8 & ~7;
+          const int x8_floor =
+              std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+          const int x8_ceiling =
+              std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+          const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+          if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+          dst_mv[position_y8 * stride + position_x8] = mv[x8];
+          dst_reference_offset[position_y8 * stride + position_x8] =
+              reference_offsets[source_reference_type];
+        }
+      }
+    }
+
+    source_reference_types += stride;
+    mv += stride;
+    dst_reference_offset += stride;
+    dst_mv += stride;
+  } while (++y8 < y8_end);
+}
+
+}  // namespace
+
+void MotionFieldProjectionInit_NEON() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void MotionFieldProjectionInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/motion_field_projection_neon.h b/src/dsp/arm/motion_field_projection_neon.h
new file mode 100644 (file)
index 0000000..41ab6a6
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc
new file mode 100644 (file)
index 0000000..4720879
--- /dev/null
@@ -0,0 +1,256 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+                              const int32x4_t numerator) {
+  const int32x4_t m0 = vmull_s16(mv, denominator);
+  const int32x4_t m = vmulq_s32(m0, numerator);
+  // Add the sign (0 or -1) to round towards zero.
+  const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+  return vqrshrn_n_s32(add_sign, 14);
+}
+
+inline int16x4_t MvProjectionCompound(const int16x4_t mv,
+                                      const int temporal_reference_offsets,
+                                      const int reference_offsets[2]) {
+  const int16x4_t denominator =
+      vdup_n_s16(kProjectionMvDivisionLookup[temporal_reference_offsets]);
+  const int32x2_t offset = vld1_s32(reference_offsets);
+  const int32x2x2_t offsets = vzip_s32(offset, offset);
+  const int32x4_t numerator = vcombine_s32(offsets.val[0], offsets.val[1]);
+  return MvProjection(mv, denominator, numerator);
+}
+
+inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) {
+  const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+  const int16x8_t mv = vcombine_s16(mv0, mv1);
+  const int16x8_t clamp = vminq_s16(mv, projection_mv_clamp);
+  return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int16x8_t MvProjectionCompoundClip(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offsets[2]) {
+  const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+  const int32x2_t temporal_mv = vld1_s32(tmvs);
+  const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0));
+  const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1));
+  const int16x4_t mv0 = MvProjectionCompound(
+      tmv0, temporal_reference_offsets[0], reference_offsets);
+  const int16x4_t mv1 = MvProjectionCompound(
+      tmv1, temporal_reference_offsets[1], reference_offsets);
+  return ProjectionClip(mv0, mv1);
+}
+
+inline int16x8_t MvProjectionSingleClip(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offset, int16x4_t* const lookup) {
+  const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+  const int16x8_t temporal_mv = vld1q_s16(tmvs);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[1]], *lookup, 1);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[2]], *lookup, 2);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[3]], *lookup, 3);
+  const int16x4x2_t denominator = vzip_s16(*lookup, *lookup);
+  const int16x4_t tmv0 = vget_low_s16(temporal_mv);
+  const int16x4_t tmv1 = vget_high_s16(temporal_mv);
+  const int32x4_t numerator = vdupq_n_s32(reference_offset);
+  const int16x4_t mv0 = MvProjection(tmv0, denominator.val[0], numerator);
+  const int16x4_t mv1 = MvProjection(tmv1, denominator.val[1], numerator);
+  return ProjectionClip(mv0, mv1);
+}
+
+inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
+  const int16x8_t kRoundDownMask = vdupq_n_s16(1);
+  const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+  const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+  const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask);
+  vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1);
+}
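+
+// Scalar sketch of LowPrecision(): each component is biased towards zero and
+// has its low bit cleared, i.e.
+//   mv = (mv + (mv < 0 ? 1 : 0)) & ~1;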
+
+inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
+  const int16x8_t kRoundDownMask = vdupq_n_s16(7);
+  const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+  const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+  const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3));
+  const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask);
+  vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2);
+}
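+
+// Scalar sketch of ForceInteger(): each component is rounded to a multiple of
+// 8, i.e. to a whole-pel vector in eighth-pel units:
+//   mv = (mv + (mv < 0 ? 1 : 0) + 3) & ~7;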
+
+void MvProjectionCompoundLowPrecision_NEON(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // The |reference_offsets| values are usually non-zero, so the check is
+  // omitted here.
+  // To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int loop_count = (count + 1) >> 1;
+  do {
+    const int16x8_t mv = MvProjectionCompoundClip(
+        temporal_mvs, temporal_reference_offsets, offsets);
+    LowPrecision(mv, candidate_mvs);
+    temporal_mvs += 2;
+    temporal_reference_offsets += 2;
+    candidate_mvs += 2;
+  } while (--loop_count != 0);
+}
+
+void MvProjectionCompoundForceInteger_NEON(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // The |reference_offsets| values are usually non-zero, so the check is
+  // omitted here.
+  // To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int loop_count = (count + 1) >> 1;
+  do {
+    const int16x8_t mv = MvProjectionCompoundClip(
+        temporal_mvs, temporal_reference_offsets, offsets);
+    ForceInteger(mv, candidate_mvs);
+    temporal_mvs += 2;
+    temporal_reference_offsets += 2;
+    candidate_mvs += 2;
+  } while (--loop_count != 0);
+}
+
+void MvProjectionCompoundHighPrecision_NEON(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // The |reference_offsets| values are usually non-zero, so the check is
+  // omitted here.
+  // To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int loop_count = (count + 1) >> 1;
+  do {
+    const int16x8_t mv = MvProjectionCompoundClip(
+        temporal_mvs, temporal_reference_offsets, offsets);
+    vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+    temporal_mvs += 2;
+    temporal_reference_offsets += 2;
+    candidate_mvs += 2;
+  } while (--loop_count != 0);
+}
+
+void MvProjectionSingleLowPrecision_NEON(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // Up to three more elements could be calculated.
+  int loop_count = (count + 3) >> 2;
+  int16x4_t lookup = vdup_n_s16(0);
+  do {
+    const int16x8_t mv = MvProjectionSingleClip(
+        temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+    LowPrecision(mv, candidate_mvs);
+    temporal_mvs += 4;
+    temporal_reference_offsets += 4;
+    candidate_mvs += 4;
+  } while (--loop_count != 0);
+}
+
+void MvProjectionSingleForceInteger_NEON(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // Up to three more elements could be calculated.
+  int loop_count = (count + 3) >> 2;
+  int16x4_t lookup = vdup_n_s16(0);
+  do {
+    const int16x8_t mv = MvProjectionSingleClip(
+        temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+    ForceInteger(mv, candidate_mvs);
+    temporal_mvs += 4;
+    temporal_reference_offsets += 4;
+    candidate_mvs += 4;
+  } while (--loop_count != 0);
+}
+
+void MvProjectionSingleHighPrecision_NEON(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // Up to three more elements could be calculated.
+  int loop_count = (count + 3) >> 2;
+  int16x4_t lookup = vdup_n_s16(0);
+  do {
+    const int16x8_t mv = MvProjectionSingleClip(
+        temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+    vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+    temporal_mvs += 4;
+    temporal_reference_offsets += 4;
+    candidate_mvs += 4;
+  } while (--loop_count != 0);
+}
+
+}  // namespace
+
+void MotionVectorSearchInit_NEON() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
+  dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
+  dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
+  dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
+  dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
+  dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/motion_vector_search_neon.h b/src/dsp/arm/motion_vector_search_neon.h
new file mode 100644 (file)
index 0000000..19b4519
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
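
The LIBGAV1_Dsp8bpp_MotionVectorSearch define above signals, at compile time, that a NEON version of this entry point exists, so the generic dsp setup can route the slot through the NEON init call instead of keeping the C fallback. A minimal sketch of that convention, assuming a feature probe in the spirit of src/utils/cpu.h (the probe and bit names here are illustrative stand-ins):

```cpp
#include <cstdint>

uint32_t GetCpuFeatures();           // hypothetical runtime probe
constexpr uint32_t kCpuNeon = 0x1;   // stand-in for LIBGAV1_CPU_NEON
void MotionVectorSearchInit_C();     // portable fallback, declared elsewhere
void MotionVectorSearchInit_NEON();  // declared in this header

void MotionVectorSearchInit() {
  MotionVectorSearchInit_C();  // install fallbacks first
  if ((GetCpuFeatures() & kCpuNeon) != 0) {
    MotionVectorSearchInit_NEON();  // then overwrite with the NEON versions
  }
}
```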
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc
new file mode 100644
index 0000000..271bbaa
--- /dev/null
+++ b/src/dsp/arm/obmc_neon.cc
@@ -0,0 +1,953 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+#include "src/dsp/obmc.inc"
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred,
+                           const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
+                           const uint8x8_t pred_mask,
+                           const uint8x8_t obmc_pred_mask) {
+  const uint8x8_t pred_val = Load4(pred);
+  const uint8x8_t obmc_pred_val = Load4(obmc_pred);
+  const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+  const uint8x8_t result =
+      vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+  StoreLo4(pred, result);
+}
+
+inline void WriteObmcLine8(uint8_t* LIBGAV1_RESTRICT const pred,
+                           const uint8x8_t obmc_pred_val,
+                           const uint8x8_t pred_mask,
+                           const uint8x8_t obmc_pred_mask) {
+  const uint8x8_t pred_val = vld1_u8(pred);
+  const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+  const uint8x8_t result =
+      vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+  vst1_u8(pred, result);
+}
+
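
Both helpers above compute the same per-sample operation: a weighted average of the two predictors whose 6-bit weights sum to 64, followed by a rounding shift. A scalar sketch of one sample, shown only to make the arithmetic explicit:

```cpp
#include <cstdint>

// One 8bpp OBMC sample; |mask| is in [0, 64] and a mask of 64 leaves |pred|
// unchanged. The (+ 32) >> 6 matches vrshrn_n_u16(x, 6).
inline uint8_t BlendObmcSample(uint8_t pred, uint8_t obmc_pred, uint8_t mask) {
  const uint32_t weighted =
      static_cast<uint32_t>(mask) * pred + (64u - mask) * obmc_pred;
  return static_cast<uint8_t>((weighted + 32) >> 6);
}
```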
+inline void OverlapBlendFromLeft2xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride) {
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  const uint8x8_t pred_mask = Load2(kObmcMask);
+  const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  uint8x8_t pred_val = vdup_n_u8(0);
+  uint8x8_t obmc_pred_val = vdup_n_u8(0);
+  int y = 0;
+  do {
+    pred_val = Load2<0>(pred, pred_val);
+    const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+    obmc_pred_val = Load2<0>(obmc_pred, obmc_pred_val);
+    const uint8x8_t result =
+        vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+    Store2<0>(pred, result);
+
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+  } while (++y != height);
+}
+
+inline void OverlapBlendFromLeft4xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride) {
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  const uint8x8_t pred_mask = Load4(kObmcMask + 2);
+  // 64 - mask
+  const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  int y = 0;
+  do {
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    y += 2;
+  } while (y != height);
+}
+
+inline void OverlapBlendFromLeft8xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
+  constexpr int obmc_prediction_stride = 8;
+  // 64 - mask
+  const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  int y = 0;
+  do {
+    const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+    WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+
+    WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask,
+                   obmc_pred_mask);
+    pred += prediction_stride;
+
+    obmc_pred += obmc_prediction_stride << 1;
+    y += 2;
+  } while (y != height);
+}
+
+void OverlapBlendFromLeft_NEON(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  assert(width >= 2);
+  assert(height >= 4);
+
+  if (width == 2) {
+    OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred,
+                                 obmc_prediction_stride);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
+                                 obmc_prediction_stride);
+    return;
+  }
+  if (width == 8) {
+    OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred);
+    return;
+  }
+  const uint8x16_t mask_inverter = vdupq_n_u8(64);
+  const uint8_t* mask = kObmcMask + width - 2;
+  int x = 0;
+  do {
+    pred = static_cast<uint8_t*>(prediction) + x;
+    obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+    const uint8x16_t pred_mask = vld1q_u8(mask + x);
+    // 64 - mask
+    const uint8x16_t obmc_pred_mask = vsubq_u8(mask_inverter, pred_mask);
+    int y = 0;
+    do {
+      const uint8x16_t pred_val = vld1q_u8(pred);
+      const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+      const uint16x8_t weighted_pred_lo =
+          vmull_u8(vget_low_u8(pred_mask), vget_low_u8(pred_val));
+      const uint8x8_t result_lo =
+          vrshrn_n_u16(vmlal_u8(weighted_pred_lo, vget_low_u8(obmc_pred_mask),
+                                vget_low_u8(obmc_pred_val)),
+                       6);
+      const uint16x8_t weighted_pred_hi =
+          vmull_u8(vget_high_u8(pred_mask), vget_high_u8(pred_val));
+      const uint8x8_t result_hi =
+          vrshrn_n_u16(vmlal_u8(weighted_pred_hi, vget_high_u8(obmc_pred_mask),
+                                vget_high_u8(obmc_pred_val)),
+                       6);
+      vst1q_u8(pred, vcombine_u8(result_lo, result_hi));
+
+      pred += prediction_stride;
+      obmc_pred += obmc_prediction_stride;
+    } while (++y < height);
+    x += 16;
+  } while (x < width);
+}
+
+inline void OverlapBlendFromTop4x4_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride, const int height) {
+  uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]);
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  if (height == 2) {
+    return;
+  }
+
+  pred_mask = vdup_n_u8(kObmcMask[3]);
+  obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  pred_mask = vdup_n_u8(kObmcMask[4]);
+  obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+}
+
+inline void OverlapBlendFromTop4xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride) {
+  if (height < 8) {
+    OverlapBlendFromTop4x4_NEON(pred, prediction_stride, obmc_pred,
+                                obmc_prediction_stride, height);
+    return;
+  }
+  const uint8_t* mask = kObmcMask + height - 2;
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  int y = 0;
+  // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
+  // lines are unchanged as the corresponding mask value is 64.
+  do {
+    uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+    uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    pred_mask = vdup_n_u8(mask[y + 1]);
+    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    pred_mask = vdup_n_u8(mask[y + 2]);
+    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    pred_mask = vdup_n_u8(mask[y + 3]);
+    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    pred_mask = vdup_n_u8(mask[y + 4]);
+    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    pred_mask = vdup_n_u8(mask[y + 5]);
+    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    // Advance to the next group of mask indices.
+    y += 6;
+  } while (y < height - 4);
+}
+
+inline void OverlapBlendFromTop8xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
+  constexpr int obmc_prediction_stride = 8;
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  const uint8_t* mask = kObmcMask + height - 2;
+  const int compute_height = height - (height >> 2);
+  int y = 0;
+  do {
+    const uint8x8_t pred_mask0 = vdup_n_u8(mask[y]);
+    // 64 - mask
+    const uint8x8_t obmc_pred_mask0 = vsub_u8(mask_inverter, pred_mask0);
+    const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+
+    WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask0,
+                   obmc_pred_mask0);
+    pred += prediction_stride;
+    ++y;
+
+    const uint8x8_t pred_mask1 = vdup_n_u8(mask[y]);
+    // 64 - mask
+    const uint8x8_t obmc_pred_mask1 = vsub_u8(mask_inverter, pred_mask1);
+    WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask1,
+                   obmc_pred_mask1);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
+  } while (++y < compute_height);
+}
+
+void OverlapBlendFromTop_NEON(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  assert(width >= 4);
+  assert(height >= 2);
+
+  if (width == 4) {
+    OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
+                                obmc_prediction_stride);
+    return;
+  }
+
+  if (width == 8) {
+    OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred);
+    return;
+  }
+
+  const uint8_t* mask = kObmcMask + height - 2;
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  // Stop when the mask value reaches 64; the 4xH path handles this implicitly.
+  const int compute_height = height - (height >> 2);
+  int y = 0;
+  do {
+    const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+    // 64 - mask
+    const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    int x = 0;
+    do {
+      const uint8x16_t pred_val = vld1q_u8(pred + x);
+      const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred + x);
+      const uint16x8_t weighted_pred_lo =
+          vmull_u8(pred_mask, vget_low_u8(pred_val));
+      const uint8x8_t result_lo =
+          vrshrn_n_u16(vmlal_u8(weighted_pred_lo, obmc_pred_mask,
+                                vget_low_u8(obmc_pred_val)),
+                       6);
+      const uint16x8_t weighted_pred_hi =
+          vmull_u8(pred_mask, vget_high_u8(pred_val));
+      const uint8x8_t result_hi =
+          vrshrn_n_u16(vmlal_u8(weighted_pred_hi, obmc_pred_mask,
+                                vget_high_u8(obmc_pred_val)),
+                       6);
+      vst1q_u8(pred + x, vcombine_u8(result_lo, result_hi));
+
+      x += 16;
+    } while (x < width);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+  } while (++y < compute_height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
+  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// This is a flat array of masks for each block dimension from 2 to 32. The
+// starting index for each length is length-2. The value 64 leaves the result
+// equal to |pred| and may be ignored if convenient. Vector loads may over-read
+// values meant for larger sizes, but these values will be unused.
+constexpr uint16_t kObmcMask[62] = {
+    // Obmc Mask 2
+    45, 64,
+    // Obmc Mask 4
+    39, 50, 59, 64,
+    // Obmc Mask 8
+    36, 42, 48, 53, 57, 61, 64, 64,
+    // Obmc Mask 16
+    34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64,
+    // Obmc Mask 32
+    33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
+    59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
+
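
As the comment above notes, the table is indexed by blend length, with the masks for a length-N edge starting at N - 2, and vector loads may read past a size's masks into the next size's values, which are simply unused. A small illustrative accessor mirroring the kObmcMask + length - 2 pattern used throughout this file:

```cpp
// Returns the first mask value for a blend of |length| in {2, 4, 8, 16, 32}.
inline const uint16_t* ObmcMaskForLength(const int length) {
  return kObmcMask + length - 2;
}
// e.g. ObmcMaskForLength(4)[0] == 39 and ObmcMaskForLength(4)[3] == 64.
```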
+inline uint16x4_t BlendObmc2Or4(uint16_t* const pred,
+                                const uint16x4_t obmc_pred_val,
+                                const uint16x4_t pred_mask,
+                                const uint16x4_t obmc_pred_mask) {
+  const uint16x4_t pred_val = vld1_u16(pred);
+  const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val);
+  const uint16x4_t result =
+      vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+  return result;
+}
+
+inline uint16x8_t BlendObmc8(uint16_t* LIBGAV1_RESTRICT const pred,
+                             const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
+                             const uint16x8_t pred_mask,
+                             const uint16x8_t obmc_pred_mask) {
+  const uint16x8_t pred_val = vld1q_u16(pred);
+  const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+  const uint16x8_t weighted_pred = vmulq_u16(pred_mask, pred_val);
+  const uint16x8_t result =
+      vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+  return result;
+}
+
+inline void OverlapBlendFromLeft2xH_NEON(
+    uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+  constexpr int obmc_prediction_stride = 2;
+  const uint16x4_t mask_inverter = vdup_n_u16(64);
+  // Only the first two lanes are used.
+  const uint16x4_t pred_mask = vld1_u16(kObmcMask);
+  const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
+  int y = 0;
+  do {
+    const uint16x4_t obmc_pred_0 = vld1_u16(obmc_pred);
+    const uint16x4_t result_0 =
+        BlendObmc2Or4(pred, obmc_pred_0, pred_mask, obmc_pred_mask);
+    Store2<0>(pred, result_0);
+
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred += obmc_prediction_stride;
+
+    const uint16x4_t obmc_pred_1 = vld1_u16(obmc_pred);
+    const uint16x4_t result_1 =
+        BlendObmc2Or4(pred, obmc_pred_1, pred_mask, obmc_pred_mask);
+    Store2<0>(pred, result_1);
+
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred += obmc_prediction_stride;
+
+    y += 2;
+  } while (y != height);
+}
+
+inline void OverlapBlendFromLeft4xH_NEON(
+    uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+  constexpr int obmc_prediction_stride = 4;
+  const uint16x4_t mask_inverter = vdup_n_u16(64);
+  const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2);
+  // 64 - mask
+  const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
+  int y = 0;
+  do {
+    const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+    const uint16x4_t result_0 = BlendObmc2Or4(pred, vget_low_u16(obmc_pred_val),
+                                              pred_mask, obmc_pred_mask);
+    vst1_u16(pred, result_0);
+    pred = AddByteStride(pred, prediction_stride);
+
+    const uint16x4_t result_1 = BlendObmc2Or4(
+        pred, vget_high_u16(obmc_pred_val), pred_mask, obmc_pred_mask);
+    vst1_u16(pred, result_1);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred += obmc_prediction_stride << 1;
+
+    y += 2;
+  } while (y != height);
+}
+
+void OverlapBlendFromLeft_NEON(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint16_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+  assert(width >= 2);
+  assert(height >= 4);
+
+  if (width == 2) {
+    OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred);
+    return;
+  }
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  const uint16_t* mask = kObmcMask + width - 2;
+  int x = 0;
+  do {
+    uint16_t* pred_x = pred + x;
+    const uint16_t* obmc_pred_x = obmc_pred + x;
+    const uint16x8_t pred_mask = vld1q_u16(mask + x);
+    // 64 - mask
+    const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+    int y = 0;
+    do {
+      const uint16x8_t result =
+          BlendObmc8(pred_x, obmc_pred_x, pred_mask, obmc_pred_mask);
+      vst1q_u16(pred_x, result);
+
+      pred_x = AddByteStride(pred_x, prediction_stride);
+      obmc_pred_x = AddByteStride(obmc_pred_x, obmc_prediction_stride);
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
+
+template <int lane>
+inline uint16x4_t BlendObmcFromTop4(uint16_t* const pred,
+                                    const uint16x4_t obmc_pred_val,
+                                    const uint16x8_t pred_mask,
+                                    const uint16x8_t obmc_pred_mask) {
+  const uint16x4_t pred_val = vld1_u16(pred);
+  const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask);
+  const uint16x4_t result = vrshr_n_u16(
+      VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
+  return result;
+}
+
+template <int lane>
+inline uint16x8_t BlendObmcFromTop8(
+    uint16_t* LIBGAV1_RESTRICT const pred,
+    const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
+    const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) {
+  const uint16x8_t pred_val = vld1q_u16(pred);
+  const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+  const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask);
+  const uint16x8_t result = vrshrq_n_u16(
+      VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
+  return result;
+}
+
+inline void OverlapBlendFromTop4x2Or4_NEON(
+    uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
+  constexpr int obmc_prediction_stride = 4;
+  const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]);
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+  const uint16x8_t obmc_pred_val_0 = vld1q_u16(obmc_pred);
+  uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val_0),
+                                           pred_mask, obmc_pred_mask);
+  vst1_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+
+  if (height == 2) {
+    // Mask value is 64, meaning |pred| is unchanged.
+    return;
+  }
+
+  result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val_0), pred_mask,
+                                obmc_pred_mask);
+  vst1_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride << 1;
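
A hedged sketch of the region test implied by the diagram, using the seven-sample reach of the 8-tap filter around the block center; ix4 and iy4 stand for the integer source coordinates of the center, and the exact bounds in the shipped code may differ slightly:

```cpp
// True when every sample the horizontal (or vertical) filter would read
// lies outside the frame, i.e. regions 1 and 2 (or 1 and 3) above.
inline bool WarpBlockOutsideX(const int ix4, const int source_width) {
  return ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0;
}
inline bool WarpBlockOutsideY(const int iy4, const int source_height) {
  return iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0;
}
```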
+
+  const uint16x4_t obmc_pred_val_2 = vld1_u16(obmc_pred);
+  result =
+      BlendObmcFromTop4<2>(pred, obmc_pred_val_2, pred_mask, obmc_pred_mask);
+  vst1_u16(pred, result);
+}
+
+inline void OverlapBlendFromTop4xH_NEON(
+    uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+  if (height < 8) {
+    OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, height);
+    return;
+  }
+  constexpr int obmc_prediction_stride = 4;
+  const uint16_t* mask = kObmcMask + height - 2;
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  int y = 0;
+  // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
+  // lines are unchanged as the corresponding mask value is 64.
+  do {
+    const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
+    const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+    // Load obmc row 0, 1.
+    uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+    uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val),
+                                             pred_mask, obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+
+    result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+                                  obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred += obmc_prediction_stride << 1;
+
+    // Load obmc row 2, 3.
+    obmc_pred_val = vld1q_u16(obmc_pred);
+    result = BlendObmcFromTop4<2>(pred, vget_low_u16(obmc_pred_val), pred_mask,
+                                  obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+
+    result = BlendObmcFromTop4<3>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+                                  obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred += obmc_prediction_stride << 1;
+
+    // Load obmc row 4, 5.
+    obmc_pred_val = vld1q_u16(obmc_pred);
+    result = BlendObmcFromTop4<4>(pred, vget_low_u16(obmc_pred_val), pred_mask,
+                                  obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+
+    result = BlendObmcFromTop4<5>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+                                  obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred += obmc_prediction_stride << 1;
+
+    // Advance to the next group of mask indices.
+    y += 6;
+  } while (y < height - 4);
+}
+
+inline void OverlapBlendFromTop8xH_NEON(
+    uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
+  const uint16_t* mask = kObmcMask + height - 2;
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  uint16x8_t pred_mask = vld1q_u16(mask);
+  uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+  uint16x8_t result =
+      BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  if (height == 2) return;
+
+  constexpr int obmc_prediction_stride = 8;
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  if (height == 4) return;
+
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+
+  if (height == 8) return;
+
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  pred_mask = vld1q_u16(&mask[8]);
+  obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+
+  result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+
+  if (height == 16) return;
+
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  pred_mask = vld1q_u16(&mask[16]);
+  obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+
+  result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(pred, result);
+}
+
+void OverlapBlendFromTop_NEON(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint16_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+  assert(width >= 4);
+  assert(height >= 2);
+
+  if (width == 4) {
+    OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred);
+    return;
+  }
+
+  if (width == 8) {
+    OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, height);
+    return;
+  }
+
+  const uint16_t* mask = kObmcMask + height - 2;
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  const uint16x8_t pred_mask = vld1q_u16(mask);
+  // 64 - mask
+  const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+#define OBMC_ROW_FROM_TOP(n)                                   \
+  do {                                                         \
+    int x = 0;                                                 \
+    do {                                                       \
+      const uint16x8_t result = BlendObmcFromTop8<n>(          \
+          pred + x, obmc_pred + x, pred_mask, obmc_pred_mask); \
+      vst1q_u16(pred + x, result);                             \
+                                                               \
+      x += 8;                                                  \
+    } while (x < width);                                       \
+  } while (false)
+
+  // Compute 1 row.
+  if (height == 2) {
+    OBMC_ROW_FROM_TOP(0);
+    return;
+  }
+
+  // Compute 3 rows.
+  if (height == 4) {
+    OBMC_ROW_FROM_TOP(0);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(1);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(2);
+    return;
+  }
+
+  // Compute 6 rows.
+  if (height == 8) {
+    OBMC_ROW_FROM_TOP(0);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(1);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(2);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(3);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(4);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(5);
+    return;
+  }
+
+  // Compute 12 rows.
+  if (height == 16) {
+    OBMC_ROW_FROM_TOP(0);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(1);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(2);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(3);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(4);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(5);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(6);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(7);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+
+    const uint16x8_t pred_mask = vld1q_u16(&mask[8]);
+    // 64 - mask
+    const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+    OBMC_ROW_FROM_TOP(0);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(1);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(2);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(3);
+    return;
+  }
+
+  // Stop when the mask value becomes 64. The remaining compute_height is a
+  // multiple of 8 for heights 32 and 64.
+  const int compute_height = height - (height >> 2);
+  int y = 0;
+  do {
+    const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
+    // 64 - mask
+    const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+    OBMC_ROW_FROM_TOP(0);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(1);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(2);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(3);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(4);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(5);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(6);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+    OBMC_ROW_FROM_TOP(7);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+
+    y += 8;
+  } while (y < compute_height);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
+  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void ObmcInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void ObmcInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
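
Once ObmcInit_NEON() has run, callers reach these kernels through the Dsp::obmc_blend table rather than by symbol. A hedged usage sketch, with the decoder plumbing around it omitted:

```cpp
// Blend a 16x16 8bpp block with the prediction from the block above it.
void BlendWithTopNeighbor(const Dsp& dsp, uint8_t* pred, ptrdiff_t stride,
                          const uint8_t* obmc, ptrdiff_t obmc_stride) {
  dsp.obmc_blend[kObmcDirectionVertical](pred, stride, /*width=*/16,
                                         /*height=*/16, obmc, obmc_stride);
}
```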
diff --git a/src/dsp/arm/obmc_neon.h b/src/dsp/arm/obmc_neon.h
new file mode 100644
index 0000000..788017e
--- /dev/null
+++ b/src/dsp/arm/obmc_neon.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend. This function is not thread-safe.
+void ObmcInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If NEON is enabled, signal the NEON implementation should be used.
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ObmcVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ObmcHorizontal LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc
new file mode 100644
index 0000000..2f8dde6
--- /dev/null
+++ b/src/dsp/arm/super_res_neon.cc
@@ -0,0 +1,318 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+void SuperResCoefficients_NEON(const int upscaled_width,
+                               const int initial_subpixel_x, const int step,
+                               void* const coefficients) {
+  auto* dst = static_cast<uint8_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 3);
+  do {
+    uint8x8_t filter[8];
+    uint8x16_t d[kSuperResFilterTaps / 2];
+    for (int i = 0; i < 8; ++i, subpixel_x += step) {
+      filter[i] =
+          vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+                                         kSuperResExtraBits]);
+    }
+    Transpose8x8(filter, d);
+    vst1q_u8(dst, d[0]);
+    dst += 16;
+    vst1q_u8(dst, d[1]);
+    dst += 16;
+    vst1q_u8(dst, d[2]);
+    dst += 16;
+    vst1q_u8(dst, d[3]);
+    dst += 16;
+  } while (--x != 0);
+}
+
+// Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
+// Maximum sum: 255*171 == 0xAA55
+// The sum is clipped to [0, 255], so adding all positive taps and then
+// subtracting all negative taps with saturation is sufficient.
+//           0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
+                          const uint8_t** coefficients) {
+  uint8x16_t f[kSuperResFilterTaps / 2];
+  for (int i = 0; i < kSuperResFilterTaps / 2; ++i, *coefficients += 16) {
+    f[i] = vld1q_u8(*coefficients);
+  }
+  uint16x8_t res = vmull_u8(src[1], vget_high_u8(f[0]));
+  res = vmlal_u8(res, src[3], vget_high_u8(f[1]));
+  res = vmlal_u8(res, src[4], vget_low_u8(f[2]));
+  res = vmlal_u8(res, src[6], vget_low_u8(f[3]));
+  uint16x8_t temp = vmull_u8(src[0], vget_low_u8(f[0]));
+  temp = vmlal_u8(temp, src[2], vget_low_u8(f[1]));
+  temp = vmlal_u8(temp, src[5], vget_high_u8(f[2]));
+  temp = vmlal_u8(temp, src[7], vget_high_u8(f[3]));
+  res = vqsubq_u16(res, temp);
+  return vqrshrn_n_u16(res, kFilterBits);
+}
+
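
A scalar restatement of the trick described above, assuming signed taps in place of the positionally signed unsigned table the NEON path loads, and assuming kFilterBits == 7; this is a sketch of the arithmetic, not the shipped routine:

```cpp
#include <cstdint>

inline uint8_t SuperResSampleScalar(const uint8_t src[8],
                                    const int8_t taps[8]) {
  uint32_t positive = 0;
  uint32_t negative = 0;
  for (int k = 0; k < 8; ++k) {
    if (taps[k] >= 0) {
      positive += static_cast<uint32_t>(taps[k]) * src[k];
    } else {
      negative += static_cast<uint32_t>(-taps[k]) * src[k];
    }
  }
  // Saturating subtract clamps the low end at zero, as vqsubq_u16 does.
  const uint32_t sum = (positive > negative) ? positive - negative : 0;
  // Rounded shift by 7 with an 8-bit clamp, matching vqrshrn_n_u16.
  const uint32_t shifted = (sum + 64) >> 7;
  return static_cast<uint8_t>(shifted > 255 ? 255 : shifted);
}
```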
+void SuperRes_NEON(const void* LIBGAV1_RESTRICT const coefficients,
+                   void* LIBGAV1_RESTRICT const source,
+                   const ptrdiff_t source_stride, const int height,
+                   const int downscaled_width, const int upscaled_width,
+                   const int initial_subpixel_x, const int step,
+                   void* LIBGAV1_RESTRICT const dest,
+                   const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint8_t*>(coefficients);
+    uint8_t* dst_ptr = dst;
+#if LIBGAV1_MSAN
+    // Initialize the padding area to prevent msan warnings.
+    const int super_res_right_border = kSuperResHorizontalPadding;
+#else
+    const int super_res_right_border = kSuperResHorizontalBorder;
+#endif
+    ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                        kSuperResHorizontalBorder, super_res_right_border);
+    int subpixel_x = initial_subpixel_x;
+    uint8x8_t sr[8];
+    uint8x16_t s[8];
+    int x = RightShiftWithCeiling(upscaled_width, 4);
+    // The code below calculates up to 15 extra upscaled pixels, which
+    // over-read up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalPadding accounts for this.
+    do {
+      for (int i = 0; i < 8; ++i, subpixel_x += step) {
+        sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+      }
+      for (int i = 0; i < 8; ++i, subpixel_x += step) {
+        const uint8x8_t s_hi = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+        s[i] = vcombine_u8(sr[i], s_hi);
+      }
+      Transpose8x16(s);
+      // Do not use a loop for the following 8 instructions, since the
+      // compiler would generate redundant code.
+      sr[0] = vget_low_u8(s[0]);
+      sr[1] = vget_low_u8(s[1]);
+      sr[2] = vget_low_u8(s[2]);
+      sr[3] = vget_low_u8(s[3]);
+      sr[4] = vget_low_u8(s[4]);
+      sr[5] = vget_low_u8(s[5]);
+      sr[6] = vget_low_u8(s[6]);
+      sr[7] = vget_low_u8(s[7]);
+      const uint8x8_t d0 = SuperRes(sr, &filter);
+      // Do not use a loop for the following 8 instructions, since the
+      // compiler would generate redundant code.
+      sr[0] = vget_high_u8(s[0]);
+      sr[1] = vget_high_u8(s[1]);
+      sr[2] = vget_high_u8(s[2]);
+      sr[3] = vget_high_u8(s[3]);
+      sr[4] = vget_high_u8(s[4]);
+      sr[5] = vget_high_u8(s[5]);
+      sr[6] = vget_high_u8(s[6]);
+      sr[7] = vget_high_u8(s[7]);
+      const uint8x8_t d1 = SuperRes(sr, &filter);
+      vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
+      dst_ptr += 16;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
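
The source indexing above is a fixed-point walk: subpixel_x carries kSuperResScaleBits of fraction, its integer part selects the downscaled column, and the fractional part selects one of the precomputed kernels. A standalone sketch of that walk, with the scale constants passed in as parameters since their values are assumed rather than restated here:

```cpp
// Illustrative only: mirrors the indexing in the NEON loops above.
void SubpixelWalk(const int initial_subpixel_x, const int step,
                  const int upscaled_width, const int scale_bits,
                  const int scale_mask, const int extra_bits) {
  int subpixel_x = initial_subpixel_x;
  for (int x = 0; x < upscaled_width; ++x, subpixel_x += step) {
    const int source_column = subpixel_x >> scale_bits;  // downscaled index
    const int filter_index = (subpixel_x & scale_mask) >> extra_bits;
    static_cast<void>(source_column);  // would address the source row
    static_cast<void>(filter_index);   // would pick a filter kernel
  }
}
```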
+void Init8bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = SuperResCoefficients_NEON;
+  dsp->super_res = SuperRes_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void SuperResCoefficients_NEON(const int upscaled_width,
+                               const int initial_subpixel_x, const int step,
+                               void* const coefficients) {
+  auto* dst = static_cast<uint16_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 3);
+  do {
+    uint16x8_t filter[8];
+    for (int i = 0; i < 8; ++i, subpixel_x += step) {
+      const uint8x8_t filter_8 =
+          vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+                                         kSuperResExtraBits]);
+      // uint8_t -> uint16_t
+      filter[i] = vmovl_u8(filter_8);
+    }
+
+    Transpose8x8(filter);
+
+    vst1q_u16(dst, filter[0]);
+    dst += 8;
+    vst1q_u16(dst, filter[1]);
+    dst += 8;
+    vst1q_u16(dst, filter[2]);
+    dst += 8;
+    vst1q_u16(dst, filter[3]);
+    dst += 8;
+    vst1q_u16(dst, filter[4]);
+    dst += 8;
+    vst1q_u16(dst, filter[5]);
+    dst += 8;
+    vst1q_u16(dst, filter[6]);
+    dst += 8;
+    vst1q_u16(dst, filter[7]);
+    dst += 8;
+  } while (--x != 0);
+}
+
+// The sum is clipped to [0, (1 << bitdepth) - 1]. Adding all positive taps and
+// then subtracting all negative taps with saturation clamps the low end at
+// zero; the upper bound is applied separately below.
+//           0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
+                           const uint16_t** coefficients, int bitdepth) {
+  uint16x8_t f[kSuperResFilterTaps];
+  for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) {
+    f[i] = vld1q_u16(*coefficients);
+  }
+
+  uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1]));
+  res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3]));
+  res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4]));
+  res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6]));
+
+  uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0]));
+  temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2]));
+  temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5]));
+  temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7]));
+
+  res_lo = vqsubq_u32(res_lo, temp_lo);
+
+  uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1]));
+  res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3]));
+  res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4]));
+  res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6]));
+
+  uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0]));
+  temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2]));
+  temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5]));
+  temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7]));
+
+  res_hi = vqsubq_u32(res_hi, temp_hi);
+
+  const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits),
+                                      vqrshrn_n_u32(res_hi, kFilterBits));
+
+  // Clip the result at (1 << bd) - 1.
+  return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1));
+}
+
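
A quick arithmetic check of why this path widens to 32 bits where the 8bpp version stayed in 16: taking the positive-tap bound of 171 from the 8bpp comment, 10-bit samples can push a partial sum to 171 * 1023 = 174933, which no longer fits in uint16_t, hence vmull_u16/vmlal_u16 into uint32x4_t.

```cpp
// Compile-time restatement of the overflow argument above.
static_assert(171 * 1023 > 65535,
              "10bpp partial sums can overflow a 16-bit accumulator");
```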
+template <int bitdepth>
+void SuperRes_NEON(const void* LIBGAV1_RESTRICT const coefficients,
+                   void* LIBGAV1_RESTRICT const source,
+                   const ptrdiff_t source_stride, const int height,
+                   const int downscaled_width, const int upscaled_width,
+                   const int initial_subpixel_x, const int step,
+                   void* LIBGAV1_RESTRICT const dest,
+                   const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint16_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint16_t*>(coefficients);
+    uint16_t* dst_ptr = dst;
+#if LIBGAV1_MSAN
+    // Initialize the padding area to prevent msan warnings.
+    const int super_res_right_border = kSuperResHorizontalPadding;
+#else
+    const int super_res_right_border = kSuperResHorizontalBorder;
+#endif
+    ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                         kSuperResHorizontalBorder, super_res_right_border);
+    int subpixel_x = initial_subpixel_x;
+    uint16x8_t sr[8];
+    int x = RightShiftWithCeiling(upscaled_width, 3);
+    // The code below calculates up to 7 extra upscaled pixels, which
+    // over-read up to 7 downscaled pixels at the end of each row.
+    // kSuperResHorizontalBorder accounts for this.
+    do {
+      for (int i = 0; i < 8; ++i, subpixel_x += step) {
+        sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]);
+      }
+
+      Transpose8x8(sr);
+
+      const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth);
+      vst1q_u16(dst_ptr, d0);
+      dst_ptr += 8;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = SuperResCoefficients_NEON;
+  dsp->super_res = SuperRes_NEON<10>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
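
Both bitdepth paths size their loops with RightShiftWithCeiling, i.e. the number of vector batches needed to cover a row. A minimal sketch of the assumed semantics (the real helper is presumably provided via src/utils/common.h, included above):

```cpp
// Assumed behavior: ceil(value / 2^bits). A 1920-wide row then needs 120
// sixteen-pixel batches at 8bpp and 240 eight-pixel batches at 10bpp.
constexpr int RightShiftWithCeilingSketch(const int value, const int bits) {
  return (value + (1 << bits) - 1) >> bits;
}
static_assert(RightShiftWithCeilingSketch(1920, 4) == 120, "8bpp batches");
static_assert(RightShiftWithCeilingSketch(1920, 3) == 240, "10bpp batches");
```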
diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h
new file mode 100644
index 0000000..65e48c5
--- /dev/null
+++ b/src/dsp/arm/super_res_neon.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res. This function is not thread-safe.
+void SuperResInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc
new file mode 100644
index 0000000..da380b1
--- /dev/null
+++ b/src/dsp/arm/warp_neon.cc
@@ -0,0 +1,907 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+    (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+//
+// src_row_centered contains 16 "centered" samples of a source row. (We center
+// the samples by subtracting 128 from each one.)
+void HorizontalFilter(const int sx4, const int16_t alpha,
+                      const int8x16_t src_row_centered,
+                      int16_t intermediate_result_row[8]) {
+  int sx = sx4 - MultiplyBy4(alpha);
+  int8x8_t filter[8];
+  for (auto& f : filter) {
+    const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+                       kWarpedPixelPrecisionShifts;
+    f = vld1_s8(kWarpedFilters8[offset]);
+    sx += alpha;
+  }
+  Transpose8x8(filter);
+  // Add kFirstPassOffset to ensure |sum| stays within uint16_t.
+  // Add 128 (offset) * 128 (filter sum) (also 1 << 14) to account for the
+  // centering of the source samples. These combined are 1 << 15 or -32768.
+  int16x8_t sum =
+      vdupq_n_s16(static_cast<int16_t>(kFirstPassOffset + 128 * 128));
+  // Unrolled k = 0..7 loop. We need to manually unroll the loop because the
+  // third argument (an index value) to vextq_s8() must be a constant
+  // (immediate). src_row_window is a sliding window of length 8 into
+  // src_row_centered.
+  // k = 0.
+  int8x8_t src_row_window = vget_low_s8(src_row_centered);
+  sum = vmlal_s8(sum, filter[0], src_row_window);
+  // k = 1.
+  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 1));
+  sum = vmlal_s8(sum, filter[1], src_row_window);
+  // k = 2.
+  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 2));
+  sum = vmlal_s8(sum, filter[2], src_row_window);
+  // k = 3.
+  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 3));
+  sum = vmlal_s8(sum, filter[3], src_row_window);
+  // k = 4.
+  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 4));
+  sum = vmlal_s8(sum, filter[4], src_row_window);
+  // k = 5.
+  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 5));
+  sum = vmlal_s8(sum, filter[5], src_row_window);
+  // k = 6.
+  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 6));
+  sum = vmlal_s8(sum, filter[6], src_row_window);
+  // k = 7.
+  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 7));
+  sum = vmlal_s8(sum, filter[7], src_row_window);
+  // End of unrolled k = 0..7 loop.
+  // Due to the offset |sum| is guaranteed to be unsigned.
+  uint16x8_t sum_unsigned = vreinterpretq_u16_s16(sum);
+  sum_unsigned = vrshrq_n_u16(sum_unsigned, kInterRoundBitsHorizontal);
+  // After the shift |sum_unsigned| will fit into int16_t.
+  vst1q_s16(intermediate_result_row, vreinterpretq_s16_u16(sum_unsigned));
+}
+
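
The centering lets the row be filtered with the signed 8-bit multiply-accumulate: each sample is biased by -128 so it fits in int8, and since the eight taps sum to 128 (per the comments above), the bias contributes a constant 128 * 128 that is simply folded into the accumulator's initial value. A scalar restatement of the identity:

```cpp
#include <cstdint>

// sum(tap[k] * sample[k]) == sum(tap[k] * (sample[k] - 128)) + 128 * 128
// whenever the taps sum to 128.
inline int32_t FilterViaCenteredSamples(const uint8_t sample[8],
                                        const int8_t tap[8]) {
  int32_t sum = 128 * 128;  // restores the -128 centering bias
  for (int k = 0; k < 8; ++k) {
    sum += tap[k] * (static_cast<int32_t>(sample[k]) - 128);
  }
  return sum;  // equals the unbiased dot product
}
```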
+template <bool is_compound>
+void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
+               const ptrdiff_t source_stride, const int source_width,
+               const int source_height,
+               const int* LIBGAV1_RESTRICT const warp_params,
+               const int subsampling_x, const int subsampling_y,
+               const int block_start_x, const int block_start_y,
+               const int block_width, const int block_height,
+               const int16_t alpha, const int16_t beta, const int16_t gamma,
+               const int16_t delta, void* LIBGAV1_RESTRICT dest,
+               const ptrdiff_t dest_stride) {
+  constexpr int kRoundBitsVertical =
+      is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+  union {
+    // Intermediate_result is the output of the horizontal filtering and
+    // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+    // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+    // type so that we can multiply it by kWarpedFilters (which has signed
+    // values) using vmlal_s16().
+    int16_t intermediate_result[15][8];  // 15 rows, 8 columns.
+    // In the simple special cases where the samples in each row are all the
+    // same, store one sample per row in a column vector.
+    int16_t intermediate_result_column[15];
+  };
+
+  const auto* const src = static_cast<const uint8_t*>(source);
+  using DestType =
+      typename std::conditional<is_compound, int16_t, uint8_t>::type;
+  auto* dst = static_cast<DestType*>(dest);
+
+  assert(block_width >= 8);
+  assert(block_height >= 8);
+
+  // The warp process is applied to each 8x8 block.
+  int start_y = block_start_y;
+  do {
+    int start_x = block_start_x;
+    do {
+      const int src_x = (start_x + 4) << subsampling_x;
+      const int src_y = (start_y + 4) << subsampling_y;
+      const WarpFilterParams filter_params = GetWarpFilterParams(
+          src_x, src_y, subsampling_x, subsampling_y, warp_params);
+      // A prediction block may fall outside the frame's boundaries. If a
+      // prediction block is calculated using only samples outside the frame's
+      // boundary, the filtering can be simplified. We can divide the plane
+      // into several regions and handle them differently.
+      //
+      //                |           |
+      //            1   |     3     |   1
+      //                |           |
+      //         -------+-----------+-------
+      //                |***********|
+      //            2   |*****4*****|   2
+      //                |***********|
+      //         -------+-----------+-------
+      //                |           |
+      //            1   |     3     |   1
+      //                |           |
+      //
+      // At the center, region 4 represents the frame and is the general case.
+      //
+      // In regions 1 and 2, the prediction block is outside the frame's
+      // boundary horizontally. Therefore the horizontal filtering can be
+      // simplified. Furthermore, in region 1 (at the four corners), the
+      // prediction is outside the frame's boundary both horizontally and
+      // vertically, so we get a constant prediction block.
+      //
+      // In region 3, the prediction block is outside the frame's boundary
+      // vertically. Unfortunately because we apply the horizontal filters
+      // first, by the time we apply the vertical filters, they no longer see
+      // simple inputs. So the only simplification is that all the rows are
+      // the same, but we still need to apply all the horizontal and vertical
+      // filters.
+
+      // Check for two simple special cases, where the horizontal filter can
+      // be significantly simplified.
+      //
+      // In general, for each row, the horizontal filter is calculated as
+      // follows:
+      //   for (int x = -4; x < 4; ++x) {
+      //     const int offset = ...;
+      //     int sum = first_pass_offset;
+      //     for (int k = 0; k < 8; ++k) {
+      //       const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+      //       sum += kWarpedFilters[offset][k] * src_row[column];
+      //     }
+      //     ...
+      //   }
+      // The column index before clipping, ix4 + x + k - 3, varies in the range
+      // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+      // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+      // border index (source_width - 1 or 0, respectively). Then for each x,
+      // the inner for loop of the horizontal filter is reduced to multiplying
+      // the border pixel by the sum of the filter coefficients.
+      if (filter_params.ix4 - 7 >= source_width - 1 ||
+          filter_params.ix4 + 7 <= 0) {
+        // Regions 1 and 2.
+        // Points to the left or right border of the first row of |src|.
+        const uint8_t* first_row_border =
+            (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
+        // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+        //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+        // In two special cases, iy4 + y is clipped to either 0 or
+        // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+        // bounded and we can avoid clipping iy4 + y by relying on a reference
+        // frame's boundary extension on the top and bottom.
+        if (filter_params.iy4 - 7 >= source_height - 1 ||
+            filter_params.iy4 + 7 <= 0) {
+          // Region 1.
+          // Every sample used to calculate the prediction block has the same
+          // value. So the whole prediction block has the same value.
+          const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+          const uint8_t row_border_pixel =
+              first_row_border[row * source_stride];
+
+          DestType* dst_row = dst + start_x - block_start_x;
+          for (int y = 0; y < 8; ++y) {
+            if (is_compound) {
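+              // Compound output keeps the intermediate compound precision:
+              // the pixel is shifted left by kInterRoundBitsVertical -
+              // kRoundBitsVertical (4 bits here) instead of stored directly.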
+              const int16x8_t sum =
+                  vdupq_n_s16(row_border_pixel << (kInterRoundBitsVertical -
+                                                   kRoundBitsVertical));
+              vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+            } else {
+              memset(dst_row, row_border_pixel, 8);
+            }
+            dst_row += dest_stride;
+          }
+          // End of region 1. Continue the |start_x| do-while loop.
+          start_x += 8;
+          continue;
+        }
+
+        // Region 2.
+        // Horizontal filter.
+        // The input values in this region are generated by extending the border
+        // which makes them identical in the horizontal direction. This
+        // computation could be inlined in the vertical pass but most
+        // implementations will need a transpose of some sort.
+        // It is not necessary to use the offset values here because the
+        // horizontal pass is a simple shift and the vertical pass will always
+        // require using 32 bits.
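+        // Each row therefore reduces to border_pixel * 128 (the sum of the
+        // filter taps) >> kInterRoundBitsHorizontal, i.e. border_pixel <<
+        // (kFilterBits - kInterRoundBitsHorizontal).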
+        for (int y = -7; y < 8; ++y) {
+          // We may over-read up to 13 pixels above the top source row, or up
+          // to 13 pixels below the bottom source row. This is proved in
+          // warp.cc.
+          const int row = filter_params.iy4 + y;
+          int sum = first_row_border[row * source_stride];
+          sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+          intermediate_result_column[y + 7] = sum;
+        }
+        // Vertical filter.
+        DestType* dst_row = dst + start_x - block_start_x;
+        int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  MultiplyBy4(delta);
+        for (int y = 0; y < 8; ++y) {
+          int sy = sy4 - MultiplyBy4(gamma);
+#if defined(__aarch64__)
+          const int16x8_t intermediate =
+              vld1q_s16(&intermediate_result_column[y]);
+          int16_t tmp[8];
+          for (int x = 0; x < 8; ++x) {
+            const int offset =
+                RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                kWarpedPixelPrecisionShifts;
+            const int16x8_t filter = vld1q_s16(kWarpedFilters[offset]);
+            const int32x4_t product_low =
+                vmull_s16(vget_low_s16(filter), vget_low_s16(intermediate));
+            const int32x4_t product_high =
+                vmull_s16(vget_high_s16(filter), vget_high_s16(intermediate));
+            // vaddvq_s32 is only available on __aarch64__.
+            const int32_t sum =
+                vaddvq_s32(product_low) + vaddvq_s32(product_high);
+            const int16_t sum_descale =
+                RightShiftWithRounding(sum, kRoundBitsVertical);
+            if (is_compound) {
+              dst_row[x] = sum_descale;
+            } else {
+              tmp[x] = sum_descale;
+            }
+            sy += gamma;
+          }
+          if (!is_compound) {
+            const int16x8_t sum = vld1q_s16(tmp);
+            vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+          }
+#else   // !defined(__aarch64__)
+          int16x8_t filter[8];
+          for (int x = 0; x < 8; ++x) {
+            const int offset =
+                RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                kWarpedPixelPrecisionShifts;
+            filter[x] = vld1q_s16(kWarpedFilters[offset]);
+            sy += gamma;
+          }
+          Transpose8x8(filter);
+          int32x4_t sum_low = vdupq_n_s32(0);
+          int32x4_t sum_high = sum_low;
+          for (int k = 0; k < 8; ++k) {
+            const int16_t intermediate = intermediate_result_column[y + k];
+            sum_low =
+                vmlal_n_s16(sum_low, vget_low_s16(filter[k]), intermediate);
+            sum_high =
+                vmlal_n_s16(sum_high, vget_high_s16(filter[k]), intermediate);
+          }
+          const int16x8_t sum =
+              vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+                           vrshrn_n_s32(sum_high, kRoundBitsVertical));
+          if (is_compound) {
+            vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+          } else {
+            vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+          }
+#endif  // defined(__aarch64__)
+          dst_row += dest_stride;
+          sy4 += delta;
+        }
+        // End of region 2. Continue the |start_x| do-while loop.
+        start_x += 8;
+        continue;
+      }
+
+      // Regions 3 and 4.
+      // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+      // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+      //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+      // In two special cases, iy4 + y is clipped to either 0 or
+      // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+      // bounded and we can avoid clipping iy4 + y by relying on a reference
+      // frame's boundary extension on the top and bottom.
+      if (filter_params.iy4 - 7 >= source_height - 1 ||
+          filter_params.iy4 + 7 <= 0) {
+        // Region 3.
+        // Horizontal filter.
+        const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+        const uint8_t* const src_row = src + row * source_stride;
+        // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+        // read but is ignored.
+        //
+        // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+        // bytes after src_row[source_width - 1]. We assume the source frame
+        // has left and right borders of at least 13 bytes that extend the
+        // frame boundary pixels. We also assume there is at least one extra
+        // padding byte after the right border of the last source row.
+        const uint8x16_t src_row_v = vld1q_u8(&src_row[filter_params.ix4 - 7]);
+        // Convert src_row_v to int8 (subtract 128).
+        const int8x16_t src_row_centered =
+            vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
+        for (int y = -7; y < 8; ++y) {
+          HorizontalFilter(sx4, alpha, src_row_centered,
+                           intermediate_result[y + 7]);
+          sx4 += beta;
+        }
+      } else {
+        // Region 4.
+        // Horizontal filter.
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
+        for (int y = -7; y < 8; ++y) {
+          // We may over-read up to 13 pixels above the top source row, or up
+          // to 13 pixels below the bottom source row. This is proved in
+          // warp.cc.
+          const int row = filter_params.iy4 + y;
+          const uint8_t* const src_row = src + row * source_stride;
+          // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+          // read but is ignored.
+          //
+          // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+          // bytes after src_row[source_width - 1]. We assume the source frame
+          // has left and right borders of at least 13 bytes that extend the
+          // frame boundary pixels. We also assume there is at least one extra
+          // padding byte after the right border of the last source row.
+          const uint8x16_t src_row_v =
+              vld1q_u8(&src_row[filter_params.ix4 - 7]);
+          // Convert src_row_v to int8 (subtract 128).
+          const int8x16_t src_row_centered =
+              vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+          HorizontalFilter(sx4, alpha, src_row_centered,
+                           intermediate_result[y + 7]);
+          sx4 += beta;
+        }
+      }
+
+      // Regions 3 and 4.
+      // Vertical filter.
+      DestType* dst_row = dst + start_x - block_start_x;
+      int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                MultiplyBy4(delta);
+      for (int y = 0; y < 8; ++y) {
+        int sy = sy4 - MultiplyBy4(gamma);
+        int16x8_t filter[8];
+        for (auto& f : filter) {
+          const int offset =
+              RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+              kWarpedPixelPrecisionShifts;
+          f = vld1q_s16(kWarpedFilters[offset]);
+          sy += gamma;
+        }
+        Transpose8x8(filter);
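+        // Seed the accumulators with -kOffsetRemoval so the first-pass
+        // offsets carried by the eight intermediate rows cancel out of the
+        // vertical sum.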
+        int32x4_t sum_low = vdupq_n_s32(-kOffsetRemoval);
+        int32x4_t sum_high = sum_low;
+        for (int k = 0; k < 8; ++k) {
+          const int16x8_t intermediate = vld1q_s16(intermediate_result[y + k]);
+          sum_low = vmlal_s16(sum_low, vget_low_s16(filter[k]),
+                              vget_low_s16(intermediate));
+          sum_high = vmlal_s16(sum_high, vget_high_s16(filter[k]),
+                               vget_high_s16(intermediate));
+        }
+        const int16x8_t sum =
+            vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+                         vrshrn_n_s32(sum_high, kRoundBitsVertical));
+        if (is_compound) {
+          vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+        } else {
+          vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+        }
+        dst_row += dest_stride;
+        sy4 += delta;
+      }
+      start_x += 8;
+    } while (start_x < block_start_x + block_width);
+    dst += 8 * dest_stride;
+    start_y += 8;
+  } while (start_y < block_start_y + block_height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->warp = Warp_NEON</*is_compound=*/false>;
+  dsp->warp_compound = Warp_NEON</*is_compound=*/true>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+LIBGAV1_ALWAYS_INLINE uint16x8x2_t LoadSrcRow(uint16_t const* ptr) {
+  uint16x8x2_t x;
+  // Clang/gcc uses ldp here.
+  x.val[0] = vld1q_u16(ptr);
+  x.val[1] = vld1q_u16(ptr + 8);
+  return x;
+}
+
+LIBGAV1_ALWAYS_INLINE void HorizontalFilter(
+    const int sx4, const int16_t alpha, const uint16x8x2_t src_row,
+    int16_t intermediate_result_row[8]) {
+  int sx = sx4 - MultiplyBy4(alpha);
+  int8x8_t filter8[8];
+  for (auto& f : filter8) {
+    const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+                       kWarpedPixelPrecisionShifts;
+    f = vld1_s8(kWarpedFilters8[offset]);
+    sx += alpha;
+  }
+
+  Transpose8x8(filter8);
+
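+  // Widen the 8-bit taps to 16 bits so they can be multiplied against the
+  // 10-bit samples with vmull_s16()/vmlal_s16() below.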
+  int16x8_t filter[8];
+  for (int i = 0; i < 8; ++i) {
+    filter[i] = vmovl_s8(filter8[i]);
+  }
+
+  int32x4x2_t sum;
+  int16x8_t src_row_window;
+  // k = 0.
+  src_row_window = vreinterpretq_s16_u16(src_row.val[0]);
+  sum.val[0] = vmull_s16(vget_low_s16(filter[0]), vget_low_s16(src_row_window));
+  sum.val[1] = VMullHighS16(filter[0], src_row_window);
+  // k = 1.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 1));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[1]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[1], src_row_window);
+  // k = 2.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 2));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[2]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[2], src_row_window);
+  // k = 3.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 3));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[3]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[3], src_row_window);
+  // k = 4.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 4));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[4]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[4], src_row_window);
+  // k = 5.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 5));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[5]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[5], src_row_window);
+  // k = 6.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 6));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[6]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[6], src_row_window);
+  // k = 7.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 7));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[7]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[7], src_row_window);
+  // End of unrolled k = 0..7 loop.
+
+  vst1_s16(intermediate_result_row,
+           vrshrn_n_s32(sum.val[0], kInterRoundBitsHorizontal));
+  vst1_s16(intermediate_result_row + 4,
+           vrshrn_n_s32(sum.val[1], kInterRoundBitsHorizontal));
+}
+
+template <bool is_compound>
+void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
+               const ptrdiff_t source_stride, const int source_width,
+               const int source_height,
+               const int* LIBGAV1_RESTRICT const warp_params,
+               const int subsampling_x, const int subsampling_y,
+               const int block_start_x, const int block_start_y,
+               const int block_width, const int block_height,
+               const int16_t alpha, const int16_t beta, const int16_t gamma,
+               const int16_t delta, void* LIBGAV1_RESTRICT dest,
+               const ptrdiff_t dest_stride) {
+  constexpr int kRoundBitsVertical =
+      is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+  union {
+    // Intermediate_result is the output of the horizontal filtering and
+    // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+    // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+    // type so that we can multiply it by kWarpedFilters (which has signed
+    // values) using vmlal_s16().
+    int16_t intermediate_result[15][8];  // 15 rows, 8 columns.
+    // In the simple special cases where the samples in each row are all the
+    // same, store one sample per row in a column vector.
+    int16_t intermediate_result_column[15];
+  };
+
+  const auto* const src = static_cast<const uint16_t*>(source);
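+  // source_stride is given in bytes; shift it to uint16_t units.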
+  const ptrdiff_t src_stride = source_stride >> 1;
+  using DestType =
+      typename std::conditional<is_compound, int16_t, uint16_t>::type;
+  auto* dst = static_cast<DestType*>(dest);
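+  // Compound buffers are indexed in int16_t elements while pixel buffers
+  // use byte strides, hence the conditional shift below.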
+  const ptrdiff_t dst_stride = is_compound ? dest_stride : dest_stride >> 1;
+  assert(block_width >= 8);
+  assert(block_height >= 8);
+
+  // The warp process is applied to each 8x8 block.
+  int start_y = block_start_y;
+  do {
+    int start_x = block_start_x;
+    do {
+      const int src_x = (start_x + 4) << subsampling_x;
+      const int src_y = (start_y + 4) << subsampling_y;
+      const WarpFilterParams filter_params = GetWarpFilterParams(
+          src_x, src_y, subsampling_x, subsampling_y, warp_params);
+      // A prediction block may fall outside the frame's boundaries. If a
+      // prediction block is calculated using only samples outside the frame's
+      // boundary, the filtering can be simplified. We can divide the plane
+      // into several regions and handle them differently.
+      //
+      //                |           |
+      //            1   |     3     |   1
+      //                |           |
+      //         -------+-----------+-------
+      //                |***********|
+      //            2   |*****4*****|   2
+      //                |***********|
+      //         -------+-----------+-------
+      //                |           |
+      //            1   |     3     |   1
+      //                |           |
+      //
+      // At the center, region 4 represents the frame and is the general case.
+      //
+      // In regions 1 and 2, the prediction block is outside the frame's
+      // boundary horizontally. Therefore the horizontal filtering can be
+      // simplified. Furthermore, in region 1 (at the four corners), the
+      // prediction is outside the frame's boundary both horizontally and
+      // vertically, so we get a constant prediction block.
+      //
+      // In region 3, the prediction block is outside the frame's boundary
+      // vertically. Unfortunately because we apply the horizontal filters
+      // first, by the time we apply the vertical filters, they no longer see
+      // simple inputs. So the only simplification is that all the rows are
+      // the same, but we still need to apply all the horizontal and vertical
+      // filters.
+
+      // Check for two simple special cases, where the horizontal filter can
+      // be significantly simplified.
+      //
+      // In general, for each row, the horizontal filter is calculated as
+      // follows:
+      //   for (int x = -4; x < 4; ++x) {
+      //     const int offset = ...;
+      //     int sum = first_pass_offset;
+      //     for (int k = 0; k < 8; ++k) {
+      //       const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+      //       sum += kWarpedFilters[offset][k] * src_row[column];
+      //     }
+      //     ...
+      //   }
+      // The column index before clipping, ix4 + x + k - 3, varies in the range
+      // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+      // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+      // border index (source_width - 1 or 0, respectively). Then for each x,
+      // the inner for loop of the horizontal filter is reduced to multiplying
+      // the border pixel by the sum of the filter coefficients.
+      if (filter_params.ix4 - 7 >= source_width - 1 ||
+          filter_params.ix4 + 7 <= 0) {
+        // Regions 1 and 2.
+        // Points to the left or right border of the first row of |src|.
+        const uint16_t* first_row_border =
+            (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
+        // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+        //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+        // In two special cases, iy4 + y is clipped to either 0 or
+        // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+        // bounded and we can avoid clipping iy4 + y by relying on a reference
+        // frame's boundary extension on the top and bottom.
+        if (filter_params.iy4 - 7 >= source_height - 1 ||
+            filter_params.iy4 + 7 <= 0) {
+          // Region 1.
+          // Every sample used to calculate the prediction block has the same
+          // value. So the whole prediction block has the same value.
+          const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+          const uint16_t row_border_pixel = first_row_border[row * src_stride];
+
+          DestType* dst_row = dst + start_x - block_start_x;
+          for (int y = 0; y < 8; ++y) {
+            if (is_compound) {
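+              // Besides the precision shift, 10bpp compound output is biased
+              // by kCompoundOffset so the stored values remain non-negative.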
+              const int16x8_t sum =
+                  vdupq_n_s16(row_border_pixel << (kInterRoundBitsVertical -
+                                                   kRoundBitsVertical));
+              vst1q_s16(reinterpret_cast<int16_t*>(dst_row),
+                        vaddq_s16(sum, vdupq_n_s16(kCompoundOffset)));
+            } else {
+              vst1q_u16(reinterpret_cast<uint16_t*>(dst_row),
+                        vdupq_n_u16(row_border_pixel));
+            }
+            dst_row += dst_stride;
+          }
+          // End of region 1. Continue the |start_x| do-while loop.
+          start_x += 8;
+          continue;
+        }
+
+        // Region 2.
+        // Horizontal filter.
+        // The input values in this region are generated by extending the border
+        // which makes them identical in the horizontal direction. This
+        // computation could be inlined in the vertical pass but most
+        // implementations will need a transpose of some sort.
+        // It is not necessary to use the offset values here because the
+        // horizontal pass is a simple shift and the vertical pass will always
+        // require using 32 bits.
+        for (int y = -7; y < 8; ++y) {
+          // We may over-read up to 13 pixels above the top source row, or up
+          // to 13 pixels below the bottom source row. This is proved in
+          // warp.cc.
+          const int row = filter_params.iy4 + y;
+          int sum = first_row_border[row * src_stride];
+          sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+          intermediate_result_column[y + 7] = sum;
+        }
+        // Vertical filter.
+        DestType* dst_row = dst + start_x - block_start_x;
+        int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  MultiplyBy4(delta);
+        for (int y = 0; y < 8; ++y) {
+          int sy = sy4 - MultiplyBy4(gamma);
+#if defined(__aarch64__)
+          const int16x8_t intermediate =
+              vld1q_s16(&intermediate_result_column[y]);
+          int16_t tmp[8];
+          for (int x = 0; x < 8; ++x) {
+            const int offset =
+                RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                kWarpedPixelPrecisionShifts;
+            const int16x8_t filter = vld1q_s16(kWarpedFilters[offset]);
+            const int32x4_t product_low =
+                vmull_s16(vget_low_s16(filter), vget_low_s16(intermediate));
+            const int32x4_t product_high =
+                vmull_s16(vget_high_s16(filter), vget_high_s16(intermediate));
+            // vaddvq_s32 is only available on __aarch64__.
+            const int32_t sum =
+                vaddvq_s32(product_low) + vaddvq_s32(product_high);
+            const int16_t sum_descale =
+                RightShiftWithRounding(sum, kRoundBitsVertical);
+            if (is_compound) {
+              dst_row[x] = sum_descale + kCompoundOffset;
+            } else {
+              tmp[x] = sum_descale;
+            }
+            sy += gamma;
+          }
+          if (!is_compound) {
+            const uint16x8_t v_max_bitdepth =
+                vdupq_n_u16((1 << kBitdepth10) - 1);
+            const int16x8_t sum = vld1q_s16(tmp);
+            const uint16x8_t d0 =
+                vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum, vdupq_n_s16(0))),
+                          v_max_bitdepth);
+            vst1q_u16(reinterpret_cast<uint16_t*>(dst_row), d0);
+          }
+#else   // !defined(__aarch64__)
+          int16x8_t filter[8];
+          for (int x = 0; x < 8; ++x) {
+            const int offset =
+                RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                kWarpedPixelPrecisionShifts;
+            filter[x] = vld1q_s16(kWarpedFilters[offset]);
+            sy += gamma;
+          }
+          Transpose8x8(filter);
+          int32x4_t sum_low = vdupq_n_s32(0);
+          int32x4_t sum_high = sum_low;
+          for (int k = 0; k < 8; ++k) {
+            const int16_t intermediate = intermediate_result_column[y + k];
+            sum_low =
+                vmlal_n_s16(sum_low, vget_low_s16(filter[k]), intermediate);
+            sum_high =
+                vmlal_n_s16(sum_high, vget_high_s16(filter[k]), intermediate);
+          }
+          if (is_compound) {
+            const int16x8_t sum =
+                vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+                             vrshrn_n_s32(sum_high, kRoundBitsVertical));
+            vst1q_s16(reinterpret_cast<int16_t*>(dst_row),
+                      vaddq_s16(sum, vdupq_n_s16(kCompoundOffset)));
+          } else {
+            const uint16x4_t v_max_bitdepth =
+                vdup_n_u16((1 << kBitdepth10) - 1);
+            const uint16x4_t d0 = vmin_u16(
+                vqrshrun_n_s32(sum_low, kRoundBitsVertical), v_max_bitdepth);
+            const uint16x4_t d1 = vmin_u16(
+                vqrshrun_n_s32(sum_high, kRoundBitsVertical), v_max_bitdepth);
+            vst1_u16(reinterpret_cast<uint16_t*>(dst_row), d0);
+            vst1_u16(reinterpret_cast<uint16_t*>(dst_row + 4), d1);
+          }
+#endif  // defined(__aarch64__)
+          dst_row += dst_stride;
+          sy4 += delta;
+        }
+        // End of region 2. Continue the |start_x| do-while loop.
+        start_x += 8;
+        continue;
+      }
+
+      // Regions 3 and 4.
+      // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+      // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+      //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+      // In two special cases, iy4 + y is clipped to either 0 or
+      // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+      // bounded and we can avoid clipping iy4 + y by relying on a reference
+      // frame's boundary extension on the top and bottom.
+      if (filter_params.iy4 - 7 >= source_height - 1 ||
+          filter_params.iy4 + 7 <= 0) {
+        // Region 3.
+        // Horizontal filter.
+        const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+        const uint16_t* const src_row = src + row * src_stride;
+        // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+        // read but is ignored.
+        //
+        // NOTE: This may read up to 13 pixels before src_row[0] or up to 14
+        // pixels after src_row[source_width - 1]. We assume the source frame
+        // has left and right borders of at least 13 pixels that extend the
+        // frame boundary pixels. We also assume there is at least one extra
+        // padding pixel after the right border of the last source row.
+        const uint16x8x2_t src_row_v =
+            LoadSrcRow(&src_row[filter_params.ix4 - 7]);
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
+        for (int y = -7; y < 8; ++y) {
+          HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+          sx4 += beta;
+        }
+      } else {
+        // Region 4.
+        // Horizontal filter.
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
+        for (int y = -7; y < 8; ++y) {
+          // We may over-read up to 13 pixels above the top source row, or up
+          // to 13 pixels below the bottom source row. This is proved in
+          // warp.cc.
+          const int row = filter_params.iy4 + y;
+          const uint16_t* const src_row = src + row * src_stride;
+          // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+          // read but is ignored.
+          //
+          // NOTE: This may read up to 13 pixels before src_row[0] or up to
+          // 14 pixels after src_row[source_width - 1]. We assume the source
+          // frame has left and right borders of at least 13 pixels that extend
+          // the frame boundary pixels. We also assume there is at least one
+          // extra padding pixel after the right border of the last source row.
+          const uint16x8x2_t src_row_v =
+              LoadSrcRow(&src_row[filter_params.ix4 - 7]);
+          HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+          sx4 += beta;
+        }
+      }
+
+      // Regions 3 and 4.
+      // Vertical filter.
+      DestType* dst_row = dst + start_x - block_start_x;
+      int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                MultiplyBy4(delta);
+      for (int y = 0; y < 8; ++y) {
+        int sy = sy4 - MultiplyBy4(gamma);
+        int16x8_t filter[8];
+        for (auto& f : filter) {
+          const int offset =
+              RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+              kWarpedPixelPrecisionShifts;
+          f = vld1q_s16(kWarpedFilters[offset]);
+          sy += gamma;
+        }
+        Transpose8x8(filter);
+        int32x4_t sum_low = vdupq_n_s32(0);
+        int32x4_t sum_high = sum_low;
+        for (int k = 0; k < 8; ++k) {
+          const int16x8_t intermediate = vld1q_s16(intermediate_result[y + k]);
+          sum_low = vmlal_s16(sum_low, vget_low_s16(filter[k]),
+                              vget_low_s16(intermediate));
+          sum_high = vmlal_s16(sum_high, vget_high_s16(filter[k]),
+                               vget_high_s16(intermediate));
+        }
+        if (is_compound) {
+          const int16x8_t sum =
+              vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+                           vrshrn_n_s32(sum_high, kRoundBitsVertical));
+          vst1q_s16(reinterpret_cast<int16_t*>(dst_row),
+                    vaddq_s16(sum, vdupq_n_s16(kCompoundOffset)));
+        } else {
+          const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+          const uint16x4_t d0 = vmin_u16(
+              vqrshrun_n_s32(sum_low, kRoundBitsVertical), v_max_bitdepth);
+          const uint16x4_t d1 = vmin_u16(
+              vqrshrun_n_s32(sum_high, kRoundBitsVertical), v_max_bitdepth);
+          vst1_u16(reinterpret_cast<uint16_t*>(dst_row), d0);
+          vst1_u16(reinterpret_cast<uint16_t*>(dst_row + 4), d1);
+        }
+        dst_row += dst_stride;
+        sy4 += delta;
+      }
+      start_x += 8;
+    } while (start_x < block_start_x + block_width);
+    dst += 8 * dst_stride;
+    start_y += 8;
+  } while (start_y < block_start_y + block_height);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->warp = Warp_NEON</*is_compound=*/false>;
+  dsp->warp_compound = Warp_NEON</*is_compound=*/true>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void WarpInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void WarpInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/warp_neon.h b/src/dsp/arm/warp_neon.h
new file mode 100644 (file)
index 0000000..cd60602
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp. This function is not thread-safe.
+void WarpInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_Warp LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WarpCompound LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc
new file mode 100644 (file)
index 0000000..5ad6b97
--- /dev/null
@@ -0,0 +1,588 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/arm/weight_mask_neon.h"
+
+#include "src/dsp/weight_mask.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x8x2_t LoadPred(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                            const int16_t* LIBGAV1_RESTRICT prediction_1) {
+  const int16x8x2_t pred = {vld1q_s16(prediction_0), vld1q_s16(prediction_1)};
+  return pred;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline uint16x8x2_t LoadPred(const uint16_t* LIBGAV1_RESTRICT prediction_0,
+                             const uint16_t* LIBGAV1_RESTRICT prediction_1) {
+  const uint16x8x2_t pred = {vld1q_u16(prediction_0), vld1q_u16(prediction_1)};
+  return pred;
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth>
+inline uint16x8_t AbsolutePredDifference(const int16x8x2_t pred) {
+  static_assert(bitdepth == 8, "");
+  constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 2 : 4);
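+  // Evaluates to 4 for 8bpp; the uint16_t overload below yields 6 for both
+  // 10bpp and 12bpp.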
+  return vrshrq_n_u16(
+      vreinterpretq_u16_s16(vabdq_s16(pred.val[0], pred.val[1])),
+      rounding_bits);
+}
+
+template <int bitdepth>
+inline uint16x8_t AbsolutePredDifference(const uint16x8x2_t pred) {
+  constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 2 : 4);
+  return vrshrq_n_u16(vabdq_u16(pred.val[0], pred.val[1]), rounding_bits);
+}
+
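+// Computes one 8-wide span of the weight mask, roughly
+//   mask[x] = min(38 + (|pred_0[x] - pred_1[x]| >> shift), 64)
+// with the shift split between AbsolutePredDifference() and the
+// vqshrn_n_u16(..., 4) below.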
+template <bool mask_is_inverse, int bitdepth>
+inline void WeightMask8_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT mask) {
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  using PredTypeVecx2 =
+      typename std::conditional<bitdepth == 8, int16x8x2_t, uint16x8x2_t>::type;
+  const PredTypeVecx2 pred =
+      LoadPred(static_cast<const PredType*>(prediction_0),
+               static_cast<const PredType*>(prediction_1));
+  const uint16x8_t difference = AbsolutePredDifference<bitdepth>(pred);
+  const uint8x8_t difference_offset = vdup_n_u8(38);
+  const uint8x8_t mask_ceiling = vdup_n_u8(64);
+  const uint8x8_t adjusted_difference =
+      vqadd_u8(vqshrn_n_u16(difference, 4), difference_offset);
+  const uint8x8_t mask_value = vmin_u8(adjusted_difference, mask_ceiling);
+  if (mask_is_inverse) {
+    const uint8x8_t inverted_mask_value = vsub_u8(mask_ceiling, mask_value);
+    vst1_u8(mask, inverted_mask_value);
+  } else {
+    vst1_u8(mask, mask_value);
+  }
+}
+
+#define WEIGHT8_WITHOUT_STRIDE \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0, pred_1, mask)
+
+#define WEIGHT8_AND_STRIDE \
+  WEIGHT8_WITHOUT_STRIDE;  \
+  pred_0 += 8;             \
+  pred_1 += 8;             \
+  mask += mask_stride
+
+// |pred_0| and |pred_1| are cast as int16_t* for the sake of pointer math. They
+// are uint16_t* for 10bpp and 12bpp, and this is handled in WeightMask8_NEON.
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask8x8_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                        const void* LIBGAV1_RESTRICT prediction_1,
+                        uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y = 0;
+  do {
+    WEIGHT8_AND_STRIDE;
+  } while (++y < 7);
+  WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask8x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                         const void* LIBGAV1_RESTRICT prediction_1,
+                         uint8_t* LIBGAV1_RESTRICT mask,
+                         ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
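+  // Five iterations of three strided rows plus the tail row below cover all
+  // 16 rows.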
+  do {
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask8x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                         const void* LIBGAV1_RESTRICT prediction_1,
+                         uint8_t* LIBGAV1_RESTRICT mask,
+                         ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
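+  // Six iterations of five strided rows, one more strided row, and the tail
+  // row below cover all 32 rows.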
+  do {
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT8_AND_STRIDE;
+  WEIGHT8_WITHOUT_STRIDE;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE                                      \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0, pred_1, mask); \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 8, pred_1 + 8, mask + 8)
+
+#define WEIGHT16_AND_STRIDE \
+  WEIGHT16_WITHOUT_STRIDE;  \
+  pred_0 += 16;             \
+  pred_1 += 16;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask16x8_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                         const void* LIBGAV1_RESTRICT prediction_1,
+                         uint8_t* LIBGAV1_RESTRICT mask,
+                         ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+  } while (++y < 7);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask16x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask16x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT16_AND_STRIDE;
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask16x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (++y3 < 21);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE                                         \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0, pred_1, mask);    \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 8, pred_1 + 8,   \
+                                              mask + 8);                \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 16, pred_1 + 16, \
+                                              mask + 16);               \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 24, pred_1 + 24, \
+                                              mask + 24)
+
+#define WEIGHT32_AND_STRIDE \
+  WEIGHT32_WITHOUT_STRIDE;  \
+  pred_0 += 32;             \
+  pred_1 += 32;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask32x8_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                         const void* LIBGAV1_RESTRICT prediction_1,
+                         uint8_t* LIBGAV1_RESTRICT mask,
+                         ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask32x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask32x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask32x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (++y3 < 21);
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE                                         \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0, pred_1, mask);    \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 8, pred_1 + 8,   \
+                                              mask + 8);                \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 16, pred_1 + 16, \
+                                              mask + 16);               \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 24, pred_1 + 24, \
+                                              mask + 24);               \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 32, pred_1 + 32, \
+                                              mask + 32);               \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 40, pred_1 + 40, \
+                                              mask + 40);               \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 48, pred_1 + 48, \
+                                              mask + 48);               \
+  WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 56, pred_1 + 56, \
+                                              mask + 56)
+
+#define WEIGHT64_AND_STRIDE \
+  WEIGHT64_WITHOUT_STRIDE;  \
+  pred_0 += 64;             \
+  pred_1 += 64;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask64x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask64x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT64_AND_STRIDE;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask64x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 21);
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask64x128_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 42);
+  WEIGHT64_AND_STRIDE;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask128x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
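+  // Each 128-wide row is processed as two 64-wide halves; after the second
+  // half, stepping by mask_stride - 64 returns |mask| to the start of the
+  // next row.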
+  do {
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (++y3 < 21);
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask128x128_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+  do {
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (++y3 < 42);
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += adjusted_mask_stride;
+
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+#undef WEIGHT8_WITHOUT_STRIDE
+#undef WEIGHT8_AND_STRIDE
+#undef WEIGHT16_WITHOUT_STRIDE
+#undef WEIGHT16_AND_STRIDE
+#undef WEIGHT32_WITHOUT_STRIDE
+#undef WEIGHT32_AND_STRIDE
+#undef WEIGHT64_WITHOUT_STRIDE
+#undef WEIGHT64_AND_STRIDE
+
+#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
+  dsp->weight_mask[w_index][h_index][0] =                      \
+      WeightMask##width##x##height##_NEON<0, 8>;               \
+  dsp->weight_mask[w_index][h_index][1] =                      \
+      WeightMask##width##x##height##_NEON<1, 8>
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0);
+  INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1);
+  INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2);
+  INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0);
+  INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1);
+  INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2);
+  INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3);
+  INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0);
+  INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1);
+  INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2);
+  INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3);
+  INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1);
+  INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2);
+  INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3);
+  INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4);
+  INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3);
+  INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4);
+}
+#undef INIT_WEIGHT_MASK_8BPP
+
+}  // namespace
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
+  dsp->weight_mask[w_index][h_index][0] =                       \
+      WeightMask##width##x##height##_NEON<0, 10>;               \
+  dsp->weight_mask[w_index][h_index][1] =                       \
+      WeightMask##width##x##height##_NEON<1, 10>
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0);
+  INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1);
+  INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2);
+  INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0);
+  INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1);
+  INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2);
+  INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3);
+  INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0);
+  INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1);
+  INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2);
+  INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3);
+  INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1);
+  INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2);
+  INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3);
+  INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4);
+  INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3);
+  INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4);
+}
+#undef INIT_WEIGHT_MASK_10BPP
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+void WeightMaskInit_NEON() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void WeightMaskInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/weight_mask_neon.h b/src/dsp/arm/weight_mask_neon.h
new file mode 100644 (file)
index 0000000..573f7de
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_WeightMask_8x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_8x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_8x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_16x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_16x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_16x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_16x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_32x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_32x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_32x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_32x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_64x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_64x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_64x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_64x128 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_128x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_128x128 LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
diff --git a/src/dsp/average_blend.cc b/src/dsp/average_blend.cc
new file mode 100644 (file)
index 0000000..1a37aa1
--- /dev/null
@@ -0,0 +1,118 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void AverageBlend_C(const void* LIBGAV1_RESTRICT prediction_0,
+                    const void* LIBGAV1_RESTRICT prediction_1, const int width,
+                    const int height, void* const dest,
+                    const ptrdiff_t dest_stride) {
+  // 7.11.3.2 Rounding variables derivation process
+  //   2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
+  constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
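+  // 8/10bpp: 2 * 7 - (3 + 7) = 4; 12bpp uses an InterRound0 of 5, giving
+  // 2 * 7 - (5 + 7) = 2.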
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+  const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+  auto* dst = static_cast<Pixel*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      // See warp.cc and convolve.cc for detailed prediction ranges.
+      int res = pred_0[x] + pred_1[x];
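+      // For 10/12bpp each prediction carries a kCompoundOffset bias; both
+      // biases are removed before the rounding shift.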
+      res -= (bitdepth == 8) ? 0 : kCompoundOffset + kCompoundOffset;
+      dst[x] = static_cast<Pixel>(
+          Clip3(RightShiftWithRounding(res, inter_post_round_bits + 1), 0,
+                (1 << bitdepth) - 1));
+    } while (++x < width);
+
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+  } while (++y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->average_blend = AverageBlend_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
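+  // LIBGAV1_Dsp8bpp_AverageBlend is defined by the architecture-specific
+  // headers (pulled in via average_blend.h) when a SIMD implementation is
+  // available, so the C version is installed only as a fallback.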
+#ifndef LIBGAV1_Dsp8bpp_AverageBlend
+  dsp->average_blend = AverageBlend_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->average_blend = AverageBlend_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+  dsp->average_blend = AverageBlend_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->average_blend = AverageBlend_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_AverageBlend
+  dsp->average_blend = AverageBlend_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void AverageBlendInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/average_blend.h b/src/dsp/average_blend.h
new file mode 100644 (file)
index 0000000..02ecd09
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
+#define LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/average_blend_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes matters because each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/average_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
diff --git a/src/dsp/average_blend_test.cc b/src/dsp/average_blend_test.cc
new file mode 100644 (file)
index 0000000..67d592f
--- /dev/null
@@ -0,0 +1,346 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+
+#include <cassert>
+#include <cstdint>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 5e8;
+constexpr char kAverageBlend[] = "AverageBlend";
+// average_blend is applied to compound prediction values. This implies a range
+// far exceeding that of pixel values.
+// The ranges include kCompoundOffset in 10bpp and 12bpp.
+// See src/dsp/convolve.cc and src/dsp/warp.cc.
+constexpr int kCompoundPredictionRange[3][2] = {
+    // 8bpp
+    {-5132, 9212},
+    // 10bpp
+    {3988, 61532},
+    // 12bpp
+    {3974, 61559},
+};
+
+template <int bitdepth, typename Pixel>
+class AverageBlendTest : public testing::TestWithParam<BlockSize>,
+                         public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  AverageBlendTest() = default;
+  ~AverageBlendTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    AverageBlendInit_C();
+    DistanceWeightedBlendInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_func_ = dsp->average_blend;
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      AverageBlendInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      AverageBlendInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    func_ = dsp->average_blend;
+    dist_blend_func_ = dsp->distance_weighted_blend;
+  }
+
+ protected:
+  void Test(const char* digest, int num_tests, bool debug);
+
+ private:
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  static constexpr int kDestStride = kMaxSuperBlockSizeInPixels;
+  const int width_ = kBlockWidthPixels[GetParam()];
+  const int height_ = kBlockHeightPixels[GetParam()];
+  alignas(kMaxAlignment) PredType
+      source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+  alignas(kMaxAlignment) PredType
+      source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+  Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+  Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] =
+      {};
+  dsp::AverageBlendFunc base_func_;
+  dsp::AverageBlendFunc func_;
+  dsp::DistanceWeightedBlendFunc dist_blend_func_;
+};
+
+template <int bitdepth, typename Pixel>
+void AverageBlendTest<bitdepth, Pixel>::Test(const char* digest, int num_tests,
+                                             bool debug) {
+  if (func_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  PredType* src_1 = source1_;
+  PredType* src_2 = source2_;
+  for (int y = 0; y < height_; ++y) {
+    for (int x = 0; x < width_; ++x) {
+      constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+      const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+      const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+      src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+      src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+    }
+    src_1 += width_;
+    src_2 += width_;
+  }
+  absl::Duration elapsed_time;
+  for (int i = 0; i < num_tests; ++i) {
+    const absl::Time start = absl::Now();
+    func_(source1_, source2_, width_, height_, dest_,
+          sizeof(dest_[0]) * kDestStride);
+    elapsed_time += absl::Now() - start;
+  }
+  if (debug) {
+    if (base_func_ != nullptr) {
+      base_func_(source1_, source2_, width_, height_, reference_,
+                 sizeof(reference_[0]) * kDestStride);
+    } else {
+      // Use dist_blend_func_ as the base for C tests.
+      const int8_t weight = 8;
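+      // With equal weights the distance-weighted blend is mathematically a
+      // plain average, so it provides an independent reference.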
+      dist_blend_func_(source1_, source2_, weight, weight, width_, height_,
+                       reference_, sizeof(reference_[0]) * kDestStride);
+    }
+    EXPECT_TRUE(test_utils::CompareBlocks(dest_, reference_, width_, height_,
+                                          kDestStride, kDestStride, false));
+  }
+
+  test_utils::CheckMd5Digest(kAverageBlend, ToString(GetParam()), digest, dest_,
+                             sizeof(dest_[0]) * kDestStride * height_,
+                             elapsed_time);
+}
+
+const BlockSize kTestParam[] = {
+    kBlock4x4,    kBlock4x8,     kBlock4x16,  kBlock8x4,   kBlock8x8,
+    kBlock8x16,   kBlock8x32,    kBlock16x4,  kBlock16x8,  kBlock16x16,
+    kBlock16x32,  kBlock16x64,   kBlock32x8,  kBlock32x16, kBlock32x32,
+    kBlock32x64,  kBlock64x16,   kBlock64x32, kBlock64x64, kBlock64x128,
+    kBlock128x64, kBlock128x128,
+};
+
+using AverageBlendTest8bpp = AverageBlendTest<8, uint8_t>;
+
+const char* GetAverageBlendDigest8bpp(const BlockSize block_size) {
+  static const char* const kDigests[kMaxBlockSizes] = {
+      // 4xN
+      "152bcc35946900b1ed16369b3e7a81b7",
+      "c23e9b5698f7384eaae30a3908118b77",
+      "f2da31d940f62490c368c03d32d3ede8",
+      // 8xN
+      "73c95485ef956e1d9ab914e88e6a202b",
+      "d90d3abd368e58c513070a88b34649ba",
+      "77f7d53d0edeffb3537afffd9ff33a4a",
+      "460b9b1e6b83f65f013cfcaf67ec0122",
+      // 16xN
+      "96454a56de940174ff92e9bb686d6d38",
+      "a50e268e93b48ae39cc2a47d377410e2",
+      "65c8502ff6d78065d466f9911ed6bb3e",
+      "bc2c873b9f5d74b396e1df705e87f699",
+      "b4dae656484b2d255d1e09b7f34e12c1",
+      // 32xN
+      "7e1e5db92b22a96e5226a23de883d766",
+      "ca40d46d89773e7f858b15fcecd43cc0",
+      "bfdc894707323f4dc43d1326309f8368",
+      "f4733417621719b7feba3166ec0da5b9",
+      // 64xN
+      "378fa0594d22f01c8e8931c2a908d7c4",
+      "db38fe2e082bd4a09acb3bb1d52ee11e",
+      "3ad44401cc731215c46c9b7d96f7e4ae",
+      "6c43267be5ed03d204a05fe36090f870",
+      // 128xN
+      "c8cfe46ebf166c1cbf08e8804206aadb",
+      "b0557b5156d2334c8ce4a7ee12f9d6b4",
+  };
+  assert(block_size < kMaxBlockSizes);
+  return kDigests[block_size];
+}
+
+TEST_P(AverageBlendTest8bpp, Blending) {
+  Test(GetAverageBlendDigest8bpp(GetParam()), 1, false);
+}
+
+TEST_P(AverageBlendTest8bpp, DISABLED_Speed) {
+  Test(GetAverageBlendDigest8bpp(GetParam()),
+       kNumSpeedTests /
+           (kBlockHeightPixels[GetParam()] * kBlockWidthPixels[GetParam()]),
+       false);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest8bpp,
+                         testing::ValuesIn(kTestParam));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest8bpp,
+                         testing::ValuesIn(kTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest8bpp,
+                         testing::ValuesIn(kTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using AverageBlendTest10bpp = AverageBlendTest<10, uint16_t>;
+
+const char* GetAverageBlendDigest10bpp(const BlockSize block_size) {
+  static const char* const kDigests[kMaxBlockSizes] = {
+      // 4xN
+      "98c0671c092b4288adcaaa17362cc4a3",
+      "7083f3def8bfb63ab3a985ef5616a923",
+      "a7211ee2eaa6f88e08875b377d17b0f1",
+      // 8xN
+      "11f9ab881700f2ef0f82d8d4662868c6",
+      "3bee144b9ea6f4288b860c24f88a22f3",
+      "27113bd17bf95034f100e9046c7b59d2",
+      "c42886a5e16e23a81e43833d34467558",
+      // 16xN
+      "b0ac2eb0a7a6596d6d1339074c7f8771",
+      "24c9e079b9a8647a6ee03f5441f2cdd9",
+      "dd05777751ccdb4356856c90e1176e53",
+      "27b1d69d035b1525c013b7373cfe3875",
+      "08c46403afe19e6b008ccc8f56633da9",
+      // 32xN
+      "36d434db11298aba76166df06e9b8125",
+      "efd24dd7b555786bff1a482e51170ea3",
+      "3b37ddac87de443cd18784f02c2d1dd5",
+      "80d8070939a743a20689a65bf5dc0a68",
+      // 64xN
+      "88e747246237c6408d0bd4cc3ecc8396",
+      "af1fe8c52487c9f2951c3ea516828abb",
+      "ea6f18ff56b053748c18032b7e048e83",
+      "af0cb87fe27d24c2e0afd2c90a8533a6",
+      // 128xN
+      "16a83b19911d6dc7278a694b8baa9901",
+      "bd22e77ce6fa727267ff63eeb4dcb19c",
+  };
+  assert(block_size < kMaxBlockSizes);
+  return kDigests[block_size];
+}
+
+TEST_P(AverageBlendTest10bpp, Blending) {
+  Test(GetAverageBlendDigest10bpp(GetParam()), 1, false);
+}
+
+TEST_P(AverageBlendTest10bpp, DISABLED_Speed) {
+  Test(GetAverageBlendDigest10bpp(GetParam()),
+       kNumSpeedTests /
+           (kBlockHeightPixels[GetParam()] * kBlockWidthPixels[GetParam()]) /
+           2,
+       false);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest10bpp,
+                         testing::ValuesIn(kTestParam));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest10bpp,
+                         testing::ValuesIn(kTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest10bpp,
+                         testing::ValuesIn(kTestParam));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using AverageBlendTest12bpp = AverageBlendTest<12, uint16_t>;
+
+const char* GetAverageBlendDigest12bpp(const BlockSize block_size) {
+  static const char* const kDigests[kMaxBlockSizes] = {
+      // 4xN
+      "8f5ad8fba61a0f1cb6b77f5460c241be",
+      "3a9d017848fdb4162315c689b4449ac6",
+      "bb97029fff021b168b98b209dcee5123",
+      // 8xN
+      "a7ff1b199965b8856499ae3f1b2c48eb",
+      "05220c72835fc4662d261183df0a57cf",
+      "97de8c325f1475c44e1afc44183e55ad",
+      "60d820c46cad14d9d934da238bb79707",
+      // 16xN
+      "f3e4863121819bc28f7c1f453898650c",
+      "5f5f68d21269d7df546c848921e8f2cd",
+      "17efe0b0fce1f8d4c7bc6eacf769063e",
+      "3da591e201f44511cdd6c465692ace1e",
+      "5a0ca6c88664d2e918a032b5fcf66070",
+      // 32xN
+      "efe236bee8a9fef90b99d8012006f985",
+      "d6ff3aacbbbadff6d0ccb0873fb9fa2a",
+      "38801f7361052873423d57b574aabddc",
+      "55c76772ecdc1721e92ca04d2fc7c089",
+      // 64xN
+      "4261ecdde34eedc4e5066a93e0f64881",
+      "fe82e012efab872672193316d670fd82",
+      "6c698bc2d4acf4444a64ac55ae9641de",
+      "98626e25101cff69019d1b7e6e439404",
+      // 128xN
+      "fe0f3c89dd39786df1c952a2470d680d",
+      "af7e166fc3d8c9ce85789acf3467ed9d",
+  };
+  assert(block_size < kMaxBlockSizes);
+  return kDigests[block_size];
+}
+
+TEST_P(AverageBlendTest12bpp, Blending) {
+  Test(GetAverageBlendDigest12bpp(GetParam()), 1, false);
+}
+
+TEST_P(AverageBlendTest12bpp, DISABLED_Speed) {
+  Test(GetAverageBlendDigest12bpp(GetParam()),
+       kNumSpeedTests /
+           (kBlockHeightPixels[GetParam()] * kBlockWidthPixels[GetParam()]) /
+           2,
+       false);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest12bpp,
+                         testing::ValuesIn(kTestParam));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const BlockSize param) {
+  return os << ToString(param);
+}
+
+}  // namespace libgav1
diff --git a/src/dsp/cdef.cc b/src/dsp/cdef.cc
new file mode 100644 (file)
index 0000000..9dd9287
--- /dev/null
@@ -0,0 +1,360 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Silence unused function warnings when CdefDirection_C is obviated.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||           \
+    !defined(LIBGAV1_Dsp8bpp_CdefDirection) ||    \
+    (LIBGAV1_MAX_BITDEPTH >= 10 &&                \
+     !defined(LIBGAV1_Dsp10bpp_CdefDirection)) || \
+    (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefDirection))
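+// kDivisionTable[i] == 840 / (i + 1); 840 == lcm(1, ..., 8), so the squared
+// sum of a line holding i + 1 pixels is normalized by its length.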
+constexpr int16_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, 120, 105};
+
+int32_t Square(int32_t x) { return x * x; }
+
+template <int bitdepth, typename Pixel>
+void CdefDirection_C(const void* LIBGAV1_RESTRICT const source,
+                     ptrdiff_t stride,
+                     uint8_t* LIBGAV1_RESTRICT const direction,
+                     int* LIBGAV1_RESTRICT const variance) {
+  assert(direction != nullptr);
+  assert(variance != nullptr);
+  const auto* src = static_cast<const Pixel*>(source);
+  stride /= sizeof(Pixel);
+  int32_t cost[8] = {};
+  // |partial| does not have to be int32_t for 8bpp. int16_t will suffice. We
+  // use int32_t to keep it simple since |cost| will have to be int32_t.
+  int32_t partial[8][15] = {};
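+  // Each row of |partial| accumulates line sums of the (bias-removed) 8x8
+  // block projected along one of the eight candidate directions; the
+  // direction with the largest cost wins below.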
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      const int x = (src[j] >> (bitdepth - 8)) - 128;
+      partial[0][i + j] += x;
+      partial[1][i + j / 2] += x;
+      partial[2][i] += x;
+      partial[3][3 + i - j / 2] += x;
+      partial[4][7 + i - j] += x;
+      partial[5][3 - i / 2 + j] += x;
+      partial[6][j] += x;
+      partial[7][i / 2 + j] += x;
+    }
+    src += stride;
+  }
+  for (int i = 0; i < 8; ++i) {
+    cost[2] += Square(partial[2][i]);
+    cost[6] += Square(partial[6][i]);
+  }
+  cost[2] *= kDivisionTable[7];
+  cost[6] *= kDivisionTable[7];
+  for (int i = 0; i < 7; ++i) {
+    cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+               kDivisionTable[i];
+    cost[4] += (Square(partial[4][i]) + Square(partial[4][14 - i])) *
+               kDivisionTable[i];
+  }
+  cost[0] += Square(partial[0][7]) * kDivisionTable[7];
+  cost[4] += Square(partial[4][7]) * kDivisionTable[7];
+  for (int i = 1; i < 8; i += 2) {
+    for (int j = 0; j < 5; ++j) {
+      cost[i] += Square(partial[i][3 + j]);
+    }
+    cost[i] *= kDivisionTable[7];
+    for (int j = 0; j < 3; ++j) {
+      cost[i] += (Square(partial[i][j]) + Square(partial[i][10 - j])) *
+                 kDivisionTable[2 * j + 1];
+    }
+  }
+  int32_t best_cost = 0;
+  *direction = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      *direction = i;
+    }
+  }
+  *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+        // !defined(LIBGAV1_Dsp8bpp_CdefDirection) ||
+        // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+        //  !defined(LIBGAV1_Dsp10bpp_CdefDirection)) ||
+        // (LIBGAV1_MAX_BITDEPTH == 12 &&
+        //  !defined(LIBGAV1_Dsp12bpp_CdefDirection))
+
+// Silence unused function warnings when CdefFilter_C is obviated.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||                                       \
+    !defined(LIBGAV1_Dsp8bpp_CdefFilters) ||                                  \
+    (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters)) || \
+    (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefFilters))
+
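+// Constrains |diff| toward zero: differences beyond the (damped) threshold
+// are treated as outliers. E.g. with threshold=4 and damping=3, diff=3 maps
+// to 3 (kept) while diff=10 maps to 0 (rejected).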
+int Constrain(int diff, int threshold, int damping) {
+  assert(threshold != 0);
+  damping = std::max(0, damping - FloorLog2(threshold));
+  const int sign = (diff < 0) ? -1 : 1;
+  return sign *
+         Clip3(threshold - (std::abs(diff) >> damping), 0, std::abs(diff));
+}
+
+// Filters the source block. It does not check whether a candidate pixel is
+// inside the frame; instead it requires the source input to be padded with a
+// constant large value (kCdefLargeValue) at the boundary.
+template <int block_width, int bitdepth, typename Pixel,
+          bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_C(const uint16_t* LIBGAV1_RESTRICT src,
+                  const ptrdiff_t src_stride, const int block_height,
+                  const int primary_strength, const int secondary_strength,
+                  const int damping, const int direction,
+                  void* LIBGAV1_RESTRICT const dest,
+                  const ptrdiff_t dest_stride) {
+  static_assert(block_width == 4 || block_width == 8, "Invalid CDEF width.");
+  static_assert(enable_primary || enable_secondary, "");
+  assert(block_height == 4 || block_height == 8);
+  assert(direction >= 0 && direction <= 7);
+  constexpr int coeff_shift = bitdepth - 8;
+  // Section 5.9.19. CDEF params syntax.
+  assert(primary_strength >= 0 && primary_strength <= 15 << coeff_shift);
+  assert(secondary_strength >= 0 && secondary_strength <= 4 << coeff_shift &&
+         secondary_strength != 3 << coeff_shift);
+  assert(primary_strength != 0 || secondary_strength != 0);
+  // |damping| is decreased by 1 for chroma.
+  assert((damping >= 3 && damping <= 6 + coeff_shift) ||
+         (damping >= 2 && damping <= 5 + coeff_shift));
+  // When only one of primary_strength and secondary_strength is non-zero, the
+  // sum of the taps applied to the pixels inspected (4 for primary_strength,
+  // 8 for secondary_strength) does not exceed the amount the sum is descaled
+  // by (16), so tracking and clipping to the minimum and maximum values
+  // observed can be skipped.
+  constexpr bool clipping_required = enable_primary && enable_secondary;
+  static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0,
+                                                kCdefSecondaryTap1};
+  auto* dst = static_cast<Pixel*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+  int y = block_height;
+  do {
+    int x = 0;
+    do {
+      int16_t sum = 0;
+      const uint16_t pixel_value = src[x];
+      uint16_t max_value = pixel_value;
+      uint16_t min_value = pixel_value;
+      for (int k = 0; k < 2; ++k) {
+        static constexpr int signs[] = {-1, 1};
+        for (const int& sign : signs) {
+          if (enable_primary) {
+            const int dy = sign * kCdefDirections[direction][k][0];
+            const int dx = sign * kCdefDirections[direction][k][1];
+            const uint16_t value = src[dy * src_stride + dx + x];
+            // Note: the summation can ignore the condition check in SIMD
+            // implementation, because Constrain() will return 0 when
+            // value == kCdefLargeValue.
+            if (value != kCdefLargeValue) {
+              sum += Constrain(value - pixel_value, primary_strength, damping) *
+                     kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][k];
+              if (clipping_required) {
+                max_value = std::max(value, max_value);
+                min_value = std::min(value, min_value);
+              }
+            }
+          }
+
+          if (enable_secondary) {
+            static constexpr int offsets[] = {-2, 2};
+            for (const int& offset : offsets) {
+              const int dy = sign * kCdefDirections[direction + offset][k][0];
+              const int dx = sign * kCdefDirections[direction + offset][k][1];
+              const uint16_t value = src[dy * src_stride + dx + x];
+              // Note: the summation can ignore the condition check in SIMD
+              // implementation.
+              if (value != kCdefLargeValue) {
+                sum += Constrain(value - pixel_value, secondary_strength,
+                                 damping) *
+                       kCdefSecondaryTaps[k];
+                if (clipping_required) {
+                  max_value = std::max(value, max_value);
+                  min_value = std::min(value, min_value);
+                }
+              }
+            }
+          }
+        }
+      }
+
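+      // Descale the tap-weighted sum by 16 with rounding; subtracting
+      // (sum < 0) makes the rounding symmetric about zero, e.g. sum=24 -> 2,
+      // sum=-24 -> -2, sum=+/-7 -> 0.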
+      const int offset = (8 + sum - (sum < 0)) >> 4;
+      if (clipping_required) {
+        dst[x] = static_cast<Pixel>(
+            Clip3(pixel_value + offset, min_value, max_value));
+      } else {
+        dst[x] = static_cast<Pixel>(pixel_value + offset);
+      }
+    } while (++x < block_width);
+
+    src += src_stride;
+    dst += dst_stride;
+  } while (--y != 0);
+}
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+        // !defined(LIBGAV1_Dsp8bpp_CdefFilters) ||
+        // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+        //  !defined(LIBGAV1_Dsp10bpp_CdefFilters)) ||
+        // (LIBGAV1_MAX_BITDEPTH == 12 &&
+        //  !defined(LIBGAV1_Dsp12bpp_CdefFilters))
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->cdef_direction = CdefDirection_C<8, uint8_t>;
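+  // cdef_filters[w][f]: |w| selects the 4- or 8-pixel-wide variant and |f|
+  // selects primary+secondary, primary-only, or secondary-only filtering.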
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>;
+  dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true,
+                                         /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>;
+  dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true,
+                                         /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+  dsp->cdef_direction = CdefDirection_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>;
+  dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true,
+                                         /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>;
+  dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true,
+                                         /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->cdef_direction = CdefDirection_C<10, uint16_t>;
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_CdefDirection
+  dsp->cdef_direction = CdefDirection_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_CdefFilters
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->cdef_direction = CdefDirection_C<12, uint16_t>;
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_CdefDirection
+  dsp->cdef_direction = CdefDirection_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_CdefFilters
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void CdefInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/cdef.h b/src/dsp/cdef.h
new file mode 100644 (file)
index 0000000..ce23ea5
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CDEF_H_
+#define LIBGAV1_SRC_DSP_CDEF_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/cdef_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes matters because each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/cdef_avx2.h"
+#include "src/dsp/x86/cdef_sse4.h"
+// clang-format on
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+enum {
+  kCdefSecondaryTap0 = 2,
+  kCdefSecondaryTap1 = 1,
+};
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_CDEF_H_
diff --git a/src/dsp/cdef.inc b/src/dsp/cdef.inc
new file mode 100644 (file)
index 0000000..c1a3136
--- /dev/null
@@ -0,0 +1,29 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants used by the cdef implementations.
+// This file is included inside an anonymous namespace in the files where
+// these constants are necessary.
+
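+// The padded table carries two extra entries on each side (duplicating the
+// wrapped-around directions) so that kCdefDirections[direction + offset] with
+// offset in {-2, 2} stays in bounds for direction in [0, 7].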
+const int8_t (*const kCdefDirections)[2][2] = kCdefDirectionsPadded + 2;
+
+// Mirror values and pad to 16 elements.
+alignas(16) constexpr uint32_t kCdefDivisionTable[] = {
+    840, 420, 280, 210, 168, 140, 120, 105,
+    120, 140, 168, 210, 280, 420, 840, 0};
+
+// Used when calculating odd |cost[x]| values to mask off unwanted elements.
+// Holds elements 1 3 5 X 5 3 1 X
+alignas(16) constexpr uint32_t kCdefDivisionTableOdd[] = {420, 210, 140, 0,
+                                                          140, 210, 420, 0};
diff --git a/src/dsp/cdef_test.cc b/src/dsp/cdef_test.cc
new file mode 100644 (file)
index 0000000..e2db17a
--- /dev/null
@@ -0,0 +1,453 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr char kCdef[] = "Cdef";
+constexpr char kCdefDirectionName[] = "Cdef Direction";
+constexpr char kCdefFilterName[] = "Cdef Filtering";
+constexpr int kTestBufferStride = 8;
+constexpr int kTestBufferSize = 64;
+constexpr int kSourceStride = kMaxSuperBlockSizeInPixels + 2 * 8;
+constexpr int kSourceBufferSize =
+    (kMaxSuperBlockSizeInPixels + 2 * 3) * kSourceStride;
+constexpr int kNumSpeedTests = 5000;
+
+const char* GetDirectionDigest(const int bitdepth, const int num_runs) {
+  static const char* const kDigest[3][2] = {
+      {"de78c820a1fec7e81385aa0a615dbf8c", "7bfc543244f932a542691480dc4541b2"},
+      {"b54236de5d25e16c0f8678d9784cb85e", "559144cf183f3c69cb0e5d98cbf532ff"},
+      {"5532919a157c4f937da9e822bdb105f7", "dd9dfca6dfca83777d942e693c17627a"}};
+  const int bitdepth_index = (bitdepth - 8) / 2;
+  const int run_index = (num_runs == 1) ? 0 : 1;
+  return kDigest[bitdepth_index][run_index];
+}
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
+template <int bitdepth, typename Pixel>
+class CdefDirectionTest : public testing::TestWithParam<int> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  CdefDirectionTest() = default;
+  CdefDirectionTest(const CdefDirectionTest&) = delete;
+  CdefDirectionTest& operator=(const CdefDirectionTest&) = delete;
+  ~CdefDirectionTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    CdefInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_cdef_direction_ = nullptr;
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      CdefInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "AVX2/")) {
+      if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+      CdefInit_AVX2();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      CdefInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    cur_cdef_direction_ = dsp->cdef_direction;
+  }
+
+  void TestRandomValues(int num_runs);
+
+  Pixel buffer_[kTestBufferSize];
+  int strength_;
+  int size_;
+
+  CdefDirectionFunc base_cdef_direction_;
+  CdefDirectionFunc cur_cdef_direction_;
+};
+
+template <int bitdepth, typename Pixel>
+void CdefDirectionTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+  if (cur_cdef_direction_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  absl::Duration elapsed_time;
+  libvpx_test::MD5 actual_digest;
+  for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+    for (int level = 0; level < (1 << bitdepth); level += 1 + (bitdepth - 8)) {
+      for (int bits = 0; bits <= bitdepth; ++bits) {
+        for (auto& pixel : buffer_) {
+          pixel = Clip3((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+                        (1 << bitdepth) - 1);
+        }
+        int output[2] = {};
+        const absl::Time start = absl::Now();
+        cur_cdef_direction_(buffer_, kTestBufferStride * sizeof(Pixel),
+                            reinterpret_cast<uint8_t*>(&output[0]), &output[1]);
+        elapsed_time += absl::Now() - start;
+        actual_digest.Add(reinterpret_cast<const uint8_t*>(output),
+                          sizeof(output));
+      }
+    }
+  }
+  test_utils::CheckMd5Digest(kCdef, kCdefDirectionName,
+                             GetDirectionDigest(bitdepth, num_runs),
+                             actual_digest.Get(), elapsed_time);
+}
+
+using CdefDirectionTest8bpp = CdefDirectionTest<8, uint8_t>;
+
+TEST_P(CdefDirectionTest8bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefDirectionTest8bpp, DISABLED_Speed) {
+  TestRandomValues(kNumSpeedTests / 100);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest8bpp, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefDirectionTest8bpp, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CdefDirectionTest8bpp, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, CdefDirectionTest8bpp, testing::Values(0));
+#endif  // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using CdefDirectionTest10bpp = CdefDirectionTest<10, uint16_t>;
+
+TEST_P(CdefDirectionTest10bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefDirectionTest10bpp, DISABLED_Speed) {
+  TestRandomValues(kNumSpeedTests / 100);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest10bpp, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefDirectionTest10bpp, testing::Values(0));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using CdefDirectionTest12bpp = CdefDirectionTest<12, uint16_t>;
+
+TEST_P(CdefDirectionTest12bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefDirectionTest12bpp, DISABLED_Speed) {
+  TestRandomValues(kNumSpeedTests / 100);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest12bpp, testing::Values(0));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+const char* GetDigest8bpp(int id) {
+  static const char* const kDigest[] = {
+      "b6fe1a1f5bbb23e35197160ce57d90bd", "8aed39871b19184f1d381b145779bc33",
+      "82653dd66072e8ebd967083a0413ab03", "421c048396bc66ffaa6aafa016c7bc54",
+      "1f70ba51091e8c6034c3f0974af241c3", "8f700997452a24091136ca58890a5be4",
+      "9e3dea21ee4246172121f0420eccd899", "0848bdeffa74145758ef47992e1035c4",
+      "0bb55818de986e9d988b0c1cc6883887", "9b558a7eefc934f90cd09ca26b998bfd",
+      "3a38670f8c5f0c61cc47c9c79da728d2", "ed18fe91180e78008ccb98e9019bed69",
+      "2aa4bbcb6fb088ad42bde76be014dff0", "88f746f0d6c079ab8e9ecc7ff67524c7",
+      "7cffa948f5ddbccc7c6b07d15ca9eb69", "5e22c1c89735965dda935d1249129548",
+      "e765133d133b94e1578c8c5616248a96", "da95d47cad74eb4a075893ca98e658ab",
+  };
+  return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+  static const char* const kDigest[] = {
+      "0a9630b39974850998db653b07e09ab4", "97a924661d931b23ee57893da617ae70",
+      "0d79516b9a491ce5112eb00bbae5eb80", "d5801fd96029a7509cf66dde61e8e2d8",
+      "5bf5c0ea5a85e9b6c1e6991619c34ebc", "e2f1c08a8b3cd93b3a85511493a0ee31",
+      "45c047d2be5e2dcf6094937780a3f88a", "346caf437c1ad85862de72a622e29845",
+      "0e9cb69d24d9badbe956da779d912b05", "81803dcb00971237b3fe6372564a842f",
+      "17681ad2ed4a2456d70760852af6c6fd", "5312f8049a08a5f9b1708fda936f7a55",
+      "3f0f522f3a33e4ff2a97bdc1e614c5c4", "3818a50be7fe16aa0c636a7392d1eceb",
+      "c6849b8cd77a076dc7e3c26e8cd55b9e", "223c0dd685bbc74aec1d088356708433",
+      "90992957cb8103222aa2fb43c6cd2fc4", "a4ba6edcefe4130851c4c2607b147f95",
+  };
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+  static const char* const kDigest[] = {
+      "a32569989c42fd4254979f70c1c65f5a", "dc389048217633e2dd64126376be7d25",
+      "3b0e8dae294895330f349863b1773c39", "9741fe8d27d109cb99b7a9cdc030f52a",
+      "ab70f3729b52287c6432ba7624280a68", "c1e5cf39cbc8030b82e09633c6c67d42",
+      "d5120a196164ff5a0ad7aa8c02e9b064", "1133759f3aee3a362a0ab668f6faf843",
+      "feb0ab7f515665f79fce213e8cd2fb10", "e86ea55c2d6d5cc69716535bd455c99f",
+      "e463da1b9d089b6ee82c041794257fd7", "27800e4af0cceeaf0a95c96275a7befe",
+      "f42e426481db00582b327eb2971bca96", "6127ff289833dde0270000d8240f36b7",
+      "cc5dbaf70e2fef7729a8e2ea9937fbcf", "51850b4e3e2a3919e110376fcb6318d3",
+      "d5ac7ac25eb1b5aee293b2a2ec9de775", "64ecc00b2e24a2f07df833fb50ce09c3",
+  };
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+struct CdefTestParam {
+  CdefTestParam(int subsampling_x, int subsampling_y, int rows4x4,
+                int columns4x4)
+      : subsampling_x(subsampling_x),
+        subsampling_y(subsampling_y),
+        rows4x4(rows4x4),
+        columns4x4(columns4x4) {}
+  int subsampling_x;
+  int subsampling_y;
+  int rows4x4;
+  int columns4x4;
+};
+
+std::ostream& operator<<(std::ostream& os, const CdefTestParam& param) {
+  return os << "subsampling(x/y): " << param.subsampling_x << "/"
+            << param.subsampling_y << ", (rows,columns)4x4: " << param.rows4x4
+            << ", " << param.columns4x4;
+}
+
+// TODO(b/154245961): rework the parameters for this test to match
+// CdefFilteringFuncs. It should cover 4x4, 8x4, 8x8 blocks and
+// primary/secondary strength combinations for both Y and UV.
+template <int bitdepth, typename Pixel>
+class CdefFilteringTest : public testing::TestWithParam<CdefTestParam> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  CdefFilteringTest() = default;
+  CdefFilteringTest(const CdefFilteringTest&) = delete;
+  CdefFilteringTest& operator=(const CdefFilteringTest&) = delete;
+  ~CdefFilteringTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    CdefInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      CdefInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      CdefInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "AVX2/")) {
+      if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+      CdefInit_AVX2();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    memcpy(cur_cdef_filter_, dsp->cdef_filters, sizeof(cur_cdef_filter_));
+  }
+
+  void TestRandomValues(int num_runs);
+
+  uint16_t source_[kSourceBufferSize];
+  Pixel dest_[kMaxPlanes][kTestBufferSize];
+  int primary_strength_;
+  int secondary_strength_;
+  int damping_;
+  int direction_;
+  CdefTestParam param_ = GetParam();
+
+  CdefFilteringFuncs cur_cdef_filter_;
+};
+
+template <int bitdepth, typename Pixel>
+void CdefFilteringTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+  const int id = static_cast<int>(param_.rows4x4 < 4) * 3 +
+                 (param_.subsampling_x + param_.subsampling_y) * 6;
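+  // |id| indexes the digest tables: each subsampling configuration owns six
+  // entries (two block sizes x three planes), with |plane| added below.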
+  absl::Duration elapsed_time[kMaxPlanes];
+  for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+    for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+      const int subsampling_x = (plane == kPlaneY) ? 0 : param_.subsampling_x;
+      const int subsampling_y = (plane == kPlaneY) ? 0 : param_.subsampling_y;
+      const int block_width = 8 >> subsampling_x;
+      const int block_height = 8 >> subsampling_y;
+      libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() +
+                                 id + plane);
+      const int offset = 2 * kSourceStride + 2;
+      // Fill the boundaries with a large value (kCdefLargeValue) so that cdef
+      // excludes them from the calculation.
+      const int plane_width = MultiplyBy4(param_.columns4x4) >> subsampling_x;
+      const int plane_height = MultiplyBy4(param_.rows4x4) >> subsampling_y;
+      for (int y = 0; y < plane_height; ++y) {
+        for (int x = 0; x < plane_width; ++x) {
+          source_[y * kSourceStride + x + offset] =
+              rnd.Rand16() & ((1 << bitdepth) - 1);
+        }
+      }
+      for (int y = 0; y < 2; ++y) {
+        Memset(&source_[y * kSourceStride], kCdefLargeValue, kSourceStride);
+        Memset(&source_[(y + plane_height + 2) * kSourceStride],
+               kCdefLargeValue, kSourceStride);
+      }
+      for (int y = 0; y < plane_height; ++y) {
+        Memset(&source_[y * kSourceStride + offset - 2], kCdefLargeValue, 2);
+        Memset(&source_[y * kSourceStride + offset + plane_width],
+               kCdefLargeValue, 2);
+      }
+      do {
+        int strength = rnd.Rand16() & 15;
+        if (strength == 3) ++strength;
+        primary_strength_ = strength << (bitdepth - 8);
+      } while (primary_strength_ == 0);
+      do {
+        int strength = rnd.Rand16() & 3;
+        if (strength == 3) ++strength;
+        secondary_strength_ = strength << (bitdepth - 8);
+      } while (secondary_strength_ == 0);
+      damping_ = (rnd.Rand16() & 3) + 3;
+      direction_ = (rnd.Rand16() & 7);
+
+      memset(dest_[plane], 0, sizeof(dest_[plane]));
+      const absl::Time start = absl::Now();
+      const int width_index = block_width >> 3;
+      if (cur_cdef_filter_[width_index][0] == nullptr) return;
+      cur_cdef_filter_[width_index][0](
+          source_ + offset, kSourceStride, block_height, primary_strength_,
+          secondary_strength_, damping_, direction_, dest_[plane],
+          kTestBufferStride * sizeof(dest_[0][0]));
+      elapsed_time[plane] += absl::Now() - start;
+    }
+  }
+
+  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+    const char* expected_digest = nullptr;
+    switch (bitdepth) {
+      case 8:
+        expected_digest = GetDigest8bpp(id + plane);
+        break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      case 10:
+        expected_digest = GetDigest10bpp(id + plane);
+        break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+      case 12:
+        expected_digest = GetDigest12bpp(id + plane);
+        break;
+#endif
+    }
+    ASSERT_NE(expected_digest, nullptr);
+    test_utils::CheckMd5Digest(kCdef, kCdefFilterName, expected_digest,
+                               reinterpret_cast<uint8_t*>(dest_[plane]),
+                               sizeof(dest_[plane]), elapsed_time[plane]);
+  }
+}
+
+// Do not test single blocks with any subsampling. 2xH and Wx2 blocks are not
+// supported.
+const CdefTestParam cdef_test_param[] = {
+    CdefTestParam(0, 0, 4, 4), CdefTestParam(0, 0, 2, 2),
+    CdefTestParam(1, 0, 4, 4), CdefTestParam(1, 0, 2, 2),
+    CdefTestParam(1, 1, 4, 4), CdefTestParam(1, 1, 2, 2),
+};
+
+using CdefFilteringTest8bpp = CdefFilteringTest<8, uint8_t>;
+
+TEST_P(CdefFilteringTest8bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefFilteringTest8bpp, DISABLED_Speed) {
+  TestRandomValues(kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest8bpp,
+                         testing::ValuesIn(cdef_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefFilteringTest8bpp,
+                         testing::ValuesIn(cdef_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CdefFilteringTest8bpp,
+                         testing::ValuesIn(cdef_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, CdefFilteringTest8bpp,
+                         testing::ValuesIn(cdef_test_param));
+#endif  // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using CdefFilteringTest10bpp = CdefFilteringTest<10, uint16_t>;
+
+TEST_P(CdefFilteringTest10bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefFilteringTest10bpp, DISABLED_Speed) {
+  TestRandomValues(kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest10bpp,
+                         testing::ValuesIn(cdef_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefFilteringTest10bpp,
+                         testing::ValuesIn(cdef_test_param));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using CdefFilteringTest12bpp = CdefFilteringTest<12, uint16_t>;
+
+TEST_P(CdefFilteringTest12bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefFilteringTest12bpp, DISABLED_Speed) {
+  TestRandomValues(kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest12bpp,
+                         testing::ValuesIn(cdef_test_param));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/common.h b/src/dsp/common.h
new file mode 100644 (file)
index 0000000..d614a81
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_COMMON_H_
+#define LIBGAV1_SRC_DSP_COMMON_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum { kSgrStride = kRestorationUnitWidth + 32 };  // anonymous enum
+
+// Self guided projection filter.
+struct SgrProjInfo {
+  int index;
+  int multiplier[2];
+};
+
+struct WienerInfo {
+  static const int kVertical = 0;
+  static const int kHorizontal = 1;
+  int16_t number_leading_zero_coefficients[2];
+  alignas(kMaxAlignment) int16_t filter[2][(kWienerFilterTaps + 1) / 2];
+};
+
+struct RestorationUnitInfo : public MaxAlignedAllocable {
+  LoopRestorationType type;
+  SgrProjInfo sgr_proj_info;
+  WienerInfo wiener_info;
+};
+
+struct SgrBuffer {
+  alignas(kMaxAlignment) uint16_t sum3[4 * kSgrStride];
+  alignas(kMaxAlignment) uint16_t sum5[5 * kSgrStride];
+  alignas(kMaxAlignment) uint32_t square_sum3[4 * kSgrStride];
+  alignas(kMaxAlignment) uint32_t square_sum5[5 * kSgrStride];
+  alignas(kMaxAlignment) uint16_t ma343[4 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint16_t ma444[3 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint16_t ma565[2 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint32_t b343[4 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint32_t b444[3 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint32_t b565[2 * kRestorationUnitWidth];
+  // The following 2 buffers are only used by the C functions. Since SgrBuffer
+  // is smaller than |wiener_buffer| in RestorationBuffer, which is a union,
+  // it's OK to always keep them.
+  alignas(kMaxAlignment) uint8_t ma[kSgrStride];  // [0, 255]
+  // |b| is less than 2^16 for 8-bit input. However, templating the type slows
+  // down the C function by 5%, so |b| is fixed at 32 bits.
+  alignas(kMaxAlignment) uint32_t b[kSgrStride];
+};
+
+union RestorationBuffer {
+  // For self-guided filter.
+  SgrBuffer sgr_buffer;
+  // For wiener filter.
+  // The array |intermediate| in Section 7.17.4, the intermediate results
+  // between the horizontal and vertical filters.
+  alignas(kMaxAlignment) int16_t
+      wiener_buffer[(kRestorationUnitHeight + kWienerFilterTaps - 1) *
+                    kRestorationUnitWidth];
+};
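+
+// A compile-time sketch (not part of the original header) of the size
+// relationship that the comment in SgrBuffer relies on: keeping |ma| and |b|
+// unconditionally is free only as long as SgrBuffer does not exceed
+// |wiener_buffer|, the member expected to determine the union's size.
+static_assert(sizeof(SgrBuffer) <= sizeof(RestorationBuffer::wiener_buffer),
+              "SgrBuffer is expected to fit within |wiener_buffer|");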
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_COMMON_H_
diff --git a/src/dsp/common_dsp_test.cc b/src/dsp/common_dsp_test.cc
new file mode 100644 (file)
index 0000000..3342ce8
--- /dev/null
@@ -0,0 +1,58 @@
+// Copyright 2023 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "absl/strings/match.h"
+#include "gtest/gtest.h"
+#include "src/dsp/x86/common_avx2_test.h"
+#include "src/dsp/x86/common_sse4_test.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+class CommonDspTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->name();
+    if (absl::StartsWith(test_case, "SSE41")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+    } else if (absl::StartsWith(test_case, "AVX2")) {
+      if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+  }
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CommonDspTest);
+
+#if LIBGAV1_ENABLE_AVX2
+TEST_F(CommonDspTest, AVX2RightShiftWithRoundingS16) {
+  AVX2RightShiftWithRoundingS16Test();
+}
+#endif  // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_ENABLE_SSE4_1
+TEST_F(CommonDspTest, SSE41RightShiftWithRoundingS16) {
+  SSE41RightShiftWithRoundingS16Test();
+}
+#endif  // LIBGAV1_ENABLE_SSE4_1
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/constants.cc b/src/dsp/constants.cc
new file mode 100644 (file)
index 0000000..1b85795
--- /dev/null
@@ -0,0 +1,103 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/constants.h"
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// Each set of 7 taps is padded with a 0 so that it aligns and packs into the
+// high and low 8 bytes. This way, 16 values can be loaded at a time to feed
+// mulhi and mullo.
+alignas(16) const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = {
+    {{-6, 10, 0, 0, 0, 12, 0, 0},
+     {-5, 2, 10, 0, 0, 9, 0, 0},
+     {-3, 1, 1, 10, 0, 7, 0, 0},
+     {-3, 1, 1, 2, 10, 5, 0, 0},
+     {-4, 6, 0, 0, 0, 2, 12, 0},
+     {-3, 2, 6, 0, 0, 2, 9, 0},
+     {-3, 2, 2, 6, 0, 2, 7, 0},
+     {-3, 1, 2, 2, 6, 3, 5, 0}},
+    {{-10, 16, 0, 0, 0, 10, 0, 0},
+     {-6, 0, 16, 0, 0, 6, 0, 0},
+     {-4, 0, 0, 16, 0, 4, 0, 0},
+     {-2, 0, 0, 0, 16, 2, 0, 0},
+     {-10, 16, 0, 0, 0, 0, 10, 0},
+     {-6, 0, 16, 0, 0, 0, 6, 0},
+     {-4, 0, 0, 16, 0, 0, 4, 0},
+     {-2, 0, 0, 0, 16, 0, 2, 0}},
+    {{-8, 8, 0, 0, 0, 16, 0, 0},
+     {-8, 0, 8, 0, 0, 16, 0, 0},
+     {-8, 0, 0, 8, 0, 16, 0, 0},
+     {-8, 0, 0, 0, 8, 16, 0, 0},
+     {-4, 4, 0, 0, 0, 0, 16, 0},
+     {-4, 0, 4, 0, 0, 0, 16, 0},
+     {-4, 0, 0, 4, 0, 0, 16, 0},
+     {-4, 0, 0, 0, 4, 0, 16, 0}},
+    {{-2, 8, 0, 0, 0, 10, 0, 0},
+     {-1, 3, 8, 0, 0, 6, 0, 0},
+     {-1, 2, 3, 8, 0, 4, 0, 0},
+     {0, 1, 2, 3, 8, 2, 0, 0},
+     {-1, 4, 0, 0, 0, 3, 10, 0},
+     {-1, 3, 4, 0, 0, 4, 6, 0},
+     {-1, 2, 3, 4, 0, 4, 4, 0},
+     {-1, 2, 2, 3, 4, 3, 3, 0}},
+    {{-12, 14, 0, 0, 0, 14, 0, 0},
+     {-10, 0, 14, 0, 0, 12, 0, 0},
+     {-9, 0, 0, 14, 0, 11, 0, 0},
+     {-8, 0, 0, 0, 14, 10, 0, 0},
+     {-10, 12, 0, 0, 0, 0, 14, 0},
+     {-9, 1, 12, 0, 0, 0, 12, 0},
+     {-8, 0, 0, 12, 0, 1, 11, 0},
+     {-7, 0, 0, 1, 12, 1, 9, 0}}};
+
+// A lookup table replacing the calculation of the variable s in Section 7.17.3
+// (Box filter process). The first index is sgr_proj_index (the lr_sgr_set
+// syntax element in the Spec, saved in the sgr_proj_info.index field of a
+// RestorationUnitInfo struct). The second index is pass (0 or 1).
+//
+//   const uint8_t scale = kSgrProjParams[sgr_proj_index][pass * 2 + 1];
+//   const uint32_t n2_with_scale = n * n * scale;
+//   const uint32_t s =
+//       ((1 << kSgrProjScaleBits) + (n2_with_scale >> 1)) / n2_with_scale;
+// 0 is an invalid value, corresponding to radius = 0, where the filter is
+// skipped.
+const uint16_t kSgrScaleParameter[16][2] = {
+    {140, 3236}, {112, 2158}, {93, 1618}, {80, 1438}, {70, 1295}, {58, 1177},
+    {47, 1079},  {37, 996},   {30, 925},  {25, 863},  {0, 2589},  {0, 1618},
+    {0, 1177},   {0, 925},    {56, 0},    {22, 0},
+};
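+
+// Worked example (a sketch; it assumes kSgrProjScaleBits == 20 and
+// kSgrProjParams[0] == {2, 12, 1, 4}, as defined elsewhere in the library):
+// for sgr_proj_index = 0 and pass = 0 the radius is 2, so the window covers
+// n = (2 * 2 + 1) * (2 * 2 + 1) = 25 pixels and scale = 12. Then
+// n2_with_scale = 25 * 25 * 12 = 7500 and
+// s = ((1 << 20) + (7500 >> 1)) / 7500 = 140, the first entry above.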
+
+const uint8_t kCdefPrimaryTaps[2][2] = {{4, 2}, {3, 3}};
+
+// This is Cdef_Directions (section 7.15.3) with 2 padding entries at the
+// beginning and end of the table. The cdef direction range is [0, 7], and the
+// first index may be offset by +/-2. The padding removes the need to wrap the
+// first index back into that range with, e.g., & 7.
+const int8_t kCdefDirectionsPadded[12][2][2] = {
+    {{1, 0}, {2, 0}},    // Padding: Cdef_Directions[6]
+    {{1, 0}, {2, -1}},   // Padding: Cdef_Directions[7]
+    {{-1, 1}, {-2, 2}},  // Begin Cdef_Directions
+    {{0, 1}, {-1, 2}},   //
+    {{0, 1}, {0, 2}},    //
+    {{0, 1}, {1, 2}},    //
+    {{1, 1}, {2, 2}},    //
+    {{1, 0}, {2, 1}},    //
+    {{1, 0}, {2, 0}},    //
+    {{1, 0}, {2, -1}},   // End Cdef_Directions
+    {{-1, 1}, {-2, 2}},  // Padding: Cdef_Directions[0]
+    {{0, 1}, {-1, 2}},   // Padding: Cdef_Directions[1]
+};
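+
+// Usage sketch (illustrative): for a direction |d| in [0, 7] and a tap offset
+// |delta| in [-2, 2], kCdefDirectionsPadded[d + 2 + delta] is equivalent to
+// Cdef_Directions[(d + delta) & 7], with no masking of the index required.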
+
+}  // namespace libgav1
diff --git a/src/dsp/constants.h b/src/dsp/constants.h
new file mode 100644 (file)
index 0000000..dd0a4e0
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CONSTANTS_H_
+#define LIBGAV1_SRC_DSP_CONSTANTS_H_
+
+// This file contains DSP-related constants that are tied directly to a
+// specific DSP component.
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+enum {
+  kCflLumaBufferStride = 32,
+};  // anonymous enum
+
+extern const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8];
+
+// Values in this enum can be derived as the sum of subsampling_x and
+// subsampling_y (since the subsampling_x == 0 && subsampling_y == 1 case is
+// never allowed by the bitstream).
+enum SubsamplingType : uint8_t {
+  kSubsamplingType444,  // subsampling_x = 0, subsampling_y = 0.
+  kSubsamplingType422,  // subsampling_x = 1, subsampling_y = 0.
+  kSubsamplingType420,  // subsampling_x = 1, subsampling_y = 1.
+  kNumSubsamplingTypes
+};
+
+extern const uint16_t kSgrScaleParameter[16][2];
+
+extern const uint8_t kCdefPrimaryTaps[2][2];
+
+extern const int8_t kCdefDirectionsPadded[12][2][2];
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_CONSTANTS_H_
diff --git a/src/dsp/convolve.cc b/src/dsp/convolve.cc
new file mode 100644 (file)
index 0000000..6989da0
--- /dev/null
@@ -0,0 +1,968 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kHorizontalOffset = 3;
+constexpr int kVerticalOffset = 3;
+
+// Compound prediction output ranges from ConvolveTest.ShowRange.
+// In some cases, the horizontal or vertical filter will be omitted. This table
+// shows the general case, where the downscaled horizontal output is input to
+// the vertical filter via the |intermediate_result| array. The final output is
+// either Pixel or compound values, depending on the |is_compound| variable.
+// Bitdepth:  8 Input range:            [       0,      255]
+//   Horizontal upscaled range:         [   -7140,    23460]
+//   Horizontal downscaled range:       [   -1785,     5865]
+//   Vertical upscaled range:           [ -328440,   589560]
+//   Pixel output range:                [       0,      255]
+//   Compound output range:             [   -5132,     9212]
+//
+// Bitdepth: 10 Input range:            [       0,     1023]
+//   Horizontal upscaled range:         [  -28644,    94116]
+//   Horizontal downscaled range:       [   -7161,    23529]
+//   Vertical upscaled range:           [-1317624,  2365176]
+//   Pixel output range:                [       0,     1023]
+//   Compound output range:             [    3988,    61532]
+//
+// Bitdepth: 12 Input range:            [       0,     4095]
+//   Horizontal upscaled range:         [ -114660,   376740]
+//   Horizontal downscaled range:       [   -7166,    23546]
+//   Vertical upscaled range:           [-1318560,  2366880]
+//   Pixel output range:                [       0,     4095]
+//   Compound output range:             [    3974,    61559]
+
+template <int bitdepth, typename Pixel>
+void ConvolveScale2D_C(const void* LIBGAV1_RESTRICT const reference,
+                       const ptrdiff_t reference_stride,
+                       const int horizontal_filter_index,
+                       const int vertical_filter_index, const int subpixel_x,
+                       const int subpixel_y, const int step_x, const int step_y,
+                       const int width, const int height,
+                       void* LIBGAV1_RESTRICT prediction,
+                       const ptrdiff_t pred_stride) {
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical =
+      (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      kSubPixelTaps;
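+  // For example (an illustrative sketch; it assumes kScaleSubPixelBits == 10
+  // and kSubPixelTaps == 8): with height = 8 and step_y = 2048, i.e., a 2x
+  // vertical downscale, this is ((7 * 2048 + 1023) >> 10) + 8 = 22
+  // intermediate rows.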
+  // The output of the horizontal filter, i.e., |intermediate_result|, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (2 * kMaxSuperBlockSizeInPixels + 8)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+  const int max_pixel_value = (1 << bitdepth) - 1;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  // Note: the input |src| is assumed to already be aligned to the correct
+  // start position.
+  int y = 0;
+  do {
+    int p = subpixel_x;
+    int x = 0;
+    do {
+      int sum = 0;
+      const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+      const int filter_id = (p >> 6) & kSubPixelMask;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      p += step_x;
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  int p = subpixel_y & 1023;
+  y = 0;
+  do {
+    const int filter_id = (p >> 6) & kSubPixelMask;
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum +=
+            kHalfSubPixelFilters[filter_index][filter_id][k] *
+            intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+                         x];
+      }
+      dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+                      max_pixel_value);
+    } while (++x < width);
+
+    dest += dest_stride;
+    p += step_y;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundScale2D_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int subpixel_x, const int subpixel_y,
+    const int step_x, const int step_y, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      kSubPixelTaps;
+  // The output of the horizontal filter, i.e., |intermediate_result|, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (2 * kMaxSuperBlockSizeInPixels + 8)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  // Note: the input |src| is assumed to already be aligned to the correct
+  // start position.
+  int y = 0;
+  do {
+    int p = subpixel_x;
+    int x = 0;
+    do {
+      int sum = 0;
+      const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+      const int filter_id = (p >> 6) & kSubPixelMask;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      p += step_x;
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  int p = subpixel_y & 1023;
+  y = 0;
+  do {
+    const int filter_id = (p >> 6) & kSubPixelMask;
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum +=
+            kHalfSubPixelFilters[filter_index][filter_id][k] *
+            intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+                         x];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+
+    dest += pred_stride;
+    p += step_y;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompound2D_C(const void* LIBGAV1_RESTRICT const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int vertical_filter_index,
+                          const int horizontal_filter_id,
+                          const int vertical_filter_id, const int width,
+                          const int height, void* LIBGAV1_RESTRICT prediction,
+                          const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+  const int intermediate_height = height + kSubPixelTaps - 1;
+  // The output of the horizontal filter, i.e., |intermediate_result|, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src = static_cast<const Pixel*>(reference) -
+                    kVerticalOffset * src_stride - kHorizontalOffset;
+  auto* dest = static_cast<uint16_t*>(prediction);
+
+  // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+  assert(horizontal_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+  assert(vertical_filter_id != 0);
+  y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               intermediate[k * intermediate_stride + x];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+
+    dest += pred_stride;
+    intermediate += intermediate_stride;
+  } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is used in single prediction mode, when both horizontal and vertical
+// filtering are required.
+// The output is the single prediction for the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void Convolve2D_C(const void* LIBGAV1_RESTRICT const reference,
+                  const ptrdiff_t reference_stride,
+                  const int horizontal_filter_index,
+                  const int vertical_filter_index,
+                  const int horizontal_filter_id, const int vertical_filter_id,
+                  const int width, const int height,
+                  void* LIBGAV1_RESTRICT prediction,
+                  const ptrdiff_t pred_stride) {
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical =
+      (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+  const int intermediate_height = height + kSubPixelTaps - 1;
+  // The output of the horizontal filter, i.e., |intermediate_result|, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+  const int max_pixel_value = (1 << bitdepth) - 1;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src = static_cast<const Pixel*>(reference) -
+                    kVerticalOffset * src_stride - kHorizontalOffset;
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+  assert(horizontal_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+  assert(vertical_filter_id != 0);
+  y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               intermediate[k * intermediate_stride + x];
+      }
+      dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+                      max_pixel_value);
+    } while (++x < width);
+
+    dest += dest_stride;
+    intermediate += intermediate_stride;
+  } while (++y < height);
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is used in single prediction mode, when only horizontal filtering is
+// required.
+// The output is the single prediction for the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void ConvolveHorizontal_C(const void* LIBGAV1_RESTRICT const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int /*vertical_filter_index*/,
+                          const int horizontal_filter_id,
+                          const int /*vertical_filter_id*/, const int width,
+                          const int height, void* LIBGAV1_RESTRICT prediction,
+                          const ptrdiff_t pred_stride) {
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int bits = kFilterBits - kRoundBitsHorizontal;
+  const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const int max_pixel_value = (1 << bitdepth) - 1;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      dest[x] = Clip3(RightShiftWithRounding(sum, bits), 0, max_pixel_value);
+    } while (++x < width);
+
+    src += src_stride;
+    dest += dest_stride;
+  } while (++y < height);
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is used in single prediction mode, when only vertical filtering is
+// required.
+// The output is the single prediction for the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void ConvolveVertical_C(const void* LIBGAV1_RESTRICT const reference,
+                        const ptrdiff_t reference_stride,
+                        const int /*horizontal_filter_index*/,
+                        const int vertical_filter_index,
+                        const int /*horizontal_filter_id*/,
+                        const int vertical_filter_id, const int width,
+                        const int height, void* LIBGAV1_RESTRICT prediction,
+                        const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src =
+      static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  // Copy filters must call ConvolveCopy().
+  assert(vertical_filter_id != 0);
+
+  const int max_pixel_value = (1 << bitdepth) - 1;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               src[k * src_stride + x];
+      }
+      dest[x] = Clip3(RightShiftWithRounding(sum, kFilterBits - 1), 0,
+                      max_pixel_value);
+    } while (++x < width);
+
+    src += src_stride;
+    dest += dest_stride;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCopy_C(const void* LIBGAV1_RESTRICT const reference,
+                    const ptrdiff_t reference_stride,
+                    const int /*horizontal_filter_index*/,
+                    const int /*vertical_filter_index*/,
+                    const int /*horizontal_filter_id*/,
+                    const int /*vertical_filter_id*/, const int width,
+                    const int height, void* LIBGAV1_RESTRICT prediction,
+                    const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+  int y = 0;
+  do {
+    memcpy(dest, src, width * sizeof(Pixel));
+    src += reference_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundCopy_C(const void* LIBGAV1_RESTRICT const reference,
+                            const ptrdiff_t reference_stride,
+                            const int /*horizontal_filter_index*/,
+                            const int /*vertical_filter_index*/,
+                            const int /*horizontal_filter_id*/,
+                            const int /*vertical_filter_id*/, const int width,
+                            const int height, void* LIBGAV1_RESTRICT prediction,
+                            const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsVertical =
+      ((bitdepth == 12) ? kInterRoundBitsVertical12bpp
+                        : kInterRoundBitsVertical) -
+      kInterRoundBitsCompoundVertical;
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = (bitdepth == 8) ? 0 : ((1 << bitdepth) + (1 << (bitdepth - 1)));
+      sum += src[x];
+      dest[x] = sum << kRoundBitsVertical;
+    } while (++x < width);
+    src += src_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is used in compound prediction mode, when only horizontal filtering is
+// required.
+// The output is not clipped to the valid pixel range; it will be blended with
+// another predictor to generate the final prediction for the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundHorizontal_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  // Copy filters must call ConvolveCopy().
+  assert(horizontal_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+
+    src += src_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is used in compound prediction mode, when only vertical filtering is
+// required.
+// The output is not clipped to the valid pixel range; it will be blended with
+// another predictor to generate the final prediction for the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundVertical_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src =
+      static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  // Copy filters must call ConvolveCopy().
+  assert(vertical_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               src[k * src_stride + x];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+    src += src_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is used in single prediction mode for the U/V planes, where the
+// reference block comes from the current frame and both horizontal and
+// vertical filtering are required.
+// The output is the single prediction for the block, clipped to the valid
+// pixel range.
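+// For example (illustrative): a 2x2 input neighborhood {{8, 16}, {24, 32}}
+// produces (8 + 16 + 24 + 32 + 2) >> 2 = 20 for the corresponding output
+// pixel.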
+template <int bitdepth, typename Pixel>
+void ConvolveIntraBlockCopy2D_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const int intermediate_height = height + 1;
+  uint16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                               (kMaxSuperBlockSizeInPixels + 1)];
+  uint16_t* intermediate = intermediate_result;
+  // Note: vertical access of height + 1 rows is allowed. Because this
+  // function is only used for the U/V planes of intra block copy, such access
+  // is guaranteed to stay within the prediction block.
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      intermediate[x] = src[x] + src[x + 1];
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += width;
+  } while (++y < intermediate_height);
+
+  intermediate = intermediate_result;
+  y = 0;
+  do {
+    int x = 0;
+    do {
+      dest[x] =
+          RightShiftWithRounding(intermediate[x] + intermediate[x + width], 2);
+    } while (++x < width);
+
+    intermediate += width;
+    dest += dest_stride;
+  } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is used in single prediction mode for the U/V planes, where the
+// reference block comes from the current frame and only horizontal or
+// vertical filtering is required.
+// The output is the single prediction for the block, clipped to the valid
+// pixel range.
+// The intra block copy filter is simply the average of the current pixel and
+// the next one.
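+// For example (illustrative): a horizontal pass over the row {10, 20, 30}
+// produces (10 + 20 + 1) >> 1 = 15 and (20 + 30 + 1) >> 1 = 25.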
+template <int bitdepth, typename Pixel, bool is_horizontal>
+void ConvolveIntraBlockCopy1D_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const ptrdiff_t offset = is_horizontal ? 1 : src_stride;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      dest[x] = RightShiftWithRounding(src[x] + src[x + offset], 1);
+    } while (++x < width);
+
+    src += src_stride;
+    dest += dest_stride;
+  } while (++y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+  dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
+#else  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCopy
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+  dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
+#endif
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+  dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>;
+  dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>;
+
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>;
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>;
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>;
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>;
+#else  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCopy
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveHorizontal
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveVertical
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Convolve2D
+  dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundCopy
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundHorizontal
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundVertical
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompound2D
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopy
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockHorizontal
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockVertical
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlock2D
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>;
+#endif
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveScale2D
+  dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundScale2D
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>;
+  dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>;
+
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>;
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>;
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>;
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>;
+#else  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCopy
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveHorizontal
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveVertical
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Convolve2D
+  dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundCopy
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundHorizontal
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundVertical
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompound2D
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockCopy
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockHorizontal
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockVertical
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlock2D
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>;
+#endif
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveScale2D
+  dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundScale2D
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void ConvolveInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/convolve.h b/src/dsp/convolve.h
new file mode 100644 (file)
index 0000000..8780bfc
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CONVOLVE_H_
+#define LIBGAV1_SRC_DSP_CONVOLVE_H_
+
+#include <cassert>
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/convolve_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important because each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/convolve_avx2.h"
+#include "src/dsp/x86/convolve_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve and Dsp::convolve_scale. This function is not
+// thread-safe.
+void ConvolveInit_C();
+
+inline int GetNumTapsInFilter(const int filter_index) {
+  if (filter_index < 2) {
+    // Despite the names these only use 6 taps.
+    // kInterpolationFilterEightTap
+    // kInterpolationFilterEightTapSmooth
+    return 6;
+  }
+
+  if (filter_index == 2) {
+    // kInterpolationFilterEightTapSharp
+    return 8;
+  }
+
+  if (filter_index == 3) {
+    // kInterpolationFilterBilinear
+    return 2;
+  }
+
+  assert(filter_index > 3);
+  // For small sizes (width/height <= 4), the large filters are replaced with
+  // 4-tap options.
+  // If the original filter was |kInterpolationFilterEightTap| or
+  // |kInterpolationFilterEightTapSharp|, it becomes
+  // |kInterpolationFilterSwitchable|.
+  // If it was |kInterpolationFilterEightTapSmooth|, it becomes an unnamed
+  // 4-tap filter.
+  return 4;
+}
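+
+// Example (illustrative): a 64-wide block coded with
+// kInterpolationFilterEightTapSharp keeps filter_index == 2 and uses 8 taps,
+// while a block with width <= 4 is remapped by GetFilterIndex() into the
+// 4-tap index range [4, 5] described above.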
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_CONVOLVE_H_
diff --git a/src/dsp/convolve.inc b/src/dsp/convolve.inc
new file mode 100644 (file)
index 0000000..2e0b270
--- /dev/null
@@ -0,0 +1,22 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants used for convolve implementations.
+// This file is included inside an anonymous namespace in the files where
+// these constants are needed.
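+//
+// Intended include pattern, sketched for illustration (the including file
+// supplies the anonymous namespace):
+//   namespace {
+//   #include "src/dsp/convolve.inc"
+//   }  // namespace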
+
+constexpr int kIntermediateAllocWidth = kMaxSuperBlockSizeInPixels;
+constexpr int kIntermediateStride = 8;
+constexpr int kHorizontalOffset = 3;
+constexpr int kFilterIndexShift = 6;
diff --git a/src/dsp/convolve_test.cc b/src/dsp/convolve_test.cc
new file mode 100644 (file)
index 0000000..b8c1f1d
--- /dev/null
@@ -0,0 +1,1545 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// The convolve function will access at most (block_height + 7) rows/columns
+// from the beginning.
+constexpr int kMaxBlockWidth = kMaxSuperBlockSizeInPixels + kSubPixelTaps;
+constexpr int kMaxBlockHeight = kMaxSuperBlockSizeInPixels + kSubPixelTaps;
+
+// Test all the filters in |kSubPixelFilters|. There are 6 different filters but
+// filters [4] and [5] are only reached through GetFilterIndex().
+constexpr int kMinimumViableRuns = 4 * 16;
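+// (4 * 16: |{vertical,horizontal}_index| cycle through 4 values and the
+// sub-pixel filter ids through 16 phases in Test(); filters [4] and [5] are
+// presumably covered via GetFilterIndex() for small blocks.)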
+
+struct ConvolveTestParam {
+  enum BlockSize {
+    kBlockSize2x2,
+    kBlockSize2x4,
+    kBlockSize4x2,
+    kBlockSize4x4,
+    kBlockSize4x8,
+    kBlockSize8x2,
+    kBlockSize8x4,
+    kBlockSize8x8,
+    kBlockSize8x16,
+    kBlockSize16x8,
+    kBlockSize16x16,
+    kBlockSize16x32,
+    kBlockSize32x16,
+    kBlockSize32x32,
+    kBlockSize32x64,
+    kBlockSize64x32,
+    kBlockSize64x64,
+    kBlockSize64x128,
+    kBlockSize128x64,
+    kBlockSize128x128,
+    kNumBlockSizes
+  };
+
+  static constexpr int kBlockWidth[kNumBlockSizes] = {
+      2, 2, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, 64, 128, 128};
+  static constexpr int kBlockHeight[kNumBlockSizes] = {
+      2, 4, 2, 4, 8, 2, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64, 128, 64, 128};
+
+  explicit ConvolveTestParam(BlockSize block_size)
+      : block_size(block_size),
+        width(kBlockWidth[block_size]),
+        height(kBlockHeight[block_size]) {}
+
+  BlockSize block_size;
+  int width;
+  int height;
+};
+
+#if !LIBGAV1_CXX17
+constexpr int ConvolveTestParam::kBlockWidth[kNumBlockSizes];   // static.
+constexpr int ConvolveTestParam::kBlockHeight[kNumBlockSizes];  // static.
+#endif
+
+const char* GetConvolveDigest8bpp(int id) {
+  // Entries containing 'XXXXX...' are skipped. See the test for details.
+  static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 16] = {
+      "ae5977a4ceffbac0cde72a04a43a9d57", "6cf5f791fe0d8dcd3526be3c6b814035",
+      "d905dfcad930aded7718587c05b48aaf", "6baf153feff04cc5b7e87c0bb60a905d",
+      "871ed5a69ca31e6444faa720895949bf", "c9cf1deba08dac5972b3b0a43eff8f98",
+      "68e2f90eaa0ab5da7e6f5776993f7eea", "f1f8282fb33c30eb68c0c315b7a4bc01",
+      "9412064b0eebf8123f23d74147d04dff", "cc08936effe309ab9a4fa1bf7e28e24e",
+      "36cbef36fa21b98df03536c918bf752a", "9d0da6321cf5311ea0bdd41271763030",
+      "55a10165ee8a660d7dddacf7de558cdd", "ac7fc9f9ea7213743fae5a023faaaf08",
+      "077e1b7b355c7ab3ca40230ee8efd8ea", "7a3e8de2a1caae206cf3e51a86dfd15a",
+      "1ddf9020f18fa7883355cf8c0881186a", "2377dd167ef2707978bed6f10ffd4e76",
+      "f918e0e4422967c6a7e47298135c7ae9", "b2264e129636368b5496760b39e64b7a",
+      "1168251e6261e2ff1fa69a93226dbd76", "4821befdf63f8c6da6440afeb57f320f",
+      "c30fc44d83821141e84cc4793e127301", "a8293b933d9f2e5d7f922ea40111d643",
+      "354a54861a94e8b027afd9931e61f997", "b384e9e3d81f9f4f9024028fbe451d8b",
+      "eeeb8589c1b31cbb565154736ca939ec", "f49dab626ddd977ed171f79295c24935",
+      "78d2f27e0d4708cb16856d7d40dc16fb", "9d2393ea156a1c2083f5b4207793064b",
+      "a9c62745b95c66fa497a524886af57e2", "2c614ec4463386ec075a0f1dbb587933",
+      "7a8856480d752153370240b066b90f6a", "beaef1dbffadc701fccb7c18a03e3a41",
+      "72b1e700c949d06eaf62d664dafdb5b6", "684f5c3a25a080edaf79add6e9137a8e",
+      "3be970f49e4288988818b087201d54da", "d2b9dba2968894a414756bb510ac389a",
+      "9a3215eb97aedbbddd76c7440837d040", "4e317feac6da46addf0e8b9d8d54304b",
+      "d2f5ca2b7958c332a3fb771f66da01f0", "7aec92c3b65e456b64ae285c12b03b0d",
+      "f72a99ad63f6a88c23724e898b705d21", "07a1f07f114c4a38ba08d2f44e1e1132",
+      "26b9de95edb45b31ac5aa19825831c7a", "4e4677a0623d44237eb8d6a622cdc526",
+      "c1b836a6ce023663b90db0e320389414", "5befcf222152ebc8d779fcc10b95320a",
+      "62adf407fc27d8682ced4dd7b55af14e", "35be0786a072bf2f1286989261bf6580",
+      "90562fc42dc5d879ae74c4909c1dec30", "a1427352f9e413975a0949e2b300c657",
+      "bcbc418bc2beb243e463851cd95335a9", "cb8fedcbecee3947358dc61f95e56530",
+      "0d0154a7d573685285a83a4cf201ac57", "b14bd8068f108905682b83cc15778065",
+      "c96c867d998473197dde9b587be14e3a", "f596c63c7b14cada0174e17124c83942",
+      "eb2822ad8204ed4ecbf0f30fcb210498", "538ce869ffd23b6963e61badfab7712b",
+      "6bbcc075f8b768a02cdc9149f150326d", "4ae70d9db2ec36885394db7d59bdd4f7",
+      "5fee162fe52c11c823db4d5ede370654", "9365186c59ef66d9def40f437022ad93",
+      "0f95fb0276c9c7910937fbdf75f2811d", "356d4003477283e157c8d2b5a79d913c",
+      "b355dab2dbb6f5869018563eece22862", "cf6ff8c43d8059cea6090a23ab66a0ef",
+      "a336f8b7bcf188840ca65c0d0e66518a", "de953f03895923359c6a719e6a537b89",
+      "8463ade9347ed602663e2cec5c4c3fe6", "392de11ffcd5c2ecf3db3480ee135340",
+      "bddd31e3e852712e6244b616622af83d", "30a36245c40d978fc8976b442a8600c3",
+      "93aa662b988b8502e5ea95659eafde59", "70440ba9ee7f9d16d297dbb49e54a56e",
+      "1eb2be4c05b50e427e29c72fa566bff5", "52c0980bae63e8459e82eee7d8af2334",
+      "75e57104d6058cd2bce1d3d8142d273d", "b4c735269ade44419169adbd852d5ddc",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "a7305087fae23de53d21a6909009ff69",
+      "8dcce009395264379c1a51239f4bb22c", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "8dcce009395264379c1a51239f4bb22c", "d90a69e7bae8aa46ed0e1e5f911d7a07",
+      "6ab4dc87be03be1dcc5d956ca819d938", "6ab4dc87be03be1dcc5d956ca819d938",
+      "8f2afdb2f03cd04ffacd421b958caaa0", "710ccecc103033088d898a2b924551fb",
+      "710ccecc103033088d898a2b924551fb", "a4093e3e5902dd659407ce6471635a4e",
+      "375d7f5358d7a088a498b8b3aaecc0d5", "375d7f5358d7a088a498b8b3aaecc0d5",
+      "08867ea5cc38c705ec52af821bc4736a", "2afb540e8063f58d1b03896486c5e89b",
+      "2afb540e8063f58d1b03896486c5e89b", "6ce47b11d2e60c5d183c84ce9f2e46cc",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "a5a1ac658d7ce4a846a32b9fcfaa3475",
+      "2370f4e4a83edf91b7f504bbe4b00e90", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "ae5464066a049622a7a264cdf9394b55", "45368b6db3d1fee739a64b0bc823ea9c",
+      "8dff0f28192d9f8c0bf7fb5405719dd8", "632738ef3ff3021cff45045c41978849",
+      "f7ec43384037e8d6c618e0df826ec029", "a6bc648197781a2dc99c487e66464320",
+      "1112ebd509007154c72c5a485b220b62", "9714c4ce636b6fb0ad05cba246d48c76",
+      "2c93dde8884f09fb5bb5ad6d95cde86d", "a49e6160b5d1b56bc2046963101cd606",
+      "7f084953976111e9f65b57876e7552b1", "0846ec82555b66197c5c45b08240fbcc",
+      "ca7471c126ccd22189e874f0a6e41960", "0802b6318fbd0969a33de8fdfcd07f10",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "3b1ceebf0579fcbbfd6136938c595b91",
+      "ecafabcad1045f15d31ce2f3b13132f2", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "5f211eba020e256a5781b203c5aa1d2e", "3b04497634364dd2cd3f2482b5d4b32f",
+      "a8ac7b5dc65ffb758b0643508a0e744e", "561ed8be43c221a561f8885a0d74c7ef",
+      "8159619fc234598c8c75154d80021fd4", "8f43645dce92cf7594aa4822aa53b17d",
+      "b6ccddb7dfa4eddc87b4eff08b5a3195", "b4e605327b28db573d88844a1a09db8d",
+      "15b00a15d1cc6cc96ca85d00b167e4dd", "7bf911888c11a9fefd604b8b9c82e9a1",
+      "bfb69b4d7d4aed73cfa75a0f55b66440", "034d1d62581bd0d840c4cf1e28227931",
+      "8cba849640e9e2859d509bc81ca94acd", "bc79acf2a0fe419194cdb4529bc7dcc8",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "3bfad931bce82335219e0e29c15f2b21",
+      "68a701313d2247d2b32636ebc1f2a008", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "05afe1f40d37a45a97a5e0aadd5066fb", "9e1f0e0bddb58d15d0925eeaede9b84c",
+      "03313cdaa593a1a7b4869010dcc7b241", "88a50d2b4107ee5b5074b2520183f8ac",
+      "ac50ea9f7306da95a5092709442989cf", "739b17591437edffd36799237b962658",
+      "b8a7eb7dd9c216e240517edfc6489397", "75b755f199dbf4a0e5ebbb86c2bd871d",
+      "31b0017ba1110e3d70b020901bc15564", "0a1aa8f5ecfd11ddba080af0051c576a",
+      "536181ee90de883cc383787aec089221", "29f82b0f3e4113944bd28aacd9b8489a",
+      "ee3e76371240d1f1ff811cea6a7d4f63", "17a20dbbf09feae557d40aa5818fbe76",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "6baf153feff04cc5b7e87c0bb60a905d",
+      "871ed5a69ca31e6444faa720895949bf", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "68e2f90eaa0ab5da7e6f5776993f7eea", "f1f8282fb33c30eb68c0c315b7a4bc01",
+      "9412064b0eebf8123f23d74147d04dff", "cc08936effe309ab9a4fa1bf7e28e24e",
+      "36cbef36fa21b98df03536c918bf752a", "9d0da6321cf5311ea0bdd41271763030",
+      "55a10165ee8a660d7dddacf7de558cdd", "ac7fc9f9ea7213743fae5a023faaaf08",
+      "077e1b7b355c7ab3ca40230ee8efd8ea", "7a3e8de2a1caae206cf3e51a86dfd15a",
+      "1ddf9020f18fa7883355cf8c0881186a", "2377dd167ef2707978bed6f10ffd4e76",
+      "f918e0e4422967c6a7e47298135c7ae9", "b2264e129636368b5496760b39e64b7a",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "4cfad2c437084a93ea76913e21c2dd89",
+      "d372f0c17bce98855d6d59fbee814c3d", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "d99ffd2579eb781c30bc0df7b76ad61e", "4e139e57cbb049a0f4ef816adc48d026",
+      "be53b2507048e7ff50226d15c0b28865", "b73f3c1a10405de89d1f9e812ff73b5a",
+      "c7d51b1f2df49ab83962257e8a5934e5", "159e443d79cc59b11ca4a80aa7aa09be",
+      "6ef14b14882e1465b0482b0e0b16d8ce", "22a8d287b425c870f40c64a50f91ce54",
+      "f1d96db5a2e0a2160df38bd96d28d19b", "637d1e5221422dfe9a6dbcfd7f62ebdd",
+      "f275af4f1f350ffaaf650310cb5dddec", "f81c4d6b001a14584528880fa6988a87",
+      "a5a2f9c2e7759d8a3dec1bc4b56be587", "2317c57ab69a36eb3bf278cf8a8795a3",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "1a0bdfc96a3b9fd904e658f238ab1076",
+      "56d16e54afe205e97527902770e71c71", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "1f7b5b8282ff3cf4d8e8c52d80ef5b4d", "79e9e260a2028c5fe320005c272064b9",
+      "2418ebcdf85551b9ae6e3725f04aae6d", "98bdf907ebacacb734c9eef1ee727c6e",
+      "4dd5672d53c8f359e8f80badaa843dfc", "a1bef519bbf07138e2eec5a91694de46",
+      "df1cb51fe1a937cd7834e973dc5cb814", "317fe65abf81ef3ea07976ef8667baeb",
+      "2da29da97806ae0ee300c5e69c35a4aa", "555475f5d1685638169ab904447e4f13",
+      "b3e3a6234e8045e6182cf90a09f767b2", "849dfeca59074525dea59681a7f88ab4",
+      "39a68af80be11e1682b6f3c4ede33530", "b22d765af176d87e7d3048b4b89b86ad",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "b8a710baa6a9fc784909671d450ecd99",
+      "f9e6a56382d8d12da676d6631bb6ef75", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "3bf8e11e18527b16f0d7c0361d74a52d", "b9ff54c6f1e3b41fc7fc0f3fa0e75cf2",
+      "06ef1504f31af5f173d3317866ca57cb", "635e8ee11cf04d73598549234ad732a0",
+      "fab693410d59ee88aa2895527efc31ac", "3041eb26c23a63a587fbec623919e2d2",
+      "c61d99d5daf575664fb7ad64976f4b03", "822f6c4eb5db760468d822b21f48d94d",
+      "3f6fcb9fae3666e085b9e29002a802fc", "d9b9fecd195736a6049c528d4cb886b5",
+      "fed17fc391e6c3db4aa14ea1d6596c87", "d0d3482d981989e117cbb32fc4550267",
+      "39561688bf6680054edbfae6035316ce", "087c5992ca6f829e1ba4ba5332d67947",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+
+const char* GetConvolveScaleDigest8bpp(int id) {
+  // Entries containing 'XXXXX...' are skipped. See the test for details.
+  static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 2] = {
+      "0291a23f2ac4c40b5d8e957e63769904", "1d48447857472d6455af10d5526f6827",
+      "409b2278d6d372248f1891ca0dd12760", "9e416606a3f82fe5bb3f7182e4f42c2d",
+      "e126563f859ddd5c5ffde6f641168fad", "9bad4f1b7e1865f814b6fd5620816ebd",
+      "50e5e5a57185477cb2af83490c33b47c", "3d2fb301c61d7fbd0e21ac263f7ac552",
+      "5920032c6432c80c6e5e61b684018d13", "07ada64d24339488cdce492e6e0c6b0d",
+      "aaf1589aff6d062a87c627ab9ba20e3e", "91adf91bb24d2c4ea3f882bdf7396e33",
+      "1d17a932a68bb1f199f709e7725fe44b", "07716c63afda034cb386511ea25a63b5",
+      "cca17ef3324c41d189e674a059ef1255", "37d17e70619823a606c0b5f74bf2e33b",
+      "ba8ed5474c187c8e8d7f82a6a29ee860", "27663f037973ebe82ec10252a4d91299",
+      "24c27e187e8d5a2bbfa0fef9046d3eb0", "9854fdc91a48e3bd4639edcc940e5c09",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "a71907c60a9f1f81972a2859ae54a805",
+      "817bc3bf0c77abc4186eac39f2320184", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "4e7182a8b226982e2678abcf5f83325d", "50cef7c6e57544135f102226bb95bed9",
+      "225e054dbcfff05b1c8b0792c731449e", "16eb63f03839159f3af0e08be857170f",
+      "c8e5d111a2e3f4487330a8bd893cb894", "4fd99eaf9c160442aab35b9bdc5d275b",
+      "8b0f61bfb30747d4c9215618ac42557c", "1df78022da202cefb9a8100b114152d9",
+      "378466e1eda63dbc03565b78af8e723f", "28ea721411fbf5fc805035be9a384140",
+      "4fed5d4163a3bfcc6726a42f20410b0a", "55abfca0c820771bd926e4b94f66a499",
+      "6c8b8ef0a78859c768e629e1decc0019", "d0ead286b5ba3841d24dd114efbfef0a",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetConvolveDigest10bpp(int id) {
+  // Entries containing 'XXXXX...' are skipped. See the test for details.
+  static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 16] = {
+      "b1b6903d60501c7bc11e5285beb26a52", "a7855ed75772d7fa815978a202bbcd9f",
+      "bde291a4e8087c085fe8b3632f4d7351", "238980eebc9e63ae3eea2771c7a70f12",
+      "0eac13431bd7d8a573318408a72246d5", "d05a237ed7a9ca877256b71555b1b8e4",
+      "73438155feb62595e3e406921102d748", "5871e0e88a776840d619670fbf107858",
+      "1c6376ce55c9ee9e35d432edb1ffb3b7", "d675e0195c9feca956e637f3f1959f40",
+      "b5681673903ade13d69e295f82fdd009", "3c43020105ae93a301404b4cd6238654",
+      "dd2c5880a94ed3758bfea0b0e8c78286", "4ebb1a7b25a39d8b9868ec8a1243103f",
+      "d34ec07845cd8523651e5f5112984a14", "2ce55308d873f4cd244f16da2b06e06e",
+      "a4bb5d5ff4b25f391265b5231049a09a", "c9106e0c820b03bcdde3aa94efc11a3e",
+      "7ec2eae9e118506da8b33440b399511a", "78de867c8ee947ed6d29055747f26949",
+      "a693b4bd0334a3b98d45e67d3985bb63", "156de3172d9acf3c7f251cd7a18ad461",
+      "e545b8a3ff958f8363c7968cbae96732", "7842b2047356c1417d9d88219707f1a1",
+      "1a487c658d684314d91bb6d961a94672", "94b3e5bcd6b849b66a4571ec3d23f9be",
+      "0635a296be01b7e641de98ee27c33cd2", "82dc120bf8c2043bc5eee81007309ebf",
+      "58c826cad3c14cdf26a649265758c58b", "f166254037c0dfb140f54cd7b08bddfe",
+      "74ab206f14ac5f62653cd3dd71a7916d", "5621caef7cc1d6522903290ccc5c2cb8",
+      "78ec6cf42cce4b1feb65e076c78ca241", "42188e2dbb4e02cd353552ea147ad03f",
+      "f9813870fc27941a7c00a0443d7c2fe7", "20b14a6b5af7aa356963bcaaf23d230d",
+      "9c9c41435697f75fa118b6d6464ee7cb", "38816245ed832ba313fefafcbed1e5c8",
+      "5d34137cc8ddba75347b0fa1d0a91791", "465dcb046a0449b9dfb3e0b297aa3863",
+      "3e787534dff83c22b3033750e448865a", "4c91f676a054d582bcae1ca9adb87a31",
+      "eab5894046a99ad0a1a12c91b0f37bd7", "765b4cfbfc1a4988878c412d53bcb597",
+      "bc63b29ec78c1efec5543885a45bb822", "91d6bdbc62d4bb80c9b371d9704e3c9e",
+      "cecd57396a0033456408f3f3554c6912", "5b37f94ef136c1eb9a6181c19491459c",
+      "716ba3a25b454e44b46caa42622c128c", "9076f58c4ab20f2f06d701a6b53b1c4f",
+      "d3212ab3922f147c3cf126c3b1aa17f6", "b55fea77f0e14a8bf8b6562b766fe91f",
+      "59b578268ff26a1e21c5b4273f73f852", "16761e7c8ba2645718153bed83ae78f6",
+      "a9e9805769fe1baf5c7933793ccca0d8", "553a2c24939dff18ec5833c77f556cfb",
+      "5c1ec75a160c444fa90abf106fa1140e", "2266840f11ac4c066d941ec473b1a54f",
+      "9e194755b2a37b615a517d5f8746dfbb", "bbf86f8174334f0b8d869fd8d58bf92d",
+      "fd1da8d197cb385f7917cd296d67afb9", "a984202c527b757337c605443f376915",
+      "c347f4a58fd784c5e88c1a23e4ff15d2", "29cbaadbff9adf4a3d49bd9900a9dd0b",
+      "c5997b802a6ba1cf5ba1057ddc5baa7e", "4f750f6375524311d260306deb233861",
+      "59f33727e5beeb783a057770bec7b4cd", "0654d72f22306b28d9ae42515845240c",
+      "6c9d7d9e6ef81d76e775a85c53abe209", "a35f435ccc67717a49251a07e62ae204",
+      "c5325015cb0b7c42839ac4aa21803fa0", "f81f31f1585c0f70438c09e829416f20",
+      "ab10b22fb8dd8199040745565b28595d", "0d928d6111f86c60ccefc6c6604d5659",
+      "4ed1a6200912995d4f571bdb7822aa83", "92e31a45513582f386dc9c22a57bbbbd",
+      "6dbf310a9c8d85f76306d6a35545f8af", "80fce29dc82d5857c1ed5ef2aea16835",
+      "14f2c5b9d2cd621c178a39f1ec0c38eb", "da54cfb4530841bda29966cfa05f4879",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "7e3fa9c03bc3dfbdeb67f24c5d9a49cd",
+      "f3454ca93cbb0c8c09b0695d90a0df3d", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "f3454ca93cbb0c8c09b0695d90a0df3d", "1a77d2af4d2b6cf8737cfbcacacdc4e4",
+      "89bec831efea2f88129dedcad06bb3fa", "89bec831efea2f88129dedcad06bb3fa",
+      "dead0fe4030085c22e92d16bb110de9d", "306a2f5dfd675df4ed9af44fd5cac8c0",
+      "306a2f5dfd675df4ed9af44fd5cac8c0", "9d01c946a12f5ef9d9cebd9816e06014",
+      "768f63912e43148c13688d7f23281531", "768f63912e43148c13688d7f23281531",
+      "2e7927158e7b8e40e7269fc909fb584b", "123028e18c2bfb334e34adb5a4f67de4",
+      "123028e18c2bfb334e34adb5a4f67de4", "2c979c2bddef79a760e72a802f83cc76",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "da1a6ff2be03ec8acde4cb1cd519a6f0",
+      "a4ca37cb869a0dbd1c4a2dcc449a8f31", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "1b5d1d4c7be8d5ec00a42a49eecf918f", "98b77e88b0784baaea64c98c8707fe46",
+      "8148788044522edc3c497e1017efe2ce", "acf60abeda98bbea161139b915317423",
+      "262c96b1f2c4f85c86c0e9c77fedff1e", "f35a3d13516440f9168076d9b07c9e98",
+      "13782526fc2726100cb3cf375b3150ed", "13c07441b47b0c1ed80f015ac302d220",
+      "02880fde51ac991ad18d8986f4e5145c", "aa25073115bad49432953254e7dce0bc",
+      "69e3361b7199e10e75685b90fb0df623", "2f8ab35f6e7030e82ca922a68b29af4a",
+      "452f91b01833c57db4e909575a029ff6", "1fabf0655bedb671e4d7287fec8119ba",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "d54206c34785cc3d8a06c2ceac46378c",
+      "85a11892ed884e3e74968435f6b16e64", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "16434230d24b9522ae2680e8c37e1b95", "963dea92f3efbb99137d1de9c56728d3",
+      "b72fb6a9a073c2fe65013af1842dc9b0", "86fa0c299737eb499cbcdce94abe2d33",
+      "6b80af04470b83673d98f46925e678a5", "65baca6167fe5249f7a839ce5b2fd591",
+      "e47ded6c0eec1d5baadd02aff172f2b1", "c0950e609f278efb7050d319a9756bb3",
+      "9051290279237f9fb1389989b142d2dd", "34cdc1be291c95981c98812c5c343a15",
+      "5b64a6911cb7c3d60bb8f961ed9782a2", "7133de9d03a4b07716a12226b5e493e8",
+      "3594eff52d5ed875bd9655ddbf106fae", "90d7e13aa2f9a064493ff2b3b5b12109",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "b1f26ee13df2e14a757416ba8a682278",
+      "996b6c166f9ed25bd07ea6acdf7597ff", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "34895d4c69a6c3303693e6f431bcd5d8", "c9497b00cb1bc3363dd126ffdddadc8e",
+      "1e461869bb2ee9b6069c5e52cf817291", "8d7f1d7ea6a0dcc922ad5d2e77bc74dd",
+      "138855d9bf0ccd0c62ac14c7bff4fd37", "64035142864914d05a48ef8e013631d0",
+      "205904fa3c644433b46e01c11dd2fe40", "291425aaf8206b20e88db8ebf3cf7e7f",
+      "cb6238b8eb6b72980958e6fcceb2f2eb", "626321a6dfac542d0fc70321fac13ff3",
+      "1c6fda7501e0f8bdad972f7857cd9354", "4fd485dadcb570e5a0a5addaf9ba84da",
+      "d3f140aea9e8eabf4e1e5190e0148288", "e4938219593bbed5ae638a93f2f4a580",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "238980eebc9e63ae3eea2771c7a70f12",
+      "0eac13431bd7d8a573318408a72246d5", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "73438155feb62595e3e406921102d748", "5871e0e88a776840d619670fbf107858",
+      "1c6376ce55c9ee9e35d432edb1ffb3b7", "d675e0195c9feca956e637f3f1959f40",
+      "b5681673903ade13d69e295f82fdd009", "3c43020105ae93a301404b4cd6238654",
+      "dd2c5880a94ed3758bfea0b0e8c78286", "4ebb1a7b25a39d8b9868ec8a1243103f",
+      "d34ec07845cd8523651e5f5112984a14", "2ce55308d873f4cd244f16da2b06e06e",
+      "a4bb5d5ff4b25f391265b5231049a09a", "c9106e0c820b03bcdde3aa94efc11a3e",
+      "7ec2eae9e118506da8b33440b399511a", "78de867c8ee947ed6d29055747f26949",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "e552466a4e7ff187251b8914b084d404",
+      "981b7c44b6f7b7ac2acf0cc4096e6bf4", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "a4c75372af36162831cb872e24e1088c", "497271227a70a72f9ad25b415d41563f",
+      "c48bd7e11ec44ba7b2bc8b6a04592439", "0960a9af91250e9faa1eaac32227bf6f",
+      "746c2e0f96ae2246d534d67102be068c", "d6f6db079da9b8909a153c07cc9d0e63",
+      "7c8928a0d769f4264d195f39cb68a772", "db645c96fc8be04015e0eb538afec9ae",
+      "946af3a8f5362def5f4e27cb0fd4e754", "7ad78dfe7bbedf696dd58d9ad01bcfba",
+      "f0fd9c09d454e4ce918faa97e9ac10be", "af6ae5c0eb28417bd251184baf2eaba7",
+      "866f8df540dd3b58ab1339314d139cbd", "72803589b453a29501540aeddc23e6f4",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "aba5d5ef5e96fe418e65d20e506ea834",
+      "d70bf16e2a31e90b7b3cdeaef1494cf9", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "6df80bb7f264f4f285d09a4d61533fae", "c8831118d1004a7cca015a4fca140018",
+      "b7f82c140369067c105c7967c75b6f9e", "130f47aae365aabfec4360fa5b5ff554",
+      "92483ed631de21b685ffe6ccadbbec8f", "cbb6ab31547df6b91cfb48630fdffb48",
+      "1eea5e8a24d6aa11778eb3e5e5e9c9f2", "9e193b6b28ce798c44c744efde19eee9",
+      "885c384d90aaa34acd8303958033c252", "8110ed10e7234851dff3c7e4a51108a2",
+      "6fb9383302eb7e7a13387464d2634e03", "864d51fcc737bc73a3f588b67515039a",
+      "2ecb7890f00234bcb28c1d969f489012", "c4793d431dbf2d88826bb440bf027512",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "972aeba65e8a6d20dd0f95279be2aa75",
+      "34165457282e2af2e9b3f5840e4dec5d", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "b8c5582b9bbb789c45471f93be83b41f", "257bf5467db570974d7cf2356bacf116",
+      "5255dded79f56b0078543b5a1814a668", "ef745100f5f34c8ff841b2b0b57eb33f",
+      "edae8ed67286ca6a31573a541b3deb6f", "01adcd8bf15fbf70df47fbf3a953aa14",
+      "ba539808a8501609ce052a1562a62b25", "ac8e6391200cec2abdebb00744a2ba82",
+      "54b17120f7d71ddb4d70590ecd231cc1", "f6e36446a97611a4db4425df926974b2",
+      "a82f4080699300b659bbe1b5c4463147", "ecedb178f7cad3dc1b921eca67f9efb6",
+      "0609ca0ff3ca90069e8b48829b4b0891", "839e86c681e97359f7819c766000dd1c",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+
+const char* GetConvolveScaleDigest10bpp(int id) {
+  // Entries containing 'XXXXX...' are skipped. See the test for details.
+  static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 2] = {
+      "27e21eb31687f9fbd0a66865fa8d7c8a", "9bff726c8e1d0998451a3b9cf2b3d8c8",
+      "661d74cfef36f12ed8d9b4c3ccb7fe0d", "5fc365fd1fcc9599dd97a885ba0c2eec",
+      "acdba2c82a6268e3c0ae8fc32be1b41f", "a5db60bbeaf56ab030ed21c42d553cf3",
+      "1228bb633f9fd63fdb998b775ca79e98", "07812c97f9f43a2a8ae07329dc488699",
+      "903525fb782119c4dfaf61b98a310c9f", "f38b51cef38b929e317861ccbc73ecd8",
+      "b78b05138e1d5fbf089144c42ce03058", "f2e227664cbf2d821b242a34fcbc9835",
+      "cb992dac70591e7d3663588ae13b9adc", "f2292d33657d939fa85ea5bacdfe39a3",
+      "7049dc742d6d8ad6f5d4309968ff281c", "e4beebde1ac335a4d92e4af94653a2ce",
+      "cc77875f98f54b9b26b5f7d9fcbc828d", "fb623f7b9e1ffcf2ae361599728a5589",
+      "c33847e47a7eda214734084640818df9", "ab3e1aec3d720c0c89c46a8d5b161b44",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "efe4de861dcf0f7458b6208cae7e3584",
+      "814751c55fa84f0fed94ff15fc30fc24", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "31a63fe47297102937acbe7a328588b7", "b804a0a24633243f7da48d7a5f51c0bf",
+      "cb492672b005fc378cccc8c03003cd4a", "1d18732bcf2ea487e84579489cc59a22",
+      "457c4b3ec38a8d6c210584ade1a9fae2", "a3afdd468e6a5238a3dbd2cc21c11c9e",
+      "6ff8a16f21d6e8a9741dacf0734ae563", "3ffa29ef7e54e51f6849c9a3d3c79d03",
+      "af89899b083cf269ac1bd988aeb15b15", "3365d8411c11081fb228436238b9a671",
+      "3ba56d30f5f81d7098f356635a58b9af", "b3013776900c6520bd30f868e8c963b6",
+      "81febaa7342692483040f500ba2e5e2b", "4a51ff1d9a4a68687d590b41aa7835a3",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetConvolveDigest12bpp(int id) {
+  // Entries containing 'XXXXX...' are skipped. See the test for details.
+  static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 16] = {
+      "e25031afae184cc4d186cde7e3d51e33", "6fb55dec2506dae6c229469cdf2e7d83",
+      "9df34d27f5bd040d1ed1455b151cd1ff", "7f6829458f00edb88f78851dd1a08739",
+      "a8bbe9b6b9eaf6f681d91c981b994949", "21f74980b36cb246426f4bc3fe7c08c3",
+      "403c2ccced3b5b141a4c7559c0cd841b", "1c3c4c6cd8a3e79cd95d6038531b47e5",
+      "f18d6950d36619086ac0055bab528cb1", "37d9c5babddf24fe8cb061297297b769",
+      "c111000d4021381f3d18ea0e24a1b5f5", "4e1e4f0a592ff028e35f8187627d6e68",
+      "0ca9ad4614d32da05746f8712a46d861", "8a122ab194e2fdb7089b29be50af8c86",
+      "3c21326e22a80982d1b0ffc09be4beae", "f6c8d1fe2c1fb19604c49c6a49bd26a8",
+      "d3eda9d7aa80e4ea1f18bf565b143e57", "fe21bd1cb8e90466dc727f2223ea7aed",
+      "01efe3df83c715325aaddd4d4ce130ad", "ecaa751360121d3746b72932415fb998",
+      "291e67095399651dc5c8a033390f255f", "66b26828e434faf37ddc57d3e0abb6db",
+      "e9cd69e9ba70864e3d0b175ac0a177d6", "64e4db895a843cb05384f5997b1ba978",
+      "f305161c82de999d2c93eac65f609cfe", "4762b2bd27983ad916ec0a930c0eca6b",
+      "1631495ffae43a927267ebd476015ef1", "b0f22de7b10057e07af71f9bce4615ce",
+      "6fa29dc4be1a46d246a41d66a3d35cb4", "734601c2185bdf30ba9ded8b07003a05",
+      "524e4553d92c69e7e4ed934f7b806c6b", "3709c8950bc5fcc4a2b3ec68fc78bf7e",
+      "69c274d9f8e0fd6790495e9695251f1f", "ee30cc1232c27494ef53edd383568f25",
+      "e525dbeb0b4341952a92270dcfc51730", "b3685c9e783d3402497bbd49d28c7dd7",
+      "d1c9f02dc818e6b974794dfb7749aac8", "bdb9e4961f9aa8c25568d3394e968518",
+      "f5f74555adcad85f3ebd3cb85dc7b770", "737e2a0be806dbd701014f2078be7898",
+      "20a18294e3a9422193aa0a219fd80ede", "7106648ecb9ae24a54d1dbabf2a9e318",
+      "20f39cbd6b5ed87a6ae4f818932325c0", "a99666e3157e32a07c87b01e52091a76",
+      "123e4d533d478c3089c975323c85396b", "d2a8021f7683a0cdf2658418fa90a6fc",
+      "1437e192a3349db8702d5b90eb88dbc1", "fe097fc4aeed7cda0b0f405124efb19d",
+      "1988227c51fa589db1307fd890bb5972", "537e25a6c30b240dc1e3bddd1c3a0a03",
+      "aebe5cffaae448db5a08987a3375a428", "7127ae9bdc63df4459590dc02ca95403",
+      "7ad281903a210f2b1f39f7c40c8df272", "d4b97ba21f7b400ba9f9cd8bb1a576df",
+      "0884a824203aaf72c78f73fdaf2b23a2", "63d60388605c92daee55d517de622a9e",
+      "171ec49a779de1efa69510eefbd09bba", "541cf051579c5a10b9debd3bfdcb7f32",
+      "91c14451ad93ed88e96b5d639ce408de", "3b0313ec0e043d19744bf88c90e875a1",
+      "6adcb3cee91fe3a83b36deb11c5ad6dd", "c6d4bfad24616a88222681992a99d782",
+      "515dc0f2a41730d5c434e4f3c81b02c3", "1c69abdee3b9608a6094034badc2bec0",
+      "1785a0f321d7dd90aa8846961737a767", "dd12c5b8c341f2423d0d5db4f285d199",
+      "5741fb69aae1ca8a0fbe4f1478df88ef", "a4390ceb4e4e9f5cf6a47a9b11a97015",
+      "6778eb25df902092b440c3402e7f0f80", "5ad9d6b36f8898bb55e901c1c0c523da",
+      "73969b6c03bb5a7345a8b968b542668e", "f48192947e66d70f116193a4186d0186",
+      "53f60d0e89d7d994ec6d6131fb7e75ae", "c75f6f8813839ae3cf192baa29039265",
+      "9ff0852ebbad56663250f86ac3a3bf9b", "668938580a770ea7ace8bbf7d349e89f",
+      "5b06bb0a15ac465a250d9b209f05289f", "a2128f5c8692fed7e7c1c7af22ce9f72",
+      "f80f1d7a58869ec794258c0f7df14620", "ed1e03a35924c92ed2fc9808dc3f06f3",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "35ef89c35d2e8e46feb856c554c21c9f",
+      "b98ce33a1bf4fab840b7ef261b30dbc4", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "b98ce33a1bf4fab840b7ef261b30dbc4", "42263fb359c4fdf1c7cdb4980b3e97f2",
+      "1e7071b7db3144188bdcf5d199fe5355", "1e7071b7db3144188bdcf5d199fe5355",
+      "30d367304a87bd25f0ad2ff8e4b5eb41", "4abe6dbb3198219015838dbedf07297a",
+      "4abe6dbb3198219015838dbedf07297a", "acec349a95b5bba98bb830372fa15e73",
+      "a73ad8661256ce2fdf5110425eb260b2", "a73ad8661256ce2fdf5110425eb260b2",
+      "8ff2f049d3f972867f14775188fe589b", "87f5f9a07aea75c325e6d7ff6c96c7c2",
+      "87f5f9a07aea75c325e6d7ff6c96c7c2", "325fcde7d415d7aa4929a3ea013fb9cc",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "05aa29814d5ce35389dbcf20368850da",
+      "fbb89f907a040e70953e3364dbe1feda", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "44ac511baf45032078cc0b45e41dba79", "efb98974adc58d88e122293436bb9184",
+      "7eee18c1a16bcb4e7ef7b27f68ba884f", "b0904c9b118dd9a1f9f034c0ff82d1c1",
+      "54436deb5183dd9669dd4f5feadb3850", "4db1c310b7d9a8bd3e2b5d20fa820e3b",
+      "c40abc6b2d67527f48a287cd7e157428", "48ec3fcf509805f484c8e0948c3469be",
+      "cb7d4a76fa7de52ed2fe889785327b38", "f57983346815fa41e969c195c1c03774",
+      "7dba59b0de2c877666ded6bdaefdcc30", "4837f8ba2f67f17f28a38c5e2a434c73",
+      "09e06fe9dc7ef7818f2a96895235afd4", "002976970ec62b360f956b9c091782d4",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "78673b1084367e97b8dd83990adc5219",
+      "06b5d4a30b9efb6c1d95ef7957f49e76", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "ce460146922cd53970b510d460aa4062", "6fd051938b8efcec9ece042f1edc177a",
+      "f5ff0dcfe3c1a56e3856549d8ded416b", "b69bc2cfc17c6b4313264db96831f0d1",
+      "38a5e65bd71934becfb376eb3b9bc513", "32c1163aa4ca6b6c69d950aba7b06d48",
+      "0c22a6c014c6347983de4ca863f3b53f", "a80c5ee9eb2dfb9a0d56e92eb3f85d91",
+      "a9719722a150a81175427bc161b95d7a", "8237befd456131a488cc5b8b63f4aca5",
+      "51616abcd0beea53a78ffce106b974fc", "6c47b22270f01d27b404da07e1be1202",
+      "356268298d3887edaabd0169a912c94e", "d2b00216e106cb8c5450e2eff1f8481a",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "c2de3a582c79aee811076211c497d2bc",
+      "d1b6d9c73da41def26dd4f85fbd1bde8", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "d8374eb7825081b89f74b05c66bccd63", "d5f7d68c10b5eaf0fba6f93ee26266e6",
+      "94d19cb65f29db65e6656b588f431ade", "5126e95f0249024a6f6d426714bd5b20",
+      "d7d3654b9c2dabe13239875984770acd", "6491afd5d651aab80aa179b579b74341",
+      "037a5de0de89983808f8e8f6dc39110f", "5980073b7685c5c9b2ec027e06be2cbc",
+      "0abb9d035aca426b62ca0f3fab063bab", "fe002a902bb4ec24dfe3ea0fe381a472",
+      "1ac15726df1aa2cd8855162a91893379", "0758c3ac16467605d73c725a697c3dc1",
+      "97d894d85f6ccfa4ff81e0e8fdf03da1", "c3c7b362f063a18244ea542a42d2873c",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "7f6829458f00edb88f78851dd1a08739",
+      "a8bbe9b6b9eaf6f681d91c981b994949", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "403c2ccced3b5b141a4c7559c0cd841b", "1c3c4c6cd8a3e79cd95d6038531b47e5",
+      "f18d6950d36619086ac0055bab528cb1", "37d9c5babddf24fe8cb061297297b769",
+      "c111000d4021381f3d18ea0e24a1b5f5", "4e1e4f0a592ff028e35f8187627d6e68",
+      "0ca9ad4614d32da05746f8712a46d861", "8a122ab194e2fdb7089b29be50af8c86",
+      "3c21326e22a80982d1b0ffc09be4beae", "f6c8d1fe2c1fb19604c49c6a49bd26a8",
+      "d3eda9d7aa80e4ea1f18bf565b143e57", "fe21bd1cb8e90466dc727f2223ea7aed",
+      "01efe3df83c715325aaddd4d4ce130ad", "ecaa751360121d3746b72932415fb998",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "67b2ea94cc4d0b32db3ae3c29eee4d46",
+      "bcfec99ad75988fa1efc1733204f17f2", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "79c222c5796e50119f1921e7bc534a25", "ae3f7c189458f044e9c52399d37a55e2",
+      "fd6dde45bd2937331428de9ef4f8e869", "b384d065423f3d271b85781d76a73218",
+      "466ea0680c06f59e8b3bb293608731fb", "360541ba94f42d115fe687a97a457ffb",
+      "e5a0794d37af40c40a4d2c6d3f7d2aa2", "4eed285651a75614bd60adebbe2e185c",
+      "bbdbf93942282d7b9c4f07591a1764a6", "1288a9ec3e6f79213b6745e6e7568c44",
+      "4ff1310bfd656d69ed5c108a91a9b01a", "3380806b5f67eb3ebce42f8e7c05b256",
+      "09c4bdf0f30aca6812fb55a5ac06b1bd", "722c86ba6bf21f40742ee33b4edc17c4",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "f5303c96d1630f9840eaaba058cd818b",
+      "c20cd45782b2f52c05e4189912047570", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "d6360f96fe15a3ee1e903b0a53dcaaeb", "4b18995cdf2f5d18410d3a732c5932b1",
+      "6f62bf7395c3dfccc1565ba8424f20e8", "c9987ed30491cd28bbc711dd57228247",
+      "8e277ec837cbecf529ae2eb0578fddc1", "c0c132386f23c5f0fba055a12fb79547",
+      "6b5617ab78dd0916690dfa358298b7b3", "394abedca37f60d1a5148a4c975305ed",
+      "bb88881e0e4cf2d88c2d2b38b5833f20", "bef10806be8d58ea8e97870a813b075e",
+      "b4b017d1f792bea69d3b773db7c80c7c", "0660bc63041213a8a4d74724a3bc4291",
+      "5050c8c5388a561691fd414b00c041df", "9ed40c68de6a8008a902d7224f8b620f",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "ec10ce4a674424478a401847f744251d",
+      "bdd897eafc8ef2651a7bba5e523a6ac2", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "2745de4a6b29abb85ee513e22ad362c3", "8aaad384b7cd349b4b968e657ec15523",
+      "fb6c0723432bcd2246d51a90f5fb5826", "f8104ed5921ebd48c6eed16150ffe028",
+      "85c2e236b3e32bf731601237cf0594cd", "8bd6eefff9640766cdf64ab082cb1485",
+      "78b5cd9dde6c6a5900f3040c57172091", "aaa980506bd7bb1d75924a8025698d1a",
+      "90050a411d501f7166f6741832b0c342", "d6ec88b2c38e32511f3359e1d5f9d85b",
+      "96506b8b39274c8fe687ea39761997f1", "3322ea83995c2762fb60db993b401658",
+      "151b6e4ce60392639982fca5a73ac3d3", "d52a1038e135bef233674a843f8c7cb6",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+
+const char* GetConvolveScaleDigest12bpp(int id) {
+  // Entries containing 'XXXXX...' are skipped. See the test for details.
+  static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 2] = {
+      "aea59b7a638f27acad2b90fd2b8c9fee", "be87ba981a0af25611a7d5f0970be9b3",
+      "7c81f1486cd607376d776bf2c6e81dec", "f683ba2a9b353bea35f26c1ed730f3c5",
+      "11e2d70daff1726093cb4fcae33ce0d6", "567575eac0dea2f379019b2d4bafe444",
+      "216479ed580d6e0d7c1d523015394814", "dcabbe5f5709a4b6634d77cc514e863a",
+      "4e888207fe917faeea2b44383ac16caf", "d617c5608fae3b01c507c7e88040fee3",
+      "eeac5d9b3dc005e76f13dfc7483eae48", "8ff0a82811f77303c4516bb8c761336f",
+      "95a7c315aaa208097b6ab006f1d07654", "da63527ee80e6772435cff8321a29a95",
+      "404457f72e7113d1f3797a39319fd3fe", "43cbccfe2663ec11c157319acfe629a5",
+      "1dc5b8dee4542f3d7fcf6b0fa325dfde", "16d4506674f2fcedfcd1e006eb097141",
+      "4fcf329ddb405cd6bbb0a6fb87e29eb3", "de77a781957653ea1750f79995605cdc",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "436f6fdc008d94a94bc6f516f98f402f",
+      "b436bd9036f08ba7e50cfc536911dbbd", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "720a01018856bd83f4d89a9024b14728", "b7e01a3f161007712ce342f59b2c51f2",
+      "922420ebe5dec4f19c259ebdf8a3259a", "979aaba579556207a7bbcc939123c1b2",
+      "89a30898cbaa4d64f9072173e8365864", "0586ff961f2e4228f4e38299fb25ae07",
+      "adb27a03f8b1b50fe2a52b5ca8d4fc28", "4f91cd92aab2e09f4b123251a8d2f219",
+      "620fba0fff163d96a1cd663d1af4a4c5", "bf7a0fa65b1a90ba34c834558fa2c86e",
+      "c21f7d7d16d047a27b871a7bf8476e2d", "a94b17c81f3ce2b47081bd8dd762a2e5",
+      "9078d20f59bc24862af3856acb8c0357", "ee510ce6b3d22de9e4bd7920a26fd69a",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+struct ConvolveTypeParam {
+  ConvolveTypeParam(bool is_intra_block_copy, bool is_compound,
+                    bool has_vertical_filter, bool has_horizontal_filter)
+      : is_intra_block_copy(is_intra_block_copy),
+        is_compound(is_compound),
+        has_vertical_filter(has_vertical_filter),
+        has_horizontal_filter(has_horizontal_filter) {}
+  bool is_intra_block_copy;
+  bool is_compound;
+  bool has_vertical_filter;
+  bool has_horizontal_filter;
+};
+
+std::ostream& operator<<(std::ostream& os, const ConvolveTestParam& param) {
+  return os << "BlockSize" << param.width << "x" << param.height;
+}
+
+std::ostream& operator<<(std::ostream& os, const ConvolveTypeParam& param) {
+  return os << "is_intra_block_copy: " << param.is_intra_block_copy
+            << ", is_compound: " << param.is_compound
+            << ", has_(vertical/horizontal)_filter: "
+            << param.has_vertical_filter << "/" << param.has_horizontal_filter;
+}
+
+//------------------------------------------------------------------------------
+template <int bitdepth, typename Pixel>
+class ConvolveTest : public testing::TestWithParam<
+                         std::tuple<ConvolveTypeParam, ConvolveTestParam>> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  ConvolveTest() = default;
+  ~ConvolveTest() override = default;
+
+  void SetUp() override {
+    ConvolveInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    GetConvolveFunc(dsp, &base_convolve_func_);
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_convolve_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      ConvolveInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "AVX2/")) {
+      if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+      ConvolveInit_AVX2();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      ConvolveInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      ConvolveInit10bpp_NEON();
+#endif
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    GetConvolveFunc(dsp, &cur_convolve_func_);
+
+    // Skip functions that have not been specialized for this particular
+    // architecture.
+    if (cur_convolve_func_ == base_convolve_func_) {
+      cur_convolve_func_ = nullptr;
+    }
+  }
+
+ protected:
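+  // Index into the digest tables:
+  //   id = block_size + kNumBlockSizes * (has_horizontal_filter +
+  //        2 * has_vertical_filter + 4 * is_compound +
+  //        8 * is_intra_block_copy).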
+  int GetDigestId() const {
+    int id = param_.block_size;
+    id += param_.kNumBlockSizes *
+          static_cast<int>(type_param_.has_horizontal_filter);
+    id += 2 * param_.kNumBlockSizes *
+          static_cast<int>(type_param_.has_vertical_filter);
+    id += 4 * param_.kNumBlockSizes * static_cast<int>(type_param_.is_compound);
+    id += 8 * param_.kNumBlockSizes *
+          static_cast<int>(type_param_.is_intra_block_copy);
+    return id;
+  }
+
+  void GetConvolveFunc(const Dsp* dsp, ConvolveFunc* func);
+  void SetInputData(bool use_fixed_values, int value);
+  void Check(bool use_fixed_values, const Pixel* src, const Pixel* dest,
+             libvpx_test::MD5* md5_digest);
+  void Check16Bit(bool use_fixed_values, const uint16_t* src,
+                  const uint16_t* dest, libvpx_test::MD5* md5_digest);
+  // |num_runs| covers the categories of filters (6) and the number of filters
+  // under each category (16).
+  void Test(bool use_fixed_values, int value,
+            int num_runs = kMinimumViableRuns);
+
+  const ConvolveTypeParam type_param_ = std::get<0>(GetParam());
+  const ConvolveTestParam param_ = std::get<1>(GetParam());
+
+ private:
+  ConvolveFunc base_convolve_func_;
+  ConvolveFunc cur_convolve_func_;
+  // Convolve filters are 7-tap, which requires 3 pixels
+  // (kRestorationHorizontalBorder) of padding.
+  Pixel source_[kMaxBlockHeight * kMaxBlockWidth] = {};
+  uint16_t source_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {};
+  uint16_t dest_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {};
+  Pixel dest_clipped_[kMaxBlockHeight * kMaxBlockWidth] = {};
+
+  const int source_stride_ = kMaxBlockWidth;
+  const int source_height_ = kMaxBlockHeight;
+};
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::GetConvolveFunc(const Dsp* const dsp,
+                                                    ConvolveFunc* func) {
+  *func =
+      dsp->convolve[type_param_.is_intra_block_copy][type_param_.is_compound]
+                   [type_param_.has_vertical_filter]
+                   [type_param_.has_horizontal_filter];
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::SetInputData(bool use_fixed_values,
+                                                 int value) {
+  if (use_fixed_values) {
+    std::fill(source_, source_ + source_height_ * source_stride_, value);
+  } else {
+    const int offset =
+        kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop;
+    const int mask = (1 << bitdepth) - 1;
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    const int height = param_.height;
+    const int width = param_.width;
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        source_[y * source_stride_ + x + offset] = rnd.Rand16() & mask;
+      }
+    }
+    // Copy border pixels to the left and right borders.
+    for (int y = 0; y < height; ++y) {
+      Memset(&source_[(y + kConvolveBorderLeftTop) * source_stride_],
+             source_[y * source_stride_ + offset], kConvolveBorderLeftTop);
+      Memset(&source_[y * source_stride_ + offset + width],
+             source_[y * source_stride_ + offset + width - 1],
+             kConvolveBorderLeftTop);
+    }
+    // Copy border pixels to the top and bottom borders.
+    for (int y = 0; y < kConvolveBorderLeftTop; ++y) {
+      memcpy(&source_[y * source_stride_],
+             &source_[kConvolveBorderLeftTop * source_stride_],
+             source_stride_ * sizeof(Pixel));
+      memcpy(&source_[(y + kConvolveBorderLeftTop + height) * source_stride_],
+             &source_[(kConvolveBorderLeftTop + height - 1) * source_stride_],
+             source_stride_ * sizeof(Pixel));
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Check(bool use_fixed_values,
+                                          const Pixel* src, const Pixel* dest,
+                                          libvpx_test::MD5* md5_digest) {
+  if (use_fixed_values) {
+    // For fixed values, input and output are identical.
+    const bool success =
+        test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+                                  kMaxBlockWidth, kMaxBlockWidth, false, false);
+    EXPECT_TRUE(success);
+  } else {
+    // For random input, compare md5.
+    const int offset =
+        kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+    const size_t size = sizeof(dest_clipped_) - offset * sizeof(Pixel);
+    md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Check16Bit(bool use_fixed_values,
+                                               const uint16_t* src,
+                                               const uint16_t* dest,
+                                               libvpx_test::MD5* md5_digest) {
+  if (use_fixed_values) {
+    // For fixed values, input and output are identical.
+    const bool success =
+        test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+                                  kMaxBlockWidth, kMaxBlockWidth, false);
+    EXPECT_TRUE(success);
+  } else {
+    // For random input, compare md5.
+    const int offset =
+        kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+    const size_t size = sizeof(dest_16bit_) - offset * sizeof(uint16_t);
+    md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Test(
+    bool use_fixed_values, int value, int num_runs /*= kMinimumViableRuns*/) {
+  // There's no point in testing fixed input for compound convolve.
+  if (type_param_.is_compound && use_fixed_values) return;
+
+  // There should not be any function set for this combination.
+  if (type_param_.is_intra_block_copy && type_param_.is_compound) {
+    ASSERT_EQ(cur_convolve_func_, nullptr);
+    return;
+  }
+
+  // Compound and intra block copy functions are only used for blocks 4x4 or
+  // greater.
+  if (type_param_.is_compound || type_param_.is_intra_block_copy) {
+    if (param_.width < 4 || param_.height < 4) {
+      GTEST_SKIP();
+    }
+  }
+
+  // Skip unspecialized functions.
+  if (cur_convolve_func_ == nullptr) {
+    GTEST_SKIP();
+  }
+
+  SetInputData(use_fixed_values, value);
+  int subpixel_x = 0;
+  int subpixel_y = 0;
+  int vertical_index = 0;
+  int horizontal_index = 0;
+  const int offset =
+      kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+  const Pixel* const src = source_ + offset;
+  const ptrdiff_t src_stride = source_stride_ * sizeof(Pixel);
+  const ptrdiff_t src_stride_16 = source_stride_;
+  const ptrdiff_t dst_stride = kMaxBlockWidth * sizeof(Pixel);
+  // Pack Compound output since we control the predictor buffer.
+  const ptrdiff_t dst_stride_compound = param_.width;
+
+  // Output is always 16 bits regardless of |bitdepth|.
+  uint16_t* dst_16 = dest_16bit_ + offset;
+  // Output depends on |bitdepth|.
+  Pixel* dst_pixel = dest_clipped_ + offset;
+
+  // Collect the outputs of the first |kMinimumViableRuns| runs into one md5
+  // buffer.
+  libvpx_test::MD5 md5_digest;
+
+  absl::Duration elapsed_time;
+  for (int i = 0; i < num_runs; ++i) {
+    // Test every filter.
+    // Because of masking, |subpixel_{x,y}| values roll over every 16
+    // iterations.
+    subpixel_x += 1 << 6;
+    subpixel_y += 1 << 6;
+
+    const int horizontal_filter_id = (subpixel_x >> 6) & 0xF;
+    const int vertical_filter_id = (subpixel_y >> 6) & 0xF;
+
+    // |filter_id| == 0 (copy) must be handled by the appropriate 1D or copy
+    // function.
+    if (horizontal_filter_id == 0 || vertical_filter_id == 0) {
+      continue;
+    }
+
+    // For focused speed testing, these can be set to the desired filter. Want
+    // only 8-tap filters? Set |{vertical,horizontal}_index| to 2.
+    vertical_index += static_cast<int>(i % 16 == 0);
+    vertical_index %= 4;
+    horizontal_index += static_cast<int>(i % 16 == 0);
+    horizontal_index %= 4;
+
+    if (type_param_.is_compound) {
+      // Output type is uint16_t.
+      const absl::Time start = absl::Now();
+      cur_convolve_func_(src, src_stride, horizontal_index, vertical_index,
+                         horizontal_filter_id, vertical_filter_id, param_.width,
+                         param_.height, dst_16, dst_stride_compound);
+      elapsed_time += absl::Now() - start;
+    } else {
+      // Output type is Pixel.
+      const absl::Time start = absl::Now();
+      cur_convolve_func_(src, src_stride, horizontal_index, vertical_index,
+                         horizontal_filter_id, vertical_filter_id, param_.width,
+                         param_.height, dst_pixel, dst_stride);
+      elapsed_time += absl::Now() - start;
+    }
+
+    // Only check the output for the first set. After that it's just repeated
+    // runs for speed timing.
+    if (i >= kMinimumViableRuns) continue;
+
+    if (type_param_.is_compound) {
+      // Need to copy source to a uint16_t buffer for comparison.
+      Pixel* src_ptr = source_;
+      uint16_t* src_ptr_16 = source_16bit_;
+      for (int y = 0; y < kMaxBlockHeight; ++y) {
+        for (int x = 0; x < kMaxBlockWidth; ++x) {
+          src_ptr_16[x] = src_ptr[x];
+        }
+        src_ptr += src_stride_16;
+        src_ptr_16 += src_stride_16;
+      }
+
+      Check16Bit(use_fixed_values, source_16bit_ + offset, dst_16, &md5_digest);
+    } else {
+      Check(use_fixed_values, src, dst_pixel, &md5_digest);
+    }
+  }
+
+  if (!use_fixed_values) {
+    // md5 sums are only calculated for random input.
+    const char* ref_digest = nullptr;
+    switch (bitdepth) {
+      case 8:
+        ref_digest = GetConvolveDigest8bpp(GetDigestId());
+        break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      case 10:
+        ref_digest = GetConvolveDigest10bpp(GetDigestId());
+        break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+      case 12:
+        ref_digest = GetConvolveDigest12bpp(GetDigestId());
+        break;
+#endif
+    }
+    ASSERT_NE(ref_digest, nullptr);
+
+    const char* direction;
+    if (type_param_.has_vertical_filter && type_param_.has_horizontal_filter) {
+      direction = "2D";
+    } else if (type_param_.has_vertical_filter) {
+      direction = "Vertical";
+    } else if (type_param_.has_horizontal_filter) {
+      direction = "Horizontal";
+    } else {
+      direction = "Copy";
+    }
+    const auto elapsed_time_us =
+        static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
+    printf("Mode Convolve%s%s%s[%25s]: %5d us MD5: %s\n",
+           type_param_.is_compound ? "Compound" : "",
+           type_param_.is_intra_block_copy ? "IntraBlockCopy" : "", direction,
+           absl::StrFormat("%dx%d", param_.width, param_.height).c_str(),
+           elapsed_time_us, md5_digest.Get());
+    EXPECT_STREQ(ref_digest, md5_digest.Get());
+  }
+}
+
+void ApplyFilterToSignedInput(const int min_input, const int max_input,
+                              const int8_t filter[kSubPixelTaps],
+                              int* min_output, int* max_output) {
+  int min = 0, max = 0;
+  for (int i = 0; i < kSubPixelTaps; ++i) {
+    const int tap = filter[i];
+    if (tap > 0) {
+      max += max_input * tap;
+      min += min_input * tap;
+    } else {
+      min += max_input * tap;
+      max += min_input * tap;
+    }
+  }
+  *min_output = min;
+  *max_output = max;
+}
+
+void ApplyFilterToUnsignedInput(const int max_input,
+                                const int8_t filter[kSubPixelTaps],
+                                int* min_output, int* max_output) {
+  ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output);
+}
+
+// Validate the maximum ranges for different parts of the Convolve process.
+template <int bitdepth>
+void ShowRange() {
+  // Subtract one from the shift bits because the filter is pre-shifted by 1.
+  constexpr int horizontal_bits = (bitdepth == kBitdepth12)
+                                      ? kInterRoundBitsHorizontal12bpp - 1
+                                      : kInterRoundBitsHorizontal - 1;
+  constexpr int vertical_bits = (bitdepth == kBitdepth12)
+                                    ? kInterRoundBitsVertical12bpp - 1
+                                    : kInterRoundBitsVertical - 1;
+  constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical - 1;
+
+  constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset;
+
+  constexpr int max_input = (1 << bitdepth) - 1;
+
+  const int8_t* worst_convolve_filter = kHalfSubPixelFilters[2][8];
+
+  // First pass.
+  printf("Bitdepth: %2d Input range:            [%8d, %8d]\n", bitdepth, 0,
+         max_input);
+
+  int min, max;
+  ApplyFilterToUnsignedInput(max_input, worst_convolve_filter, &min, &max);
+
+  if (bitdepth == 8) {
+    // 8bpp can use int16_t for sums.
+    assert(min > INT16_MIN);
+    assert(max < INT16_MAX);
+  } else {
+    // 10bpp and 12bpp require int32_t.
+    assert(min > INT32_MIN);
+    assert(max > INT16_MAX && max < INT32_MAX);
+  }
+
+  printf("  Horizontal upscaled range:         [%8d, %8d]\n", min, max);
+
+  const int first_pass_min = RightShiftWithRounding(min, horizontal_bits);
+  const int first_pass_max = RightShiftWithRounding(max, horizontal_bits);
+
+  // All bitdepths can use int16_t for first pass output.
+  assert(first_pass_min > INT16_MIN);
+  assert(first_pass_max < INT16_MAX);
+
+  printf("  Horizontal downscaled range:       [%8d, %8d]\n", first_pass_min,
+         first_pass_max);
+
+  // Second pass.
+  ApplyFilterToSignedInput(first_pass_min, first_pass_max,
+                           worst_convolve_filter, &min, &max);
+
+  // All bitdepths require int32_t for second pass sums.
+  assert(min < INT16_MIN && min > INT32_MIN);
+  assert(max > INT16_MAX && max < INT32_MAX);
+
+  printf("  Vertical upscaled range:           [%8d, %8d]\n", min, max);
+
+  // Second pass non-compound output is clipped to Pixel values.
+  const int second_pass_min =
+      Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input);
+  const int second_pass_max =
+      Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input);
+  printf("  Pixel output range:                [%8d, %8d]\n", second_pass_min,
+         second_pass_max);
+
+  // Output is Pixel so matches Pixel values.
+  assert(second_pass_min == 0);
+  assert(second_pass_max == max_input);
+
+  const int compound_second_pass_min =
+      RightShiftWithRounding(min, compound_vertical_bits) + compound_offset;
+  const int compound_second_pass_max =
+      RightShiftWithRounding(max, compound_vertical_bits) + compound_offset;
+
+  printf("  Compound output range:             [%8d, %8d]\n",
+         compound_second_pass_min, compound_second_pass_max);
+
+  if (bitdepth == 8) {
+    // 8bpp output is int16_t without an offset.
+    assert(compound_second_pass_min > INT16_MIN);
+    assert(compound_second_pass_max < INT16_MAX);
+  } else {
+    // 10bpp and 12bpp use the offset to fit inside uint16_t.
+    assert(compound_second_pass_min > 0);
+    assert(compound_second_pass_max < UINT16_MAX);
+  }
+
+  printf("\n");
+}
+
+TEST(ConvolveTest, ShowRange) {
+  ShowRange<kBitdepth8>();
+  ShowRange<kBitdepth10>();
+  ShowRange<kBitdepth12>();
+}
+
+using ConvolveTest8bpp = ConvolveTest<8, uint8_t>;
+
+TEST_P(ConvolveTest8bpp, FixedValues) {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, 255);
+}
+
+TEST_P(ConvolveTest8bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveTest8bpp, DISABLED_Speed) {
+  const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+//------------------------------------------------------------------------------
+template <int bitdepth, typename Pixel>
+class ConvolveScaleTest
+    : public testing::TestWithParam<
+          std::tuple<bool /*is_compound*/, ConvolveTestParam>> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  ConvolveScaleTest() = default;
+  ~ConvolveScaleTest() override = default;
+
+  void SetUp() override {
+    ConvolveInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_convolve_scale_func_ = dsp->convolve_scale[is_compound_];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_convolve_scale_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      ConvolveInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "AVX2/")) {
+      if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+      ConvolveInit_AVX2();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      ConvolveInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      ConvolveInit10bpp_NEON();
+#endif
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    cur_convolve_scale_func_ = dsp->convolve_scale[is_compound_];
+
+    // Skip functions that have not been specialized for this particular
+    // architecture.
+    if (cur_convolve_scale_func_ == base_convolve_scale_func_) {
+      cur_convolve_scale_func_ = nullptr;
+    }
+  }
+
+ protected:
+  int GetDigestId() const {
+    return param_.block_size +
+           param_.kNumBlockSizes * static_cast<int>(is_compound_);
+  }
+
+  void SetInputData(bool use_fixed_values, int value);
+  void Check(bool use_fixed_values, const Pixel* src, const Pixel* dest,
+             libvpx_test::MD5* md5_digest);
+  void Check16Bit(bool use_fixed_values, const uint16_t* src,
+                  const uint16_t* dest, libvpx_test::MD5* md5_digest);
+  // |num_runs| covers the categories of filters (6) and the number of filters
+  // under each category (16).
+  void Test(bool use_fixed_values, int value,
+            int num_runs = kMinimumViableRuns);
+
+  const bool is_compound_ = std::get<0>(GetParam());
+  const ConvolveTestParam param_ = std::get<1>(GetParam());
+
+ private:
+  ConvolveScaleFunc base_convolve_scale_func_;
+  ConvolveScaleFunc cur_convolve_scale_func_;
+  // Convolve filters are 7-tap, which requires 3 pixels
+  // (kRestorationHorizontalBorder) of padding.
+  // The source can be at most 2 times the max width/height.
+  Pixel source_[kMaxBlockHeight * kMaxBlockWidth * 4] = {};
+  uint16_t source_16bit_[kMaxBlockHeight * kMaxBlockWidth * 4] = {};
+  uint16_t dest_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {};
+  Pixel dest_clipped_[kMaxBlockHeight * kMaxBlockWidth] = {};
+
+  const int source_stride_ = kMaxBlockWidth * 2;
+  const int source_height_ = kMaxBlockHeight * 2;
+};
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::SetInputData(bool use_fixed_values,
+                                                      int value) {
+  if (use_fixed_values) {
+    std::fill(source_, source_ + source_height_ * source_stride_, value);
+  } else {
+    const int offset =
+        kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop;
+    const int mask = (1 << bitdepth) - 1;
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    const int height = param_.height * 2;
+    const int width = param_.width * 2;
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        source_[y * source_stride_ + x + offset] = rnd.Rand16() & mask;
+      }
+    }
+    // Copy border pixels to the left and right borders.
+    for (int y = 0; y < height; ++y) {
+      Memset(&source_[(y + kConvolveBorderLeftTop) * source_stride_],
+             source_[y * source_stride_ + offset], kConvolveBorderLeftTop);
+      Memset(&source_[y * source_stride_ + offset + width],
+             source_[y * source_stride_ + offset + width - 1],
+             kConvolveBorderLeftTop);
+    }
+    // Copy border pixels to the top and bottom borders.
+    for (int y = 0; y < kConvolveBorderLeftTop; ++y) {
+      memcpy(&source_[y * source_stride_],
+             &source_[kConvolveBorderLeftTop * source_stride_],
+             source_stride_ * sizeof(Pixel));
+      memcpy(&source_[(y + kConvolveBorderLeftTop + height) * source_stride_],
+             &source_[(kConvolveBorderLeftTop + height - 1) * source_stride_],
+             source_stride_ * sizeof(Pixel));
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::Check(bool use_fixed_values,
+                                               const Pixel* src,
+                                               const Pixel* dest,
+                                               libvpx_test::MD5* md5_digest) {
+  if (use_fixed_values) {
+    // For fixed values, input and output are identical.
+    const bool success =
+        test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+                                  kMaxBlockWidth, kMaxBlockWidth, false, false);
+    EXPECT_TRUE(success);
+  } else {
+    // For random input, compare md5.
+    const int offset =
+        kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+    const size_t size = sizeof(dest_clipped_) - offset * sizeof(Pixel);
+    md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::Check16Bit(
+    bool use_fixed_values, const uint16_t* src, const uint16_t* dest,
+    libvpx_test::MD5* md5_digest) {
+  if (use_fixed_values) {
+    // For fixed values, input and output are identical.
+    const bool success =
+        test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+                                  kMaxBlockWidth, kMaxBlockWidth, false);
+    EXPECT_TRUE(success);
+  } else {
+    // For random input, compare md5.
+    const int offset =
+        kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+    const size_t size = sizeof(dest_16bit_) - offset * sizeof(uint16_t);
+    md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::Test(
+    bool use_fixed_values, int value, int num_runs /*= kMinimumViableRuns*/) {
+  // Testing fixed input is not meaningful for compound convolve.
+  if (is_compound_ && use_fixed_values) return;
+
+  // The compound function is only used for blocks 4x4 or greater.
+  if (is_compound_) {
+    if (param_.width < 4 || param_.height < 4) {
+      GTEST_SKIP();
+    }
+  }
+
+  // Skip unspecialized functions.
+  if (cur_convolve_scale_func_ == nullptr) {
+    GTEST_SKIP();
+  }
+
+  SetInputData(use_fixed_values, value);
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() +
+                             GetDigestId());
+  // [1,2048] for |step_[xy]|. This covers a scaling range of 1/1024 to 2x.
+  const int step_x = (rnd.Rand16() & ((1 << 11) - 1)) + 1;
+  const int step_y = (rnd.Rand16() & ((1 << 11) - 1)) + 1;
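+  // Note (clarifying the range above): steps are in 1/1024-pel units, so
+  // step == 1024 is unscaled; the 11-bit mask plus one yields [1, 2048].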
+  int subpixel_x = 0;
+  int subpixel_y = 0;
+  int vertical_index = 0;
+  int horizontal_index = 0;
+  const int offset =
+      kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+  const int offset_scale =
+      kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop;
+  const Pixel* const src_scale = source_ + offset_scale;
+  const ptrdiff_t src_stride = source_stride_ * sizeof(Pixel);
+  const ptrdiff_t dst_stride = kMaxBlockWidth * sizeof(Pixel);
+  // Pack Compound output since we control the predictor buffer.
+  const ptrdiff_t dst_stride_compound = param_.width;
+
+  // Compound output is always 16 bits regardless of |bitdepth|.
+  uint16_t* dst_16 = dest_16bit_ + offset;
+  // Non-compound output is a Pixel, so it depends on |bitdepth|.
+  Pixel* dst_pixel = dest_clipped_ + offset;
+
+  // Collect the first |kMinimumViableRuns| runs into one md5 buffer.
+  libvpx_test::MD5 md5_digest;
+
+  absl::Duration elapsed_time;
+  for (int i = 0; i < num_runs; ++i) {
+    // Test every filter.
+    // Because of masking, |subpixel_{x,y}| values roll over every 16
+    // iterations.
+    subpixel_x += 1 << 6;
+    subpixel_y += 1 << 6;
+
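+    // Bits [6, 9] of each subpixel value select one of the 16 filters, so
+    // stepping by 1 << 6 visits a new filter id each iteration.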
+    const int horizontal_filter_id = (subpixel_x >> 6) & 0xF;
+    const int vertical_filter_id = (subpixel_y >> 6) & 0xF;
+
+    // |filter_id| == 0 (copy) must be handled by the appropriate 1D or copy
+    // function.
+    if (horizontal_filter_id == 0 || vertical_filter_id == 0) {
+      continue;
+    }
+
+    // For focused speed testing, these can be set to the desired filter. Want
+    // only 8-tap filters? Set |{vertical,horizontal}_index| to 2.
+    vertical_index += static_cast<int>(i % 16 == 0);
+    vertical_index %= 4;
+    horizontal_index += static_cast<int>(i % 16 == 0);
+    horizontal_index %= 4;
+
+    // The compound path writes uint16_t; the non-compound path writes Pixel.
+    const absl::Time start = absl::Now();
+    if (is_compound_) {
+      cur_convolve_scale_func_(
+          source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x,
+          step_y, param_.width, param_.height, dst_16, dst_stride_compound);
+    } else {
+      cur_convolve_scale_func_(
+          source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x,
+          step_y, param_.width, param_.height, dst_pixel, dst_stride);
+    }
+    elapsed_time += absl::Now() - start;
+
+    // Only check the output for the first set. After that it's just repeated
+    // runs for speed timing.
+    if (i >= kMinimumViableRuns) continue;
+
+    // The convolve function does not clip its output; clipping is applied
+    // later in the pipeline. libaom clips at this stage, so clip here as well
+    // to make the test output match libaom.
+    if (is_compound_) {
+      const int single_round_offset = (1 << bitdepth) + (1 << (bitdepth - 1));
+      Pixel* dest_row = dest_clipped_;
+      for (int y = 0; y < kMaxBlockHeight; ++y) {
+        for (int x = 0; x < kMaxBlockWidth; ++x) {
+          dest_row[x] = static_cast<Pixel>(Clip3(
+              dest_16bit_[y * dst_stride_compound + x] - single_round_offset, 0,
+              (1 << bitdepth) - 1));
+        }
+        dest_row += kMaxBlockWidth;
+      }
+    }
+
+    if (is_compound_) {
+      Check16Bit(use_fixed_values, source_16bit_ + offset_scale, dst_16,
+                 &md5_digest);
+    } else {
+      Check(use_fixed_values, src_scale, dst_pixel, &md5_digest);
+    }
+  }
+
+  if (!use_fixed_values) {
+    // md5 sums are only calculated for random input.
+    const char* ref_digest = nullptr;
+    switch (bitdepth) {
+      case 8:
+        ref_digest = GetConvolveScaleDigest8bpp(GetDigestId());
+        break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      case 10:
+        ref_digest = GetConvolveScaleDigest10bpp(GetDigestId());
+        break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+      case 12:
+        ref_digest = GetConvolveScaleDigest12bpp(GetDigestId());
+        break;
+#endif
+    }
+    ASSERT_NE(ref_digest, nullptr);
+
+    const auto elapsed_time_us =
+        static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
+    printf("Mode Convolve%sScale2D[%25s]: %5d us MD5: %s\n",
+           is_compound_ ? "Compound" : "",
+           absl::StrFormat("%dx%d", param_.width, param_.height).c_str(),
+           elapsed_time_us, md5_digest.Get());
+    EXPECT_STREQ(ref_digest, md5_digest.Get());
+  }
+}
+
+using ConvolveScaleTest8bpp = ConvolveScaleTest<8, uint8_t>;
+
+TEST_P(ConvolveScaleTest8bpp, FixedValues) {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, 255);
+}
+
+TEST_P(ConvolveScaleTest8bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveScaleTest8bpp, DISABLED_Speed) {
+  const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+//------------------------------------------------------------------------------
+const ConvolveTestParam kConvolveParam[] = {
+    ConvolveTestParam(ConvolveTestParam::kBlockSize2x2),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize2x4),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize4x2),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize4x4),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize4x8),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize8x2),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize8x4),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize8x8),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize8x16),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize16x8),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize16x16),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize16x32),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize32x16),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize32x32),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize32x64),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize64x32),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize64x64),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize64x128),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize128x64),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize128x128),
+};
+
+const ConvolveTypeParam kConvolveTypeParam[] = {
+    ConvolveTypeParam(false, false, false, false),
+    ConvolveTypeParam(false, false, false, true),
+    ConvolveTypeParam(false, false, true, false),
+    ConvolveTypeParam(false, false, true, true),
+    ConvolveTypeParam(false, true, false, false),
+    ConvolveTypeParam(false, true, false, true),
+    ConvolveTypeParam(false, true, true, false),
+    ConvolveTypeParam(false, true, true, true),
+    ConvolveTypeParam(true, false, false, false),
+    ConvolveTypeParam(true, false, false, true),
+    ConvolveTypeParam(true, false, true, false),
+    ConvolveTypeParam(true, false, true, true),
+    // This entry is kept to ensure no function exists for |intra_block_copy|
+    // when |is_compound| is true; not all combinations are necessary.
+    ConvolveTypeParam(true, true, false, false),
+};
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest8bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(C, ConvolveScaleTest8bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest8bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveScaleTest8bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ConvolveTest8bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(SSE41, ConvolveScaleTest8bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest8bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveScaleTest8bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+#endif  // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ConvolveTest10bpp = ConvolveTest<10, uint16_t>;
+
+TEST_P(ConvolveTest10bpp, FixedValues) {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, (1 << 10) - 1);
+}
+
+TEST_P(ConvolveTest10bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveTest10bpp, DISABLED_Speed) {
+  const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+using ConvolveScaleTest10bpp = ConvolveScaleTest<10, uint16_t>;
+
+TEST_P(ConvolveScaleTest10bpp, FixedValues) {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, (1 << 10) - 1);
+}
+
+TEST_P(ConvolveScaleTest10bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveScaleTest10bpp, DISABLED_Speed) {
+  const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest10bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(C, ConvolveScaleTest10bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest10bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveScaleTest10bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ConvolveTest12bpp = ConvolveTest<12, uint16_t>;
+
+TEST_P(ConvolveTest12bpp, FixedValues) {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, (1 << 12) - 1);
+}
+
+TEST_P(ConvolveTest12bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveTest12bpp, DISABLED_Speed) {
+  const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+using ConvolveScaleTest12bpp = ConvolveScaleTest<12, uint16_t>;
+
+TEST_P(ConvolveScaleTest12bpp, FixedValues) {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, (1 << 12) - 1);
+}
+
+TEST_P(ConvolveScaleTest12bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveScaleTest12bpp, DISABLED_Speed) {
+  const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest12bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(C, ConvolveScaleTest12bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/distance_weighted_blend.cc b/src/dsp/distance_weighted_blend.cc
new file mode 100644 (file)
index 0000000..ef83235
--- /dev/null
@@ -0,0 +1,121 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void DistanceWeightedBlend_C(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
+                             const uint8_t weight_0, const uint8_t weight_1,
+                             const int width, const int height,
+                             void* LIBGAV1_RESTRICT const dest,
+                             const ptrdiff_t dest_stride) {
+  // 7.11.3.2 Rounding variables derivation process
+  //   2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
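+  //   12bpp: 2 * 7 - (5 + 7) = 2;  8bpp/10bpp: 2 * 7 - (3 + 7) = 4.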
+  constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+  const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+  auto* dst = static_cast<Pixel*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      // See warp.cc and convolve.cc for detailed prediction ranges.
+      // weight_0 + weight_1 = 16.
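+      // Because the weights sum to 16 (1 << 4), the "+ 4" in the shift below
+      // folds the weight normalization into the rounding shift.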
+      int res = pred_0[x] * weight_0 + pred_1[x] * weight_1;
+      res -= (bitdepth == 8) ? 0 : kCompoundOffset * 16;
+      dst[x] = static_cast<Pixel>(
+          Clip3(RightShiftWithRounding(res, inter_post_round_bits + 4), 0,
+                (1 << bitdepth) - 1));
+    } while (++x < width);
+
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+  } while (++y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_DistanceWeightedBlend
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void DistanceWeightedBlendInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/distance_weighted_blend.h b/src/dsp/distance_weighted_blend.h
new file mode 100644 (file)
index 0000000..1a782b6
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
+#define LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/distance_weighted_blend_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.). The
+// order of includes is important as each header tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/distance_weighted_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
diff --git a/src/dsp/distance_weighted_blend_test.cc b/src/dsp/distance_weighted_blend_test.cc
new file mode 100644 (file)
index 0000000..0d6e1cd
--- /dev/null
@@ -0,0 +1,328 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+
+#include <cassert>
+#include <cstdint>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 500000;
+
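+// Each {weight_0, weight_1} pair below sums to 16, matching the normalization
+// assumed by the blend (see DistanceWeightedBlend_C).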
+constexpr int kQuantizedDistanceLookup[4][2] = {
+    {9, 7}, {11, 5}, {12, 4}, {13, 3}};
+
+template <int bitdepth, typename Pixel>
+class DistanceWeightedBlendTest : public testing::TestWithParam<BlockSize>,
+                                  public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  DistanceWeightedBlendTest() = default;
+  ~DistanceWeightedBlendTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    DistanceWeightedBlendInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_func_ = dsp->distance_weighted_blend;
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      DistanceWeightedBlendInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      DistanceWeightedBlendInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    func_ = dsp->distance_weighted_blend;
+  }
+
+ protected:
+  void Test(const char* digest, int num_tests);
+
+ private:
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  static constexpr int kDestStride = kMaxSuperBlockSizeInPixels;
+  const int width_ = kBlockWidthPixels[GetParam()];
+  const int height_ = kBlockHeightPixels[GetParam()];
+  alignas(kMaxAlignment) PredType
+      source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+  alignas(kMaxAlignment) PredType
+      source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+  Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+  Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] =
+      {};
+  dsp::DistanceWeightedBlendFunc base_func_;
+  dsp::DistanceWeightedBlendFunc func_;
+};
+
+template <int bitdepth, typename Pixel>
+void DistanceWeightedBlendTest<bitdepth, Pixel>::Test(const char* digest,
+                                                      int num_tests) {
+  if (func_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  PredType* src_1 = source1_;
+  PredType* src_2 = source2_;
+
+  const int index = rnd.Rand8() & 3;
+  const uint8_t weight_0 = kQuantizedDistanceLookup[index][0];
+  const uint8_t weight_1 = kQuantizedDistanceLookup[index][1];
+  // In libgav1, predictors have an offset which is later subtracted and
+  // clipped in distance weighted blending. Therefore we add the offset
+  // here to match libaom's implementation.
+  for (int y = 0; y < height_; ++y) {
+    for (int x = 0; x < width_; ++x) {
+      // distance_weighted_blend is applied to compound prediction values,
+      // which implies a range far exceeding that of pixel values. The ranges
+      // include kCompoundOffset in 10bpp and 12bpp.
+      // See: src/dsp/convolve.cc & src/dsp/warp.cc.
+      static constexpr int kCompoundPredictionRange[3][2] = {
+          // 8bpp
+          {-5132, 9212},
+          // 10bpp
+          {3988, 61532},
+          // 12bpp
+          {3974, 61559},
+      };
+      constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+      const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+      const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+      src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+      src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+    }
+    src_1 += width_;
+    src_2 += width_;
+  }
+  absl::Duration elapsed_time;
+  for (int i = 0; i < num_tests; ++i) {
+    const absl::Time start = absl::Now();
+    func_(source1_, source2_, weight_0, weight_1, width_, height_, dest_,
+          sizeof(Pixel) * kDestStride);
+    elapsed_time += absl::Now() - start;
+  }
+
+  test_utils::CheckMd5Digest("DistanceWeightedBlend", ToString(GetParam()),
+                             digest, dest_, sizeof(dest_), elapsed_time);
+}
+
+const BlockSize kTestParam[] = {
+    kBlock4x4,    kBlock4x8,     kBlock4x16,  kBlock8x4,   kBlock8x8,
+    kBlock8x16,   kBlock8x32,    kBlock16x4,  kBlock16x8,  kBlock16x16,
+    kBlock16x32,  kBlock16x64,   kBlock32x8,  kBlock32x16, kBlock32x32,
+    kBlock32x64,  kBlock64x16,   kBlock64x32, kBlock64x64, kBlock64x128,
+    kBlock128x64, kBlock128x128,
+};
+
+const char* GetDistanceWeightedBlendDigest8bpp(const BlockSize block_size) {
+  static const char* const kDigests[kMaxBlockSizes] = {
+      // 4xN
+      "ebf389f724f8ab46a2cac895e4e073ca",
+      "09acd567b6b12c8cf8eb51d8b86eb4bf",
+      "57bb4d65695d8ec6752f2bd8686b64fd",
+      // 8xN
+      "270905ac76f9a2cba8a552eb0bf7c8c1",
+      "f0801c8574d2c271ef2bbea77a1d7352",
+      "e761b580e3312be33a227492a233ce72",
+      "ff214dab1a7e98e2285961d6421720c6",
+      // 16xN
+      "4f712609a36e817f9752326d58562ff8",
+      "14243f5c5f7c7104160c1f2cef0a0fbc",
+      "3ac3f3161b7c8dd8436b02abfdde104a",
+      "81a00b704e0e41a5dbe6436ac70c098d",
+      "af8fd02017c7acdff788be742d700baa",
+      // 32xN
+      "ee34332c66a6d6ed8ce64031aafe776c",
+      "b5e3d22bd2dbdb624c8b86a1afb5ce6d",
+      "607ffc22098d81b7e37a7bf62f4af5d3",
+      "3823dbf043b4682f56d5ca698e755ea5",
+      // 64xN
+      "4acf556b921956c2bc24659cd5128401",
+      "a298c544c9c3b27924b4c23cc687ea5a",
+      "539e2df267782ce61c70103b23b7d922",
+      "3b0cb2a0b5d384efee4d81401025bec1",
+      // 128xN
+      "8b56b636dd712c2f8d138badb7219991",
+      "8cfc8836908902b8f915639b7bff45b3",
+  };
+  assert(block_size < kMaxBlockSizes);
+  return kDigests[block_size];
+}
+
+using DistanceWeightedBlendTest8bpp = DistanceWeightedBlendTest<8, uint8_t>;
+
+TEST_P(DistanceWeightedBlendTest8bpp, Blending) {
+  Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), 1);
+}
+
+TEST_P(DistanceWeightedBlendTest8bpp, DISABLED_Speed) {
+  Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest8bpp,
+                         testing::ValuesIn(kTestParam));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest8bpp,
+                         testing::ValuesIn(kTestParam));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest8bpp,
+                         testing::ValuesIn(kTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDistanceWeightedBlendDigest10bpp(const BlockSize block_size) {
+  static const char* const kDigests[kMaxBlockSizes] = {
+      // 4xN
+      "55f594b56e16d5c401274affebbcc3d3",
+      "69df14da4bb33a8f7d7087921008e919",
+      "1b61f33604c54015794198a13bfebf46",
+      // 8xN
+      "825a938185b152f7cf09bf1c0723ce2b",
+      "85ea315c51d979bc9b45834d6b40ec6f",
+      "92ebde208e8c39f7ec6de2de82182dbb",
+      "520f84716db5b43684dbb703806383fe",
+      // 16xN
+      "12ca23e3e2930005a0511646e8c83da4",
+      "6208694a6744f4a3906f58c1add670e3",
+      "a33d63889df989a3bbf84ff236614267",
+      "34830846ecb0572a98bbd192fed02b16",
+      "34bb2f79c0bd7f9a80691b8af597f2a8",
+      // 32xN
+      "fa97f2d0e3143f1f44d3ac018b0d696d",
+      "3df4a22456c9ab6ed346ab1b9750ae7d",
+      "6276a058b35c6131bc0c94a4b4a37ebc",
+      "9ca42da5d2d5eb339df03ae2c7a26914",
+      // 64xN
+      "800e692c520f99223bc24c1ac95a0166",
+      "818b6d20426585ef7fe844015a03aaf5",
+      "fb48691ccfff083e01d74826e88e613f",
+      "0bd350bc5bc604a224d77a5f5a422698",
+      // 128xN
+      "a130840813cd6bd69d09bcf5f8d0180f",
+      "6ece1846bea55e8f8f2ed7fbf73718de",
+  };
+  assert(block_size < kMaxBlockSizes);
+  return kDigests[block_size];
+}
+
+using DistanceWeightedBlendTest10bpp = DistanceWeightedBlendTest<10, uint16_t>;
+
+TEST_P(DistanceWeightedBlendTest10bpp, Blending) {
+  Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), 1);
+}
+
+TEST_P(DistanceWeightedBlendTest10bpp, DISABLED_Speed) {
+  Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest10bpp,
+                         testing::ValuesIn(kTestParam));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest10bpp,
+                         testing::ValuesIn(kTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest10bpp,
+                         testing::ValuesIn(kTestParam));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDistanceWeightedBlendDigest12bpp(const BlockSize block_size) {
+  static const char* const kDigests[kMaxBlockSizes] = {
+      // 4xN
+      "e30bf8f5f294206ad1dd79bd10a20827",
+      "f0cfb60134562d9c5f2ec6ad106e01ef",
+      "ad0876244e1b769203266a9c75b74afc",
+      // 8xN
+      "5265b954479c15a80f427561c5f36ff4",
+      "7f157457d1671e4ecce7a0884e9e3f76",
+      "d2cef5cf217f2d1f787c8951b7fe7cb2",
+      "6d23059008adbbb84ac941c8b4968f5b",
+      // 16xN
+      "ae521a5656ed3440d1fa950c20d90a79",
+      "935bec0e12b5dd3e0c34b3de8ba51476",
+      "0334bafcdcd7ddddb673ded492bca25a",
+      "c5360f08d0be77c79dc19fb55a0c5fe0",
+      "c2d1e7a4244a8aaaac041aed0cefc148",
+      // 32xN
+      "ce7f3cf78ae4f836cf69763137f7f6a6",
+      "800e52ebb14d5831c047d391cd760f95",
+      "74aa2b412b42165f1967daf3042b4f17",
+      "140d4cc600944b629b1991e88a9fe97c",
+      // 64xN
+      "3d206f93229ee2cea5c5da4e0ac6445a",
+      "3d13028f8fffe79fd35752c0177291ca",
+      "e7a7669acb5979dc7b15a19eed09cd4c",
+      "599368f4971c203fc5fa32989fe8cb44",
+      // 128xN
+      "54b46af2e2c8d2081e26fa0315b4ffd7",
+      "602e769bb2104e78223e68e50e7e86a0",
+  };
+  assert(block_size < kMaxBlockSizes);
+  return kDigests[block_size];
+}
+
+using DistanceWeightedBlendTest12bpp = DistanceWeightedBlendTest<12, uint16_t>;
+
+TEST_P(DistanceWeightedBlendTest12bpp, Blending) {
+  Test(GetDistanceWeightedBlendDigest12bpp(GetParam()), 1);
+}
+
+TEST_P(DistanceWeightedBlendTest12bpp, DISABLED_Speed) {
+  Test(GetDistanceWeightedBlendDigest12bpp(GetParam()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest12bpp,
+                         testing::ValuesIn(kTestParam));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const BlockSize param) {
+  return os << ToString(param);
+}
+
+}  // namespace libgav1
diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc
new file mode 100644 (file)
index 0000000..97a064f
--- /dev/null
@@ -0,0 +1,178 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/dsp.h"
+
+#include <mutex>  // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/average_blend.h"
+#include "src/dsp/cdef.h"
+#include "src/dsp/convolve.h"
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/dsp/film_grain.h"
+#include "src/dsp/intra_edge.h"
+#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
+#include "src/dsp/intrapred_directional.h"
+#include "src/dsp/intrapred_filter.h"
+#include "src/dsp/intrapred_smooth.h"
+#include "src/dsp/inverse_transform.h"
+#include "src/dsp/loop_filter.h"
+#include "src/dsp/loop_restoration.h"
+#include "src/dsp/mask_blend.h"
+#include "src/dsp/motion_field_projection.h"
+#include "src/dsp/motion_vector_search.h"
+#include "src/dsp/obmc.h"
+#include "src/dsp/super_res.h"
+#include "src/dsp/warp.h"
+#include "src/dsp/weight_mask.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp_internal {
+
+void DspInit_C() {
+  dsp::AverageBlendInit_C();
+  dsp::CdefInit_C();
+  dsp::ConvolveInit_C();
+  dsp::DistanceWeightedBlendInit_C();
+  dsp::FilmGrainInit_C();
+  dsp::IntraEdgeInit_C();
+  dsp::IntraPredCflInit_C();
+  dsp::IntraPredDirectionalInit_C();
+  dsp::IntraPredFilterInit_C();
+  dsp::IntraPredInit_C();
+  dsp::IntraPredSmoothInit_C();
+  dsp::InverseTransformInit_C();
+  dsp::LoopFilterInit_C();
+  dsp::LoopRestorationInit_C();
+  dsp::MaskBlendInit_C();
+  dsp::MotionFieldProjectionInit_C();
+  dsp::MotionVectorSearchInit_C();
+  dsp::ObmcInit_C();
+  dsp::SuperResInit_C();
+  dsp::WarpInit_C();
+  dsp::WeightMaskInit_C();
+}
+
+dsp::Dsp* GetWritableDspTable(int bitdepth) {
+  switch (bitdepth) {
+    case 8: {
+      static dsp::Dsp dsp_8bpp;
+      return &dsp_8bpp;
+    }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    case 10: {
+      static dsp::Dsp dsp_10bpp;
+      return &dsp_10bpp;
+    }
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+    case 12: {
+      static dsp::Dsp dsp_12bpp;
+      return &dsp_12bpp;
+    }
+#endif
+  }
+  return nullptr;
+}
+
+}  // namespace dsp_internal
+
+namespace dsp {
+
+void DspInit() {
+  static std::once_flag once;
+  std::call_once(once, []() {
+    dsp_internal::DspInit_C();
+#if LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+    const uint32_t cpu_features = GetCpuInfo();
+#if LIBGAV1_ENABLE_SSE4_1
+    if ((cpu_features & kSSE4_1) != 0) {
+      AverageBlendInit_SSE4_1();
+      CdefInit_SSE4_1();
+      ConvolveInit_SSE4_1();
+      DistanceWeightedBlendInit_SSE4_1();
+      FilmGrainInit_SSE4_1();
+      IntraEdgeInit_SSE4_1();
+      IntraPredCflInit_SSE4_1();
+      IntraPredDirectionalInit_SSE4_1();
+      IntraPredFilterInit_SSE4_1();
+      IntraPredInit_SSE4_1();
+      IntraPredSmoothInit_SSE4_1();
+      InverseTransformInit_SSE4_1();
+      LoopFilterInit_SSE4_1();
+      LoopRestorationInit_SSE4_1();
+      MaskBlendInit_SSE4_1();
+      MotionFieldProjectionInit_SSE4_1();
+      MotionVectorSearchInit_SSE4_1();
+      ObmcInit_SSE4_1();
+      SuperResInit_SSE4_1();
+      WarpInit_SSE4_1();
+      WeightMaskInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_SSE4_1();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    }
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_AVX2
+    if ((cpu_features & kAVX2) != 0) {
+      CdefInit_AVX2();
+      ConvolveInit_AVX2();
+      LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_AVX2();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    }
+#endif  // LIBGAV1_ENABLE_AVX2
+#endif  // LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+#if LIBGAV1_ENABLE_NEON
+    AverageBlendInit_NEON();
+    CdefInit_NEON();
+    ConvolveInit_NEON();
+    DistanceWeightedBlendInit_NEON();
+    FilmGrainInit_NEON();
+    IntraEdgeInit_NEON();
+    IntraPredCflInit_NEON();
+    IntraPredDirectionalInit_NEON();
+    IntraPredFilterInit_NEON();
+    IntraPredInit_NEON();
+    IntraPredSmoothInit_NEON();
+    InverseTransformInit_NEON();
+    LoopFilterInit_NEON();
+    LoopRestorationInit_NEON();
+    MaskBlendInit_NEON();
+    MotionFieldProjectionInit_NEON();
+    MotionVectorSearchInit_NEON();
+    ObmcInit_NEON();
+    SuperResInit_NEON();
+    WarpInit_NEON();
+    WeightMaskInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    ConvolveInit10bpp_NEON();
+    InverseTransformInit10bpp_NEON();
+    LoopFilterInit10bpp_NEON();
+    LoopRestorationInit10bpp_NEON();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+#endif  // LIBGAV1_ENABLE_NEON
+  });
+}
+
+const Dsp* GetDspTable(int bitdepth) {
+  return dsp_internal::GetWritableDspTable(bitdepth);
+}
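+
+// Usage sketch (illustrative, not part of the upstream API surface): callers
+// initialize the tables once, then fetch the table for the stream's bitdepth.
+// GetDspTable() returns nullptr for unsupported bitdepths, so the result
+// should be checked.
+//   dsp::DspInit();
+//   const dsp::Dsp* const dsp = dsp::GetDspTable(8);
+//   assert(dsp != nullptr);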
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
new file mode 100644 (file)
index 0000000..f9e6b22
--- /dev/null
@@ -0,0 +1,963 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_DSP_H_
+#define LIBGAV1_SRC_DSP_DSP_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/cpu.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+
+#if !defined(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS)
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 0
+#endif
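+
+// When LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is 0 (the default above), each
+// module's Init*_C() installs a C function only when no specialized version
+// was selected at compile time, as signaled by the per-function
+// LIBGAV1_Dsp*bpp_* defines pulled in through the module headers.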
+
+enum IntraPredictor : uint8_t {
+  kIntraPredictorDcFill,
+  kIntraPredictorDcTop,
+  kIntraPredictorDcLeft,
+  kIntraPredictorDc,
+  kIntraPredictorVertical,
+  kIntraPredictorHorizontal,
+  kIntraPredictorPaeth,
+  kIntraPredictorSmooth,
+  kIntraPredictorSmoothVertical,
+  kIntraPredictorSmoothHorizontal,
+  kNumIntraPredictors
+};
+
+// List of valid 1D transforms.
+enum Transform1d : uint8_t {
+  kTransform1dDct,   // Discrete Cosine Transform.
+  kTransform1dAdst,  // Asymmetric Discrete Sine Transform.
+  kTransform1dIdentity,
+  kTransform1dWht,  // Walsh Hadamard Transform.
+  kNumTransform1ds
+};
+
+// List of valid 1D transform sizes. Not all transforms may be available for all
+// the sizes.
+enum Transform1dSize : uint8_t {
+  kTransform1dSize4,
+  kTransform1dSize8,
+  kTransform1dSize16,
+  kTransform1dSize32,
+  kTransform1dSize64,
+  kNumTransform1dSizes
+};
+
+// The maximum width of the loop filter; fewer pixels may be filtered depending
+// on strength thresholds.
+enum LoopFilterSize : uint8_t {
+  kLoopFilterSize4,
+  kLoopFilterSize6,
+  kLoopFilterSize8,
+  kLoopFilterSize14,
+  kNumLoopFilterSizes
+};
+
+enum : uint8_t {
+  kRow = 0,
+  kColumn = 1,
+};
+
+//------------------------------------------------------------------------------
+// ToString()
+//
+// These functions are meant to be used only in debug logging and within tests.
+// They are defined inline to avoid including the strings in the release
+// library when logging is disabled; unreferenced functions will not be added to
+// any object file in that case.
+
+inline const char* ToString(const IntraPredictor predictor) {
+  switch (predictor) {
+    case kIntraPredictorDcFill:
+      return "kIntraPredictorDcFill";
+    case kIntraPredictorDcTop:
+      return "kIntraPredictorDcTop";
+    case kIntraPredictorDcLeft:
+      return "kIntraPredictorDcLeft";
+    case kIntraPredictorDc:
+      return "kIntraPredictorDc";
+    case kIntraPredictorVertical:
+      return "kIntraPredictorVertical";
+    case kIntraPredictorHorizontal:
+      return "kIntraPredictorHorizontal";
+    case kIntraPredictorPaeth:
+      return "kIntraPredictorPaeth";
+    case kIntraPredictorSmooth:
+      return "kIntraPredictorSmooth";
+    case kIntraPredictorSmoothVertical:
+      return "kIntraPredictorSmoothVertical";
+    case kIntraPredictorSmoothHorizontal:
+      return "kIntraPredictorSmoothHorizontal";
+    case kNumIntraPredictors:
+      return "kNumIntraPredictors";
+  }
+  abort();
+}
+
+inline const char* ToString(const Transform1d transform) {
+  switch (transform) {
+    case kTransform1dDct:
+      return "kTransform1dDct";
+    case kTransform1dAdst:
+      return "kTransform1dAdst";
+    case kTransform1dIdentity:
+      return "kTransform1dIdentity";
+    case kTransform1dWht:
+      return "kTransform1dWht";
+    case kNumTransform1ds:
+      return "kNumTransform1ds";
+  }
+  abort();
+}
+
+inline const char* ToString(const Transform1dSize transform_size) {
+  switch (transform_size) {
+    case kTransform1dSize4:
+      return "kTransform1dSize4";
+    case kTransform1dSize8:
+      return "kTransform1dSize8";
+    case kTransform1dSize16:
+      return "kTransform1dSize16";
+    case kTransform1dSize32:
+      return "kTransform1dSize32";
+    case kTransform1dSize64:
+      return "kTransform1dSize64";
+    case kNumTransform1dSizes:
+      return "kNumTransform1dSizes";
+  }
+  abort();
+}
+
+inline const char* ToString(const LoopFilterSize filter_size) {
+  switch (filter_size) {
+    case kLoopFilterSize4:
+      return "kLoopFilterSize4";
+    case kLoopFilterSize6:
+      return "kLoopFilterSize6";
+    case kLoopFilterSize8:
+      return "kLoopFilterSize8";
+    case kLoopFilterSize14:
+      return "kLoopFilterSize14";
+    case kNumLoopFilterSizes:
+      return "kNumLoopFilterSizes";
+  }
+  abort();
+}
+
+inline const char* ToString(const LoopFilterType filter_type) {
+  switch (filter_type) {
+    case kLoopFilterTypeVertical:
+      return "kLoopFilterTypeVertical";
+    case kLoopFilterTypeHorizontal:
+      return "kLoopFilterTypeHorizontal";
+    case kNumLoopFilterTypes:
+      return "kNumLoopFilterTypes";
+  }
+  abort();
+}
+
+//------------------------------------------------------------------------------
+// Intra predictors. Section 7.11.2.
+// These require access to one or both of the top row and left column. Some may
+// access the top-left (top[-1]), top-right (top[width+N]), bottom-left
+// (left[height+N]) or upper-left (left[-1]).
+
+// Intra predictor function signature. Sections 7.11.2.2, 7.11.2.4 (#10,#11),
+// 7.11.2.5, 7.11.2.6.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left
+// of |dst|. top-left and bottom-left may be accessed.
+// The pointer arguments do not alias one another.
+using IntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
+                                    const void* top, const void* left);
+using IntraPredictorFuncs =
+    IntraPredictorFunc[kNumTransformSizes][kNumIntraPredictors];
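+
+// Illustrative lookup (a sketch; the containing Dsp struct member is assumed
+// to be named |intra_predictors| as elsewhere in libgav1):
+//   dsp->intra_predictors[tx_size][kIntraPredictorDc](dst, stride, top, left);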
+
+// Directional intra predictor function signature, zone 1 (0 < angle < 90).
+// Section 7.11.2.4 (#7).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |width| and |height| give the dimensions of the block.
+// |xstep| is the scaled starting index to |top| from
+// kDirectionalIntraPredictorDerivative. |upsampled_top| indicates whether
+// |top| has been upsampled as described in '7.11.2.11. Intra edge upsample
+// process'. This can occur in cases with |width| + |height| <= 16. top-right
+// is accessed.
+// The pointer arguments do not alias one another.
+using DirectionalIntraPredictorZone1Func = void (*)(void* dst, ptrdiff_t stride,
+                                                    const void* top, int width,
+                                                    int height, int xstep,
+                                                    bool upsampled_top);
+
+// Directional intra predictor function signature, zone 2 (90 < angle < 180).
+// Section 7.11.2.4 (#8).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left of
+// |dst|. |width| and |height| give the dimensions of the block. |xstep| and
+// |ystep| are the scaled starting index to |top| and |left|, respectively,
+// from kDirectionalIntraPredictorDerivative. |upsampled_top| and
+// |upsampled_left| indicate whether |top| and |left| have been upsampled as
+// described in '7.11.2.11. Intra edge upsample process'. This can occur in
+// cases with |width| + |height| <= 16. top-left and upper-left are accessed,
+// up to [-2] in each if |upsampled_top/left| are set.
+// The pointer arguments do not alias one another.
+using DirectionalIntraPredictorZone2Func = void (*)(
+    void* dst, ptrdiff_t stride, const void* top, const void* left, int width,
+    int height, int xstep, int ystep, bool upsampled_top, bool upsampled_left);
+
+// Directional intra predictor function signature, zone 3 (180 < angle < 270).
+// Section 7.11.2.4 (#9).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |left| is an aligned vector of the
+// column to the left of |dst|. |width| and |height| give the dimensions of the
+// block. |ystep| is the scaled starting index to |left| from
+// kDirectionalIntraPredictorDerivative. |upsampled_left| indicates whether
+// |left| has been upsampled as described in '7.11.2.11. Intra edge upsample
+// process'. This can occur in cases with |width| + |height| <= 16. bottom-left
+// is accessed.
+// The pointer arguments do not alias one another.
+using DirectionalIntraPredictorZone3Func = void (*)(void* dst, ptrdiff_t stride,
+                                                    const void* left, int width,
+                                                    int height, int ystep,
+                                                    bool upsampled_left);
+
+// Filter intra predictor function signature. Section 7.11.2.3.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left
+// of |dst|. |width| and |height| are the size of the block in pixels.
+// The pointer arguments do not alias one another.
+using FilterIntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
+                                          const void* top, const void* left,
+                                          FilterIntraPredictor pred, int width,
+                                          int height);
+
+//------------------------------------------------------------------------------
+// Chroma from Luma (Cfl) prediction. Section 7.11.5.
+
+// Chroma from Luma (Cfl) intra prediction function signature. |dst| is an
+// unaligned pointer to the output block. Pixel size is determined by bitdepth
+// with |stride| given in bytes. |luma| contains subsampled luma pixels with 3
+// fractional bits of precision. |alpha| is the signed Cfl alpha value for the
+// appropriate plane.
+using CflIntraPredictorFunc = void (*)(
+    void* dst, ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], int alpha);
+using CflIntraPredictorFuncs = CflIntraPredictorFunc[kNumTransformSizes];
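+
+// Three fractional bits means the stored luma values are scaled by 1 << 3
+// relative to pixel values.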
+
+// Chroma from Luma (Cfl) subsampler function signature. |luma| is an unaligned
+// pointer to the output block. |src| is an unaligned pointer to the input
+// block. Pixel size is determined by bitdepth with |stride| given in bytes.
+using CflSubsamplerFunc =
+    void (*)(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+             int max_luma_width, int max_luma_height, const void* source,
+             ptrdiff_t stride);
+using CflSubsamplerFuncs =
+    CflSubsamplerFunc[kNumTransformSizes][kNumSubsamplingTypes];
+
+//------------------------------------------------------------------------------
+// Intra Edge Filtering and Upsampling. Step 4 in section 7.11.2.4.
+
+// Intra edge filter function signature. |buffer| is a pointer to the top_row
+// or left_column that needs to be filtered. Typically the -1'th index of
+// |top_row| and |left_column| needs to be filtered as well, so the caller can
+// simply pass |buffer| as top_row[-1] or left_column[-1]. Pixel size is
+// determined by bitdepth. |size| is the number of pixels to be filtered.
+// |strength| is the filter strength. Section 7.11.2.12 in the spec.
+using IntraEdgeFilterFunc = void (*)(void* buffer, int size, int strength);
+
+// Intra edge upsampler function signature. |buffer| is a pointer to the top_row
+// or left_column that needs to be upsampled. Pixel size is determined by
+// bitdepth. |size| is the number of pixels to be upsampled; valid values are:
+// 4, 8, 12, 16. This function needs access to negative indices -1 and -2 of
+// the |buffer|. Section 7.11.2.11 in the spec.
+using IntraEdgeUpsamplerFunc = void (*)(void* buffer, int size);
+
+//------------------------------------------------------------------------------
+// Inverse transform add function signature.
+//
+// Steps 2 and 3 of section 7.12.3 (contains the implementation of section
+// 7.13.3).
+// Apply the inverse transforms and add the residual to the destination frame
+// for the transform type and block size |tx_size| starting at position
+// |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D of Pixel
+// values. |adjusted_tx_height| is the number of rows to process based on the
+// non-zero coefficient count in the block. It will be 1 (non-zero coefficient
+// count == 1), 4 or a multiple of 8 up to 32 or the original transform height,
+// whichever is less. |src_buffer| is a pointer to an Array2D of Residual
+// values. On input |src_buffer| contains the dequantized values, on output it
+// contains the residual.
+// The pointer arguments do not alias one another.
+using InverseTransformAddFunc = void (*)(TransformType tx_type,
+                                         TransformSize tx_size,
+                                         int adjusted_tx_height,
+                                         void* src_buffer, int start_x,
+                                         int start_y, void* dst_frame);
+// The final dimension holds row and column transforms indexed with kRow and
+// kColumn.
+using InverseTransformAddFuncs =
+    InverseTransformAddFunc[kNumTransform1ds][kNumTransform1dSizes][2];
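+
+// A minimal dispatch sketch (hypothetical helper): selecting the row pass of
+// an 8-point Dct. kTransform1dDct and kTransform1dSize8 are assumed to be the
+// 1D transform/size enum values, and kRow the row index mentioned above.
+inline InverseTransformAddFunc SelectDctRowTransformSketch(
+    const InverseTransformAddFuncs& inverse_transforms) {
+  return inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow];
+}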
+
+//------------------------------------------------------------------------------
+// Post processing.
+
+// Loop filter function signature. Section 7.14.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes.
+// <threshold param> <spec name> <range>
+// |outer_thresh|    blimit      [7, 193]
+// |inner_thresh|    limit       [1, 63]
+// |hev_thresh|      thresh      [0, 63]
+// These are scaled by the implementation by 'bitdepth - 8' to produce
+// the spec variables blimitBd, limitBd and threshBd.
+// Note these functions are not called when the loop filter level is 0.
+using LoopFilterFunc = void (*)(void* dst, ptrdiff_t stride, int outer_thresh,
+                                int inner_thresh, int hev_thresh);
+using LoopFilterFuncs =
+    LoopFilterFunc[kNumLoopFilterSizes][kNumLoopFilterTypes];
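+
+// A minimal sketch of the scaling described above: a caller-side threshold
+// maps to its spec counterpart (blimitBd, limitBd, threshBd) by a left shift.
+inline int ScaleLoopFilterThresholdSketch(int threshold, int bitdepth) {
+  // E.g. an |outer_thresh| of 193 becomes a blimitBd of 772 at 10 bpp.
+  return threshold << (bitdepth - 8);
+}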
+
+// Cdef direction function signature. Section 7.15.2.
+// |src| is a pointer to the source block. Pixel size is determined by bitdepth
+// with |stride| given in bytes. |direction| and |variance| are output
+// parameters and must not be nullptr.
+// The pointer arguments do not alias one another.
+using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
+                                   uint8_t* direction, int* variance);
+
+// Cdef filtering function signature. Section 7.15.3.
+// |source| is a pointer to the input block padded with kCdefLargeValue if at a
+// frame border. |source_stride| is given in units of uint16_t.
+// |block_width|, |block_height| are the width/height of the input block.
+// |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering
+// parameters.
+// |direction| is the filtering direction.
+// |dest| is the output buffer. |dest_stride| is given in bytes.
+// The pointer arguments do not alias one another.
+using CdefFilteringFunc = void (*)(const uint16_t* source,
+                                   ptrdiff_t source_stride, int block_height,
+                                   int primary_strength, int secondary_strength,
+                                   int damping, int direction, void* dest,
+                                   ptrdiff_t dest_stride);
+
+// The first index is block width: [0]: 4, [1]: 8. The second is based on
+// non-zero strengths: [0]: |primary_strength| and |secondary_strength|, [1]:
+// |primary_strength| only, [2]: |secondary_strength| only.
+using CdefFilteringFuncs = CdefFilteringFunc[2][3];
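+
+// A minimal index-selection sketch (hypothetical helper) for the table above,
+// assuming at least one strength is non-zero (Cdef is skipped otherwise).
+inline CdefFilteringFunc SelectCdefFilterSketch(
+    const CdefFilteringFuncs& funcs, int block_width, int primary_strength,
+    int secondary_strength) {
+  const int width_index = (block_width == 8) ? 1 : 0;
+  // [0]: both strengths non-zero, [1]: primary only, [2]: secondary only.
+  const int strength_index =
+      (secondary_strength == 0) ? 1 : (primary_strength == 0) ? 2 : 0;
+  return funcs[width_index][strength_index];
+}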
+
+// Upscaling coefficients function signature. Section 7.16.
+// This is an auxiliary function for SIMD optimizations and has no corresponding
+// C function. Different SIMD versions may produce different outputs, so it
+// must be paired with the corresponding version of SuperResFunc.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+// |coefficients| is the upscale filter used by each pixel in a row.
+using SuperResCoefficientsFunc = void (*)(int upscaled_width,
+                                          int initial_subpixel_x, int step,
+                                          void* coefficients);
+
+// Upscaling process function signature. Section 7.16.
+// |coefficients| is the upscale filter used by each pixel in a row. It is not
+// used by the C function.
+// |source| is the input frame buffer. It will be line extended.
+// |source_stride| is given in pixels.
+// |dest| is the output buffer.
+// |dest_stride| is given in pixels.
+// |height| is the height of the block to be processed.
+// |downscaled_width| is the width of the input frame.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+// The pointer arguments do not alias one another.
+using SuperResFunc = void (*)(const void* coefficients, void* source,
+                              ptrdiff_t source_stride, int height,
+                              int downscaled_width, int upscaled_width,
+                              int initial_subpixel_x, int step, void* dest,
+                              ptrdiff_t dest_stride);
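+
+// A minimal sketch of the subpixel walk implied by |initial_subpixel_x| and
+// |step|: the source position for destination column |x| advances by |step|
+// subpixels per output pixel. The implementation derives the integer source
+// column and the filter phase from this position.
+inline int SuperResSourcePositionSketch(int initial_subpixel_x, int step,
+                                        int x) {
+  return initial_subpixel_x + x * step;
+}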
+
+// Loop restoration function signature. Sections 7.16, 7.17.
+// |restoration_info| contains loop restoration information, such as filter
+// type, strength.
+// |source| is the input frame buffer, which is deblocked and cdef filtered.
+// |top_border| and |bottom_border| are the top and bottom borders.
+// |dest| is the output.
+// |stride| is given in pixels, and shared by |source| and |dest|.
+// |top_border_stride| and |bottom_border_stride| are given in pixels.
+// |restoration_buffer| contains buffers required for self guided filter and
+// wiener filter. They must be initialized before calling.
+// The pointer arguments do not alias one another.
+using LoopRestorationFunc = void (*)(
+    const RestorationUnitInfo& restoration_info, const void* source,
+    ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride,
+    const void* bottom_border, ptrdiff_t bottom_border_stride, int width,
+    int height, RestorationBuffer* restoration_buffer, void* dest);
+
+// Index 0 is Wiener Filter.
+// Index 1 is Self Guided Restoration Filter.
+// This can be accessed as LoopRestorationType - 2.
+using LoopRestorationFuncs = LoopRestorationFunc[2];
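+
+// A minimal dispatch sketch (hypothetical helper), assuming |type| is the
+// LoopRestorationType enum value for the Wiener (2) or self guided (3) filter.
+inline LoopRestorationFunc SelectLoopRestorationSketch(
+    const LoopRestorationFuncs& loop_restorations, int type) {
+  // Wiener is stored at index 0, self guided restoration at index 1.
+  return loop_restorations[type - 2];
+}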
+
+// Convolve function signature. Section 7.11.3.4.
+// This function applies a horizontal filter followed by a vertical filter.
+// |reference| is the input block (reference frame buffer). |reference_stride|
+// is the corresponding frame stride.
+// |vertical_filter_index|/|horizontal_filter_index| is the index to
+// retrieve the type of filter to be applied for vertical/horizontal direction
+// from the filter lookup table 'kSubPixelFilters'.
+// |horizontal_filter_id| and |vertical_filter_id| are the filter ids.
+// |width| and |height| are width and height of the block to be filtered.
+// |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in
+// x/y direction.
+// |prediction| is the output block (output frame buffer).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
+// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
+// be used.
+// The pointer arguments do not alias one another.
+using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride,
+                              int horizontal_filter_index,
+                              int vertical_filter_index,
+                              int horizontal_filter_id, int vertical_filter_id,
+                              int width, int height, void* prediction,
+                              ptrdiff_t pred_stride);
+
+// Convolve functions signature. Each points to one convolve function with
+// a specific setting:
+// ConvolveFunc[is_intra_block_copy][is_compound][has_vertical_filter]
+// [has_horizontal_filter].
+// If is_compound is false, the prediction is clipped to Pixel.
+// If is_compound is true, the range of prediction is:
+//   8bpp:  [-5132,  9212] (int16_t)
+//   10bpp: [ 3988, 61532] (uint16_t)
+//   12bpp: [ 3974, 61559] (uint16_t)
+// See src/dsp/convolve.cc
+using ConvolveFuncs = ConvolveFunc[2][2][2][2];
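+
+// A minimal dispatch sketch (hypothetical helper) for the four-dimensional
+// table above; the boolean flags convert directly to table indices.
+inline ConvolveFunc SelectConvolveFuncSketch(const ConvolveFuncs& convolve,
+                                             bool is_intra_block_copy,
+                                             bool is_compound,
+                                             bool has_vertical_filter,
+                                             bool has_horizontal_filter) {
+  return convolve[is_intra_block_copy][is_compound][has_vertical_filter]
+                 [has_horizontal_filter];
+}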
+
+// Convolve + scale function signature. Section 7.11.3.4.
+// This function applies a horizontal filter followed by a vertical filter.
+// |reference| is the input block (reference frame buffer). |reference_stride|
+// is the corresponding frame stride.
+// |vertical_filter_index|/|horizontal_filter_index| is the index to
+// retrieve the type of filter to be applied for vertical/horizontal direction
+// from the filter lookup table 'kSubPixelFilters'.
+// |subpixel_x| and |subpixel_y| are starting positions in units of 1/1024.
+// |step_x| and |step_y| are step sizes in units of 1/1024 of a pixel.
+// |width| and |height| are width and height of the block to be filtered.
+// |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in
+// x/y direction.
+// |prediction| is the output block (output frame buffer).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
+// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
+// be used.
+// The pointer arguments do not alias one another.
+using ConvolveScaleFunc = void (*)(const void* reference,
+                                   ptrdiff_t reference_stride,
+                                   int horizontal_filter_index,
+                                   int vertical_filter_index, int subpixel_x,
+                                   int subpixel_y, int step_x, int step_y,
+                                   int width, int height, void* prediction,
+                                   ptrdiff_t pred_stride);
+
+// Convolve functions signature for scaling version.
+// 0: single predictor. 1: compound predictor.
+using ConvolveScaleFuncs = ConvolveScaleFunc[2];
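+
+// A minimal sketch of the 1/1024 units described above: the source position
+// for destination column |x| starts at |subpixel_x| and advances by |step_x|.
+// The integer source column is position >> 10 and the low 10 bits select the
+// subpixel filter phase.
+inline int ConvolveScaleSourcePositionSketch(int subpixel_x, int step_x,
+                                             int x) {
+  return subpixel_x + x * step_x;
+}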
+
+// Weight mask function signature. Section 7.11.3.12.
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |width| and |height| are the prediction width and height.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |mask| is the output buffer. |mask_stride| is the output buffer stride.
+// The pointer arguments do not alias one another.
+using WeightMaskFunc = void (*)(const void* prediction_0,
+                                const void* prediction_1, uint8_t* mask,
+                                ptrdiff_t mask_stride);
+
+// Weight mask functions signature. The dimensions (in order) are:
+//   * Width index (8 => 0, 16 => 1, 32 => 2 and so on).
+//   * Height index (8 => 0, 16 => 1, 32 => 2 and so on).
+//   * mask_is_inverse.
+using WeightMaskFuncs = WeightMaskFunc[6][6][2];
+
+// Average blending function signature.
+// Two predictors are averaged to generate the output.
+// Input predictor values are int16_t (uint16_t for bitdepth > 8). The output
+// is written as Pixel values, within the valid Pixel range.
+// Average blending is in the bottom of Section 7.11.3.1 (COMPOUND_AVERAGE).
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |width| and |height| are the same for the first and second input blocks.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |dest| is the output buffer. |dest_stride| is the output buffer stride.
+// The pointer arguments do not alias one another.
+using AverageBlendFunc = void (*)(const void* prediction_0,
+                                  const void* prediction_1, int width,
+                                  int height, void* dest,
+                                  ptrdiff_t dest_stride);
+
+// Distance weighted blending function signature.
+// Weights are generated in Section 7.11.3.15.
+// Weighted blending is in the bottom of Section 7.11.3.1 (COMPOUND_DISTANCE).
+// This function takes two blocks (inter frame prediction) and produces a
+// weighted output.
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |weight_0| is the weight for the first block. It is derived from the relative
+// distance of the first reference frame and the current frame.
+// |weight_1| is the weight for the second block. It is derived from the
+// relative distance of the second reference frame and the current frame.
+// |width| and |height| are the same for the first and second input blocks.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |dest| is the output buffer. |dest_stride| is the output buffer stride.
+// The pointer arguments do not alias one another.
+using DistanceWeightedBlendFunc = void (*)(const void* prediction_0,
+                                           const void* prediction_1,
+                                           uint8_t weight_0, uint8_t weight_1,
+                                           int width, int height, void* dest,
+                                           ptrdiff_t dest_stride);
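+
+// A minimal sketch of the weighting itself for a single 8bpp sample pair,
+// ignoring the compound prediction offset and rounding details the real
+// implementations apply. The AV1 distance weight pairs sum to 16, hence the
+// normalizing right shift by 4.
+inline int DistanceWeightedSampleSketch(int pred_0, int pred_1, int weight_0,
+                                        int weight_1) {
+  return (pred_0 * weight_0 + pred_1 * weight_1 + 8) >> 4;
+}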
+
+// Mask blending function signature. Section 7.11.3.14.
+// This function takes two blocks and produces a blended output stored into the
+// output block |dest|. The blending is a weighted average process, controlled
+// by values of the mask.
+// |prediction_0| is the first input block. When prediction mode is inter_intra
+// (or wedge_inter_intra), this refers to the inter frame prediction. It is
+// int16_t* when bitdepth == 8 and uint16_t* otherwise.
+// The stride for |prediction_0| is equal to |width|.
+// |prediction_1| is the second input block. When prediction mode is inter_intra
+// (or wedge_inter_intra), this refers to the intra frame prediction and uses
+// Pixel values. This function handles the inter_intra case only when
+// bitdepth >= 10; for bitdepth == 8 see InterIntraMaskBlendFunc8bpp below.
+// It is int16_t* when bitdepth == 8 and uint16_t* otherwise.
+// |prediction_stride_1| is the stride, given in units of [u]int16_t. When
+// |is_inter_intra| is false (compound prediction) then |prediction_stride_1| is
+// equal to |width|.
+// |mask| is an integer array, whose value indicates the weight of the blending.
+// |mask_stride| is corresponding stride.
+// |width|, |height| are the same for both input blocks.
+// If it's inter_intra (or wedge_inter_intra), the valid range of block size is
+// [8x8, 32x32], no 4:1/1:4 blocks (Section 5.11.28). Otherwise (including
+// difference weighted prediction and compound average prediction), the valid
+// range is [8x8, 128x128].
+// If there's subsampling, the corresponding width and height are halved for
+// chroma planes.
+// |is_inter_intra| stands for the prediction mode. If it is true, one of the
+// prediction blocks is from intra prediction of current frame. Otherwise, two
+// prediction blocks are both inter frame predictions.
+// |is_wedge_inter_intra| indicates if the mask is for the wedge prediction.
+// |dest| is the output block.
+// |dest_stride| is the corresponding stride for dest.
+// The pointer arguments do not alias one another.
+using MaskBlendFunc = void (*)(const void* prediction_0,
+                               const void* prediction_1,
+                               ptrdiff_t prediction_stride_1,
+                               const uint8_t* mask, ptrdiff_t mask_stride,
+                               int width, int height, void* dest,
+                               ptrdiff_t dest_stride);
+
+// Mask blending functions signature. Each points to one function with
+// a specific setting:
+// MaskBlendFunc[subsampling_x + subsampling_y][is_inter_intra].
+using MaskBlendFuncs = MaskBlendFunc[3][2];
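+
+// A minimal dispatch sketch (hypothetical helper) for the indexing above:
+// 4:4:4 maps to 0, 4:2:2 to 1 and 4:2:0 to 2 in the first dimension.
+inline MaskBlendFunc SelectMaskBlendSketch(const MaskBlendFuncs& mask_blend,
+                                           int subsampling_x,
+                                           int subsampling_y,
+                                           bool is_inter_intra) {
+  return mask_blend[subsampling_x + subsampling_y][is_inter_intra];
+}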
+
+// This function is similar to the MaskBlendFunc. It is only used when
+// |is_inter_intra| is true and |bitdepth| == 8.
+// |prediction_[01]| are Pixel values (uint8_t).
+// |prediction_1| is also the output buffer.
+// The pointer arguments do not alias one another.
+using InterIntraMaskBlendFunc8bpp = void (*)(const uint8_t* prediction_0,
+                                             uint8_t* prediction_1,
+                                             ptrdiff_t prediction_stride_1,
+                                             const uint8_t* mask,
+                                             ptrdiff_t mask_stride, int width,
+                                             int height);
+
+// InterIntra8bpp mask blending functions signature. When is_wedge_inter_intra
+// is false, the function at index 0 must be used. Otherwise, the function at
+// index subsampling_x + subsampling_y must be used.
+using InterIntraMaskBlendFuncs8bpp = InterIntraMaskBlendFunc8bpp[3];
+
+// Obmc (overlapped block motion compensation) blending function signature.
+// Section 7.11.3.10.
+// This function takes two blocks and produces a blended output stored into the
+// first input block. The blending is a weighted average process, controlled by
+// values of the mask.
+// Obmc is not a compound mode. It is different from other compound blending,
+// in terms of precision. The current block is computed using convolution with
+// clipping to the range of pixel values. Its above and left blocks are also
+// clipped. Therefore obmc blending process doesn't need to clip the output.
+// |prediction| is the first input block, which will be overwritten.
+// |prediction_stride| is the stride, given in bytes.
+// |width|, |height| are the same for both input blocks. The range is [4x2,
+// 32x32] for kObmcDirectionVertical and [2x4, 32x32] for
+// kObmcDirectionHorizontal, see Section 7.11.3.9.
+// |obmc_prediction| is the second input block.
+// |obmc_prediction_stride| is its stride, given in bytes.
+// The pointer arguments do not alias one another.
+using ObmcBlendFunc = void (*)(void* prediction, ptrdiff_t prediction_stride,
+                               int width, int height,
+                               const void* obmc_prediction,
+                               ptrdiff_t obmc_prediction_stride);
+using ObmcBlendFuncs = ObmcBlendFunc[kNumObmcDirections];
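+
+// A minimal dispatch sketch (hypothetical helper) for the direction-indexed
+// table above.
+inline ObmcBlendFunc SelectObmcBlendSketch(const ObmcBlendFuncs& obmc_blend,
+                                           ObmcDirection direction) {
+  return obmc_blend[direction];
+}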
+
+// Warp function signature. Section 7.11.3.5.
+// This function applies warp filtering for each 8x8 block inside the current
+// coding block. The filtering process is similar to 2d convolve filtering.
+// The horizontal filter is applied followed by the vertical filter.
+// The function has to calculate corresponding pixel positions before and
+// after warping.
+// |source| is the input reference frame buffer.
+// |source_stride|, |source_width|, |source_height| are corresponding frame
+// stride, width, and height. |source_stride| is given in bytes.
+// |warp_params| is the matrix of warp motion: warp_params[i] = mN.
+//         [x'     (m2 m3 m0   [x
+//     z .  y'  =   m4 m5 m1 *  y
+//          1]      m6 m7 1)    1]
+// |subsampling_x/y| is the current frame's plane subsampling factor.
+// |block_start_x| and |block_start_y| are the starting position the current
+// coding block.
+// |block_width| and |block_height| are width and height of the current coding
+// block. |block_width| and |block_height| are at least 8.
+// |alpha|, |beta|, |gamma|, |delta| are valid warp parameters. See the
+// comments in the definition of struct GlobalMotion for the range of their
+// values.
+// |dest| is the output buffer of type Pixel. The output values are clipped to
+// Pixel values.
+// |dest_stride| is the stride, in units of bytes.
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For vertical filtering kInterRoundBitsVertical &
+// kInterRoundBitsVertical12bpp will be used.
+//
+// NOTE: WarpFunc assumes the source frame has left, right, top, and bottom
+// borders that extend the frame boundary pixels.
+// * The left and right borders must be at least 13 pixels wide. In addition,
+//   Warp_NEON() may read up to 14 bytes after a row in the |source| buffer.
+//   Therefore, there must be at least one extra padding byte after the right
+//   border of the last row in the source buffer.
+// * The top and bottom borders must be at least 13 pixels high.
+// The pointer arguments do not alias one another.
+using WarpFunc = void (*)(const void* source, ptrdiff_t source_stride,
+                          int source_width, int source_height,
+                          const int* warp_params, int subsampling_x,
+                          int subsampling_y, int block_start_x,
+                          int block_start_y, int block_width, int block_height,
+                          int16_t alpha, int16_t beta, int16_t gamma,
+                          int16_t delta, void* dest, ptrdiff_t dest_stride);
+
+// Warp for compound predictions. Section 7.11.3.5.
+// Similar to WarpFunc, but |dest| is a uint16_t predictor buffer,
+// |dest_stride| is given in units of uint16_t and |inter_round_bits_vertical|
+// is always 7 (kInterRoundBitsCompoundVertical).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For vertical filtering kInterRoundBitsCompoundVertical will be used.
+using WarpCompoundFunc = WarpFunc;
+
+constexpr int kNumAutoRegressionLags = 4;
+// Applies an auto-regressive filter to the white noise in |luma_grain_buffer|.
+// Section 7.18.3.3, second code block.
+// |params| are parameters read from frame header, mainly providing
+// auto_regression_coeff_y for the filter and auto_regression_shift to right
+// shift the filter sum by. Note: this function must not be called when
+// params.auto_regression_coeff_lag is 0; a lag of 0 makes the filter the
+// identity, so it can simply be skipped.
+using LumaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
+                                        void* luma_grain_buffer);
+// Function index is auto_regression_coeff_lag - 1.
+using LumaAutoRegressionFuncs =
+    LumaAutoRegressionFunc[kNumAutoRegressionLags - 1];
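+
+// A minimal dispatch sketch (hypothetical helper), assuming the caller has
+// already verified that params.auto_regression_coeff_lag is non-zero.
+inline LumaAutoRegressionFunc SelectLumaAutoRegressionSketch(
+    const LumaAutoRegressionFuncs& funcs, int auto_regression_coeff_lag) {
+  return funcs[auto_regression_coeff_lag - 1];
+}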
+
+// Applies an auto-regressive filter to the white noise in u_grain and v_grain.
+// Section 7.18.3.3, third code block.
+// The |luma_grain_buffer| provides samples that are added to the autoregressive
+// sum when num_y_points > 0.
+// |u_grain_buffer| and |v_grain_buffer| point to the buffers of chroma noise
+// that were generated from the stored Gaussian sequence, and are overwritten
+// with the results of the autoregressive filter. |params| are parameters read
+// from frame header, mainly providing auto_regression_coeff_u and
+// auto_regression_coeff_v for each chroma plane's filter, and
+// auto_regression_shift to right shift the filter sums by.
+// The pointer arguments do not alias one another.
+using ChromaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
+                                          const void* luma_grain_buffer,
+                                          int subsampling_x, int subsampling_y,
+                                          void* u_grain_buffer,
+                                          void* v_grain_buffer);
+using ChromaAutoRegressionFuncs =
+    ChromaAutoRegressionFunc[/*use_luma*/ 2][kNumAutoRegressionLags];
+
+// Build an image-wide "stripe" of grain noise for every 32 rows in the image.
+// Section 7.18.3.5, first code block.
+// Each 32x32 luma block is copied at a random offset specified via
+// |grain_seed| from the grain template produced by autoregression, and the same
+// is done for chroma grains, subject to subsampling.
+// |width| and |height| are the dimensions of the overall image.
+// |noise_stripes_buffer| points to an Array2DView with one row for each stripe.
+// Because this function treats all planes identically and independently, it is
+// simplified to take one grain buffer at a time. This means duplicating some
+// random number generations, but that work can be reduced in other ways.
+// The pointer arguments do not alias one another.
+using ConstructNoiseStripesFunc = void (*)(const void* grain_buffer,
+                                           int grain_seed, int width,
+                                           int height, int subsampling_x,
+                                           int subsampling_y,
+                                           void* noise_stripes_buffer);
+using ConstructNoiseStripesFuncs =
+    ConstructNoiseStripesFunc[/*overlap_flag*/ 2];
+
+// Compute the one or two overlap rows for each stripe copied to the noise
+// image.
+// Section 7.18.3.5, second code block. |width| and |height| are the
+// dimensions of the overall image. |noise_stripes_buffer| points to an
+// Array2DView with one row for each stripe. |noise_image_buffer| points to an
+// Array2D containing the allocated plane for this frame. Because this function
+// treats all planes identically and independently, it is simplified to take one
+// grain buffer at a time.
+// The pointer arguments do not alias one another.
+using ConstructNoiseImageOverlapFunc =
+    void (*)(const void* noise_stripes_buffer, int width, int height,
+             int subsampling_x, int subsampling_y, void* noise_image_buffer);
+
+// Populate a scaling lookup table with interpolated values of a piecewise
+// linear function where values in |point_value| are mapped to the values in
+// |point_scaling|.
+// |num_points| can be between 0 and 15. When 0, the lookup table is set to
+// zero.
+// |point_value| and |point_scaling| have |num_points| valid elements.
+// The pointer arguments do not alias one another.
+using InitializeScalingLutFunc = void (*)(int num_points,
+                                          const uint8_t point_value[],
+                                          const uint8_t point_scaling[],
+                                          int16_t* scaling_lut,
+                                          const int scaling_lut_length);
+
+// Blend noise with image. Section 7.18.3.5, third code block.
+// |width| is the width of each row, while |height| is how many rows to compute.
+// |start_height| is an offset for the noise image, to support multithreading.
+// |min_value|, |max_luma|, and |max_chroma| are computed by the caller of these
+// functions, according to the code in the spec.
+// |source_plane_y| and |source_plane_uv| are the plane buffers of the decoded
+// frame. They are blended with the film grain noise and written to
+// |dest_plane_y| and |dest_plane_uv| as final output for display.
+// source_plane_* and dest_plane_* may point to the same buffer, in which case
+// the film grain noise is added in place.
+// |scaling_lut_y| and |scaling_lut| represent a piecewise linear mapping from
+// the frame's raw pixel value to a scaling factor for the noise sample.
+// |scaling_shift| is applied as a right shift after scaling, so that scaling
+// down is possible. It is found in FilmGrainParams, but supplied directly to
+// BlendNoiseWithImageLumaFunc because it's the only member used.
+// The dest plane may point to the source plane, depending on the value of
+// frame_header.show_existing_frame. |noise_image_ptr| and scaling_lut.* do not
+// alias other arguments.
+using BlendNoiseWithImageLumaFunc = void (*)(
+    const void* noise_image_ptr, int min_value, int max_value,
+    int scaling_shift, int width, int height, int start_height,
+    const int16_t* scaling_lut_y, const void* source_plane_y,
+    ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y);
+
+using BlendNoiseWithImageChromaFunc = void (*)(
+    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+    int min_value, int max_value, int width, int height, int start_height,
+    int subsampling_x, int subsampling_y, const int16_t* scaling_lut,
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv);
+
+using BlendNoiseWithImageChromaFuncs =
+    BlendNoiseWithImageChromaFunc[/*chroma_scaling_from_luma*/ 2];
+
+//------------------------------------------------------------------------------
+
+struct FilmGrainFuncs {
+  LumaAutoRegressionFuncs luma_auto_regression;
+  ChromaAutoRegressionFuncs chroma_auto_regression;
+  ConstructNoiseStripesFuncs construct_noise_stripes;
+  ConstructNoiseImageOverlapFunc construct_noise_image_overlap;
+  InitializeScalingLutFunc initialize_scaling_lut;
+  BlendNoiseWithImageLumaFunc blend_noise_luma;
+  BlendNoiseWithImageChromaFuncs blend_noise_chroma;
+};
+
+// Motion field projection function signature. Section 7.9.
+// |reference_info| provides reference information for motion field projection.
+// |reference_to_current_with_sign| is the precalculated reference frame id
+// distance from current frame.
+// |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others.
+// |y8_start| and |y8_end| are the start and end 8x8 rows of the current tile.
+// |x8_start| and |x8_end| are the start and end 8x8 columns of the current
+// tile.
+// |motion_field| is the output which saves the projected motion field
+// information.
+// Note: Only the entry from the 8-bit Dsp table is used as this function is
+// bitdepth agnostic.
+using MotionFieldProjectionKernelFunc = void (*)(
+    const ReferenceInfo& reference_info, int reference_to_current_with_sign,
+    int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end,
+    TemporalMotionField* motion_field);
+
+// Compound temporal motion vector projection function signature.
+// Section 7.9.3 and 7.10.2.10.
+// |temporal_mvs| is the aligned set of temporal reference motion vectors.
+// |temporal_reference_offsets| specifies the number of frames covered by the
+// original motion vector.
+// |reference_offsets| specifies the number of frames to be covered by the
+// projected motion vector.
+// |count| is the number of the temporal motion vectors.
+// |candidate_mvs| is the aligned set of projected motion vectors.
+// The pointer arguments do not alias one another.
+// Note: Only the entry from the 8-bit Dsp table is used as this function is
+// bitdepth agnostic.
+using MvProjectionCompoundFunc = void (*)(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offsets[2], int count,
+    CompoundMotionVector* candidate_mvs);
+
+// Single temporal motion vector projection function signature.
+// Section 7.9.3 and 7.10.2.10.
+// |temporal_mvs| is the aligned set of temporal reference motion vectors.
+// |temporal_reference_offsets| specifies the number of frames covered by the
+// original motion vector.
+// |reference_offset| specifies the number of frames to be covered by the
+// projected motion vector.
+// |count| is the number of the temporal motion vectors.
+// |candidate_mvs| is the aligned set of projected motion vectors.
+// The pointer arguments do not alias one another.
+// Note: Only the entry from the 8-bit Dsp table is used as this function is
+// bitdepth agnostic.
+using MvProjectionSingleFunc = void (*)(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    int reference_offset, int count, MotionVector* candidate_mvs);
+
+struct Dsp {
+  AverageBlendFunc average_blend;
+  CdefDirectionFunc cdef_direction;
+  CdefFilteringFuncs cdef_filters;
+  CflIntraPredictorFuncs cfl_intra_predictors;
+  CflSubsamplerFuncs cfl_subsamplers;
+  ConvolveFuncs convolve;
+  ConvolveScaleFuncs convolve_scale;
+  DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1;
+  DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2;
+  DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3;
+  DistanceWeightedBlendFunc distance_weighted_blend;
+  FilmGrainFuncs film_grain;
+  FilterIntraPredictorFunc filter_intra_predictor;
+  InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp;
+  IntraEdgeFilterFunc intra_edge_filter;
+  IntraEdgeUpsamplerFunc intra_edge_upsampler;
+  IntraPredictorFuncs intra_predictors;
+  InverseTransformAddFuncs inverse_transforms;
+  LoopFilterFuncs loop_filters;
+  LoopRestorationFuncs loop_restorations;
+  MaskBlendFuncs mask_blend;
+  MotionFieldProjectionKernelFunc motion_field_projection_kernel;
+  MvProjectionCompoundFunc mv_projection_compound[3];
+  MvProjectionSingleFunc mv_projection_single[3];
+  ObmcBlendFuncs obmc_blend;
+  SuperResCoefficientsFunc super_res_coefficients;
+  SuperResFunc super_res;
+  WarpCompoundFunc warp_compound;
+  WarpFunc warp;
+  WeightMaskFuncs weight_mask;
+};
+
+// Initializes function pointers based on build config and runtime
+// environment. Must be called once before first use. This function is
+// thread-safe.
+void DspInit();
+
+// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
+// exist.
+const Dsp* GetDspTable(int bitdepth);
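+
+// A minimal end-to-end usage sketch (hypothetical helper): initialize the
+// tables, fetch the table for the stream's bitdepth, then invoke an entry.
+// average_blend is used here because it is populated for every bitdepth.
+inline bool AverageBlendSketch(int bitdepth, const void* prediction_0,
+                               const void* prediction_1, int width, int height,
+                               void* dest, ptrdiff_t dest_stride) {
+  // DspInit() is thread-safe and required once before first use (see above).
+  DspInit();
+  const Dsp* const dsp = GetDspTable(bitdepth);
+  if (dsp == nullptr) return false;
+  dsp->average_blend(prediction_0, prediction_1, width, height, dest,
+                     dest_stride);
+  return true;
+}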
+
+}  // namespace dsp
+
+namespace dsp_internal {
+
+// Visual Studio builds don't have a way to detect SSE4_1. Only exclude the C
+// functions if /arch:AVX2 is used across all sources.
+#if !LIBGAV1_TARGETING_AVX2 && \
+    (defined(_MSC_VER) || (defined(_M_IX86) || defined(_M_X64)))
+#undef LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 1
+#endif
+
+// Returns true if a more highly optimized version of |func| is not defined for
+// the associated bitdepth or if it is forcibly enabled with
+// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS. The define checked for |func| corresponds
+// to the LIBGAV1_Dsp<bitdepth>bpp_|func| define in the header file associated
+// with the module.
+// |func| is one of:
+//   - FunctionName, e.g., SelfGuidedFilter.
+//   - [sub-table-index1][...-indexN] e.g.,
+//     TransformSize4x4_IntraPredictorDc. The indices correspond to enum values
+//     used as lookups with leading 'k' removed.
+//
+//  NEON support is the only extension available for ARM and it is always
+//  required. Because of this restriction DSP_ENABLED_8BPP_NEON(func) is always
+//  true and can be omitted.
+#define DSP_ENABLED_8BPP_AVX2(func)    \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_10BPP_AVX2(func)   \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_8BPP_SSE4_1(func)  \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
+#define DSP_ENABLED_10BPP_SSE4_1(func) \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)
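+
+// A minimal sketch of how a SIMD Init function would use these macros to
+// install an entry (AverageBlend_SSE4_1 is a hypothetical function name):
+//   void Init8bpp() {
+//     dsp::Dsp* const dsp = GetWritableDspTable(kBitdepth8);
+//     assert(dsp != nullptr);
+//   #if DSP_ENABLED_8BPP_SSE4_1(AverageBlend)
+//     dsp->average_blend = AverageBlend_SSE4_1;
+//   #endif
+//   }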
+
+// Initializes C-only function pointers. Note some entries may be set to
+// nullptr if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is not defined. This is meant
+// for use in tests only; it is not thread-safe.
+void DspInit_C();
+
+// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
+// exist. This version is meant for use by test or dsp/*Init() functions only.
+dsp::Dsp* GetWritableDspTable(int bitdepth);
+
+}  // namespace dsp_internal
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_DSP_H_
diff --git a/src/dsp/dsp_test.cc b/src/dsp/dsp_test.cc
new file mode 100644 (file)
index 0000000..6d2817b
--- /dev/null
@@ -0,0 +1,272 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/dsp.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#include "absl/strings/str_cat.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#include "tests/utils.h"
+#endif
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Maps 1D transform to the maximum valid size for the corresponding transform.
+constexpr int kMaxTransform1dSize[kNumTransform1ds] = {
+    kTransform1dSize64,  // Dct.
+    kTransform1dSize16,  // Adst.
+    kTransform1dSize32,  // Identity.
+    kTransform1dSize4,   // Wht.
+};
+
+void CheckTables(bool c_only) {
+#if LIBGAV1_MAX_BITDEPTH == 12
+  static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10, kBitdepth12};
+#elif LIBGAV1_MAX_BITDEPTH >= 10
+  static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10};
+#else
+  static constexpr int kBitdepths[] = {kBitdepth8};
+#endif
+
+  for (const auto& bitdepth : kBitdepths) {
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    SCOPED_TRACE(absl::StrCat("bitdepth: ", bitdepth));
+    for (int i = 0; i < kNumTransformSizes; ++i) {
+      for (int j = 0; j < kNumIntraPredictors; ++j) {
+        EXPECT_NE(dsp->intra_predictors[i][j], nullptr)
+            << "index [" << i << "][" << j << "]";
+      }
+    }
+    EXPECT_NE(dsp->directional_intra_predictor_zone1, nullptr);
+    EXPECT_NE(dsp->directional_intra_predictor_zone2, nullptr);
+    EXPECT_NE(dsp->directional_intra_predictor_zone3, nullptr);
+    EXPECT_NE(dsp->filter_intra_predictor, nullptr);
+    for (int i = 0; i < kNumTransformSizes; ++i) {
+      if (std::max(kTransformWidth[i], kTransformHeight[i]) == 64) {
+        EXPECT_EQ(dsp->cfl_intra_predictors[i], nullptr)
+            << "index [" << i << "]";
+        for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+          EXPECT_EQ(dsp->cfl_subsamplers[i][j], nullptr)
+              << "index [" << i << "][" << j << "]";
+        }
+      } else {
+        EXPECT_NE(dsp->cfl_intra_predictors[i], nullptr)
+            << "index [" << i << "]";
+        for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+          EXPECT_NE(dsp->cfl_subsamplers[i][j], nullptr)
+              << "index [" << i << "][" << j << "]";
+        }
+      }
+    }
+    EXPECT_NE(dsp->intra_edge_filter, nullptr);
+    EXPECT_NE(dsp->intra_edge_upsampler, nullptr);
+    for (int i = 0; i < kNumTransform1ds; ++i) {
+      for (int j = 0; j < kNumTransform1dSizes; ++j) {
+        for (int k = 0; k < 2; ++k) {
+          if (j <= kMaxTransform1dSize[i]) {
+            EXPECT_NE(dsp->inverse_transforms[i][j][k], nullptr)
+                << "index [" << i << "][" << j << "][" << k << "]";
+          } else {
+            EXPECT_EQ(dsp->inverse_transforms[i][j][k], nullptr)
+                << "index [" << i << "][" << j << "][" << k << "]";
+          }
+        }
+      }
+    }
+    for (int i = 0; i < kNumLoopFilterSizes; ++i) {
+      for (int j = 0; j < kNumLoopFilterTypes; ++j) {
+        EXPECT_NE(dsp->loop_filters[i][j], nullptr)
+            << "index [" << i << "][" << j << "]";
+      }
+    }
+    for (int i = 0; i < 2; ++i) {
+      EXPECT_NE(dsp->loop_restorations[i], nullptr) << "index [" << i << "]";
+    }
+
+    bool super_res_coefficients_is_nonnull = LIBGAV1_ENABLE_NEON;
+#if LIBGAV1_ENABLE_SSE4_1
+    const uint32_t cpu_features = GetCpuInfo();
+    super_res_coefficients_is_nonnull = (cpu_features & kSSE4_1) != 0;
+#endif
+    if (c_only || bitdepth == kBitdepth12) {
+      super_res_coefficients_is_nonnull = false;
+    }
+    if (super_res_coefficients_is_nonnull) {
+      EXPECT_NE(dsp->super_res_coefficients, nullptr);
+    } else {
+      EXPECT_EQ(dsp->super_res_coefficients, nullptr);
+    }
+
+    EXPECT_NE(dsp->super_res, nullptr);
+    EXPECT_NE(dsp->cdef_direction, nullptr);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        EXPECT_NE(dsp->cdef_filters[i][j], nullptr)
+            << "index [" << i << "][" << j << "]";
+      }
+    }
+    for (auto convolve_func : dsp->convolve_scale) {
+      EXPECT_NE(convolve_func, nullptr);
+    }
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 2; ++l) {
+          for (int m = 0; m < 2; ++m) {
+            if (j == 1 && k == 1) {
+              EXPECT_EQ(dsp->convolve[j][k][l][m], nullptr);
+            } else {
+              EXPECT_NE(dsp->convolve[j][k][l][m], nullptr);
+            }
+          }
+        }
+      }
+    }
+    for (const auto& m : dsp->mask_blend) {
+      for (int i = 0; i < 2; ++i) {
+        if (i == 0 || bitdepth >= 10) {
+          EXPECT_NE(m[i], nullptr);
+        } else {
+          EXPECT_EQ(m[i], nullptr);
+        }
+      }
+    }
+    for (const auto& m : dsp->inter_intra_mask_blend_8bpp) {
+      if (bitdepth == 8) {
+        EXPECT_NE(m, nullptr);
+      } else {
+        EXPECT_EQ(m, nullptr);
+      }
+    }
+    for (int i = kBlock4x4; i < kMaxBlockSizes; ++i) {
+      const int width_index = k4x4WidthLog2[i] - 1;
+      const int height_index = k4x4HeightLog2[i] - 1;
+      // Only block sizes >= 8x8 are handled with this function.
+      if (width_index < 0 || height_index < 0) continue;
+
+      for (size_t j = 0; j < 2; ++j) {
+        EXPECT_NE(dsp->weight_mask[width_index][height_index][j], nullptr)
+            << ToString(static_cast<BlockSize>(i)) << " index [" << width_index
+            << "]"
+            << "[" << height_index << "][" << j << "]";
+      }
+    }
+
+    EXPECT_NE(dsp->average_blend, nullptr);
+    EXPECT_NE(dsp->distance_weighted_blend, nullptr);
+    for (int i = 0; i < kNumObmcDirections; ++i) {
+      EXPECT_NE(dsp->obmc_blend[i], nullptr)
+          << "index [" << ToString(static_cast<ObmcDirection>(i)) << "]";
+    }
+    EXPECT_NE(dsp->warp, nullptr);
+    EXPECT_NE(dsp->warp_compound, nullptr);
+
+    for (int i = 0; i < kNumAutoRegressionLags - 1; ++i) {
+      EXPECT_NE(dsp->film_grain.luma_auto_regression[i], nullptr)
+          << "index [" << i << "]";
+    }
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < kNumAutoRegressionLags; ++j) {
+        if (i == 0 && j == 0) {
+          EXPECT_EQ(dsp->film_grain.chroma_auto_regression[i][j], nullptr)
+              << " index [" << i << "]"
+              << "[" << j << "]";
+        } else {
+          EXPECT_NE(dsp->film_grain.chroma_auto_regression[i][j], nullptr)
+              << " index [" << i << "]"
+              << "[" << j << "]";
+        }
+      }
+      EXPECT_NE(dsp->film_grain.construct_noise_stripes[i], nullptr)
+          << "index [" << i << "]";
+      EXPECT_NE(dsp->film_grain.blend_noise_chroma[i], nullptr)
+          << "index [" << i << "]";
+    }
+    EXPECT_NE(dsp->film_grain.construct_noise_image_overlap, nullptr);
+    EXPECT_NE(dsp->film_grain.initialize_scaling_lut, nullptr);
+    EXPECT_NE(dsp->film_grain.blend_noise_luma, nullptr);
+
+    if (bitdepth == 8) {
+      EXPECT_NE(dsp->motion_field_projection_kernel, nullptr);
+      EXPECT_NE(dsp->mv_projection_compound[0], nullptr);
+      EXPECT_NE(dsp->mv_projection_compound[1], nullptr);
+      EXPECT_NE(dsp->mv_projection_compound[2], nullptr);
+      EXPECT_NE(dsp->mv_projection_single[0], nullptr);
+      EXPECT_NE(dsp->mv_projection_single[1], nullptr);
+      EXPECT_NE(dsp->mv_projection_single[2], nullptr);
+    } else {
+      EXPECT_EQ(dsp->motion_field_projection_kernel, nullptr);
+      EXPECT_EQ(dsp->mv_projection_compound[0], nullptr);
+      EXPECT_EQ(dsp->mv_projection_compound[1], nullptr);
+      EXPECT_EQ(dsp->mv_projection_compound[2], nullptr);
+      EXPECT_EQ(dsp->mv_projection_single[0], nullptr);
+      EXPECT_EQ(dsp->mv_projection_single[1], nullptr);
+      EXPECT_EQ(dsp->mv_projection_single[2], nullptr);
+    }
+  }
+}
+
+TEST(Dsp, TablesArePopulated) {
+  DspInit();
+  CheckTables(/*c_only=*/false);
+}
+
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+TEST(Dsp, TablesArePopulatedCOnly) {
+  test_utils::ResetDspTable(kBitdepth8);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  test_utils::ResetDspTable(kBitdepth10);
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  test_utils::ResetDspTable(kBitdepth12);
+#endif
+  dsp_internal::DspInit_C();
+  CheckTables(/*c_only=*/true);
+}
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+TEST(Dsp, GetDspTable) {
+  EXPECT_EQ(GetDspTable(1), nullptr);
+  EXPECT_NE(GetDspTable(kBitdepth8), nullptr);
+  EXPECT_EQ(dsp_internal::GetWritableDspTable(1), nullptr);
+  EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth8), nullptr);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  EXPECT_NE(GetDspTable(kBitdepth10), nullptr);
+  EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth10), nullptr);
+#else
+  EXPECT_EQ(GetDspTable(kBitdepth10), nullptr);
+  EXPECT_EQ(dsp_internal::GetWritableDspTable(kBitdepth10), nullptr);
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  EXPECT_NE(GetDspTable(kBitdepth12), nullptr);
+  EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth12), nullptr);
+#else
+  EXPECT_EQ(GetDspTable(kBitdepth12), nullptr);
+  EXPECT_EQ(dsp_internal::GetWritableDspTable(kBitdepth12), nullptr);
+#endif
+}
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/film_grain.cc b/src/dsp/film_grain.cc
new file mode 100644 (file)
index 0000000..906230d
--- /dev/null
@@ -0,0 +1,997 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+template <int bitdepth>
+void InitializeScalingLookupTable_C(int num_points, const uint8_t point_value[],
+                                    const uint8_t point_scaling[],
+                                    int16_t* scaling_lut,
+                                    const int scaling_lut_length) {
+  if (num_points == 0) {
+    memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length);
+    return;
+  }
+  constexpr int index_shift = (bitdepth == kBitdepth10) ? 2 : 0;
+  static_assert(sizeof(scaling_lut[0]) == 2, "");
+  Memset(scaling_lut, point_scaling[0],
+         std::max(static_cast<int>(point_value[0]), 1) << index_shift);
+  for (int i = 0; i < num_points - 1; ++i) {
+    const int delta_y = point_scaling[i + 1] - point_scaling[i];
+    const int delta_x = point_value[i + 1] - point_value[i];
+    const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+    for (int x = 0; x < delta_x; ++x) {
+      const int v = point_scaling[i] + ((x * delta + 32768) >> 16);
+      assert(v >= 0 && v <= UINT8_MAX);
+      const int lut_index = (point_value[i] + x) << index_shift;
+      scaling_lut[lut_index] = v;
+    }
+  }
+  const int16_t last_point_value = point_value[num_points - 1];
+  const int x_base = last_point_value << index_shift;
+  Memset(&scaling_lut[x_base], point_scaling[num_points - 1],
+         scaling_lut_length - x_base);
+  // Fill in the gaps.
+  if (bitdepth == kBitdepth10) {
+    for (int x = 4; x < x_base + 4; x += 4) {
+      const int start = scaling_lut[x - 4];
+      const int end = scaling_lut[x];
+      const int delta = end - start;
+      scaling_lut[x - 3] = start + RightShiftWithRounding(delta, 2);
+      scaling_lut[x - 2] = start + RightShiftWithRounding(2 * delta, 2);
+      scaling_lut[x - 1] = start + RightShiftWithRounding(3 * delta, 2);
+    }
+  }
+}
+
+// Section 7.18.3.5.
+template <int bitdepth>
+int ScaleLut(const int16_t* scaling_lut, int index) {
+  if (bitdepth <= kBitdepth10) {
+    assert(index < kScalingLookupTableSize << (bitdepth - 2));
+    return scaling_lut[index];
+  }
+  // Performs a piecewise linear interpolation into the scaling table.
+  const int shift = bitdepth - kBitdepth8;
+  const int quotient = index >> shift;
+  const int remainder = index - (quotient << shift);
+  assert(quotient + 1 < kScalingLookupTableSize);
+  const int start = scaling_lut[quotient];
+  const int end = scaling_lut[quotient + 1];
+  return start + RightShiftWithRounding((end - start) * remainder, shift);
+}
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
+template <int bitdepth, typename GrainType>
+void ApplyAutoRegressiveFilterToLumaGrain_C(const FilmGrainParams& params,
+                                            void* luma_grain_buffer) {
+  auto* luma_grain = static_cast<GrainType*>(luma_grain_buffer);
+  const int grain_min = GetGrainMin<bitdepth>();
+  const int grain_max = GetGrainMax<bitdepth>();
+  const int auto_regression_coeff_lag = params.auto_regression_coeff_lag;
+  assert(auto_regression_coeff_lag > 0 && auto_regression_coeff_lag <= 3);
+  // A pictorial representation of the auto-regressive filter for various values
+  // of auto_regression_coeff_lag. The letter 'O' represents the current sample.
+  // (The filter always operates on the current sample with filter
+  // coefficient 1.) The letters 'X' represent the neighboring samples that the
+  // filter operates on.
+  //
+  // auto_regression_coeff_lag == 3:
+  //   X X X X X X X
+  //   X X X X X X X
+  //   X X X X X X X
+  //   X X X O
+  // auto_regression_coeff_lag == 2:
+  //     X X X X X
+  //     X X X X X
+  //     X X O
+  // auto_regression_coeff_lag == 1:
+  //       X X X
+  //       X O
+  // auto_regression_coeff_lag == 0:
+  //         O
+  //
+  // Note that if auto_regression_coeff_lag is 0, the filter is the identity
+  // filter and therefore can be skipped. This implementation assumes it is not
+  // called in that case.
+  const int shift = params.auto_regression_shift;
+  for (int y = kAutoRegressionBorder; y < kLumaHeight; ++y) {
+    for (int x = kAutoRegressionBorder; x < kLumaWidth - kAutoRegressionBorder;
+         ++x) {
+      int sum = 0;
+      int pos = 0;
+      int delta_row = -auto_regression_coeff_lag;
+      // The last iteration (delta_row == 0) is shorter and is handled
+      // separately.
+      do {
+        int delta_column = -auto_regression_coeff_lag;
+        do {
+          const int coeff = params.auto_regression_coeff_y[pos];
+          sum += luma_grain[(y + delta_row) * kLumaWidth + (x + delta_column)] *
+                 coeff;
+          ++pos;
+        } while (++delta_column <= auto_regression_coeff_lag);
+      } while (++delta_row < 0);
+      // Last iteration: delta_row == 0.
+      {
+        int delta_column = -auto_regression_coeff_lag;
+        do {
+          const int coeff = params.auto_regression_coeff_y[pos];
+          sum += luma_grain[y * kLumaWidth + (x + delta_column)] * coeff;
+          ++pos;
+        } while (++delta_column < 0);
+      }
+      luma_grain[y * kLumaWidth + x] = Clip3(
+          luma_grain[y * kLumaWidth + x] + RightShiftWithRounding(sum, shift),
+          grain_min, grain_max);
+    }
+  }
+}
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+          bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_C(
+    const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT luma_grain_buffer, int subsampling_x,
+    int subsampling_y, void* LIBGAV1_RESTRICT u_grain_buffer,
+    void* LIBGAV1_RESTRICT v_grain_buffer) {
+  static_assert(
+      auto_regression_coeff_lag >= 0 && auto_regression_coeff_lag <= 3,
+      "Unsupported autoregression lag for chroma.");
+  const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+  const int grain_min = GetGrainMin<bitdepth>();
+  const int grain_max = GetGrainMax<bitdepth>();
+  auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+  auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+  const int shift = params.auto_regression_shift;
+  const int chroma_height =
+      (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+  const int chroma_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  for (int y = kAutoRegressionBorder; y < chroma_height; ++y) {
+    const int luma_y =
+        ((y - kAutoRegressionBorder) << subsampling_y) + kAutoRegressionBorder;
+    for (int x = kAutoRegressionBorder;
+         x < chroma_width - kAutoRegressionBorder; ++x) {
+      int sum_u = 0;
+      int sum_v = 0;
+      int pos = 0;
+      int delta_row = -auto_regression_coeff_lag;
+      do {
+        int delta_column = -auto_regression_coeff_lag;
+        do {
+          if (delta_row == 0 && delta_column == 0) {
+            break;
+          }
+          const int coeff_u = params.auto_regression_coeff_u[pos];
+          const int coeff_v = params.auto_regression_coeff_v[pos];
+          sum_u +=
+              u_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+              coeff_u;
+          sum_v +=
+              v_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+              coeff_v;
+          ++pos;
+        } while (++delta_column <= auto_regression_coeff_lag);
+      } while (++delta_row <= 0);
+      if (use_luma) {
+        int luma = 0;
+        const int luma_x = ((x - kAutoRegressionBorder) << subsampling_x) +
+                           kAutoRegressionBorder;
+        int i = 0;
+        do {
+          int j = 0;
+          do {
+            luma += luma_grain[(luma_y + i) * kLumaWidth + (luma_x + j)];
+          } while (++j <= subsampling_x);
+        } while (++i <= subsampling_y);
+        luma = SubsampledValue(luma, subsampling_x + subsampling_y);
+        const int coeff_u = params.auto_regression_coeff_u[pos];
+        const int coeff_v = params.auto_regression_coeff_v[pos];
+        sum_u += luma * coeff_u;
+        sum_v += luma * coeff_v;
+      }
+      u_grain[y * chroma_width + x] = Clip3(
+          u_grain[y * chroma_width + x] + RightShiftWithRounding(sum_u, shift),
+          grain_min, grain_max);
+      v_grain[y * chroma_width + x] = Clip3(
+          v_grain[y * chroma_width + x] + RightShiftWithRounding(sum_v, shift),
+          grain_min, grain_max);
+    }
+  }
+}
+
+// This implementation is for the condition overlap_flag == false.
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripes_C(const void* LIBGAV1_RESTRICT grain_buffer,
+                             int grain_seed, int width, int height,
+                             int subsampling_x, int subsampling_y,
+                             void* LIBGAV1_RESTRICT noise_stripes_buffer) {
+  auto* noise_stripes =
+      static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+  const auto* grain = static_cast<const GrainType*>(grain_buffer);
+  const int half_width = DivideBy2(width + 1);
+  const int half_height = DivideBy2(height + 1);
+  assert(half_width > 0);
+  assert(half_height > 0);
+  static_assert(kLumaWidth == kMaxChromaWidth,
+                "kLumaWidth width should be equal to kMaxChromaWidth");
+  const int grain_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  constexpr int kNoiseStripeHeight = 34;
+  int luma_num = 0;
+  int y = 0;
+  do {
+    GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+    uint16_t seed = grain_seed;
+    seed ^= ((luma_num * 37 + 178) & 255) << 8;
+    seed ^= ((luma_num * 173 + 105) & 255);
+    int x = 0;
+    do {
+      const int rand = GetFilmGrainRandomNumber(8, &seed);
+      const int offset_x = rand >> 4;
+      const int offset_y = rand & 15;
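+      // The 8-bit random value supplies two 4-bit offsets that select a
+      // window within the grain template.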
+      const int plane_offset_x =
+          (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+      const int plane_offset_y =
+          (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+      int i = 0;
+      do {
+        // Section 7.18.3.5 says:
+        //   noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+        //   wide (a few additional samples across are actually written to
+        //   the array, but these are never read) ...
+        //
+        // Note: The warning in the parentheses also applies to
+        // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+        //
+        // Writes beyond the width of each row could happen below. To
+        // prevent those writes, we clip the number of pixels to copy against
+        // the remaining width.
+        const int copy_size =
+            std::min(kNoiseStripeHeight >> subsampling_x,
+                     plane_width - (x << (1 - subsampling_x)));
+        memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x))],
+               &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+               copy_size * sizeof(noise_stripe[0]));
+      } while (++i < (kNoiseStripeHeight >> subsampling_y));
+      x += 16;
+    } while (x < half_width);
+
+    ++luma_num;
+    y += 16;
+  } while (y < half_height);
+}
+
+// This implementation is for the condition overlap_flag == true.
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripesWithOverlap_C(
+    const void* LIBGAV1_RESTRICT grain_buffer, int grain_seed, int width,
+    int height, int subsampling_x, int subsampling_y,
+    void* LIBGAV1_RESTRICT noise_stripes_buffer) {
+  auto* noise_stripes =
+      static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+  const auto* grain = static_cast<const GrainType*>(grain_buffer);
+  const int half_width = DivideBy2(width + 1);
+  const int half_height = DivideBy2(height + 1);
+  assert(half_width > 0);
+  assert(half_height > 0);
+  static_assert(kLumaWidth == kMaxChromaWidth,
+                "kLumaWidth width should be equal to kMaxChromaWidth");
+  const int grain_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  constexpr int kNoiseStripeHeight = 34;
+  int luma_num = 0;
+  int y = 0;
+  do {
+    GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+    uint16_t seed = grain_seed;
+    seed ^= ((luma_num * 37 + 178) & 255) << 8;
+    seed ^= ((luma_num * 173 + 105) & 255);
+    // Begin special iteration for x == 0.
+    const int rand = GetFilmGrainRandomNumber(8, &seed);
+    const int offset_x = rand >> 4;
+    const int offset_y = rand & 15;
+    const int plane_offset_x =
+        (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+    const int plane_offset_y =
+        (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+    // The overlap computation only occurs when x > 0, so it is omitted here.
+    int i = 0;
+    do {
+      const int copy_size =
+          std::min(kNoiseStripeHeight >> subsampling_x, plane_width);
+      memcpy(&noise_stripe[i * plane_width],
+             &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+             copy_size * sizeof(noise_stripe[0]));
+    } while (++i < (kNoiseStripeHeight >> subsampling_y));
+    // End special iteration for x == 0.
+    for (int x = 16; x < half_width; x += 16) {
+      const int rand = GetFilmGrainRandomNumber(8, &seed);
+      const int offset_x = rand >> 4;
+      const int offset_y = rand & 15;
+      const int plane_offset_x =
+          (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+      const int plane_offset_y =
+          (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+      int i = 0;
+      do {
+        int j = 0;
+        int grain_sample =
+            grain[(plane_offset_y + i) * grain_width + plane_offset_x];
+        // The first pixel(s) of each segment of the noise_stripe are subject to
+        // the "overlap" computation.
+        if (subsampling_x == 0) {
+          // Corresponds to the line in the spec:
+          // if (j < 2 && x > 0)
+          // j = 0
+          int old = noise_stripe[i * plane_width + x * 2];
+          grain_sample = old * 27 + grain_sample * 17;
+          grain_sample =
+              Clip3(RightShiftWithRounding(grain_sample, 5),
+                    GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+          noise_stripe[i * plane_width + x * 2] = grain_sample;
+
+          // This check prevents overwriting for the iteration j = 1. The
+          // continue applies to the i-loop.
+          if (x * 2 + 1 >= plane_width) continue;
+          // j = 1
+          grain_sample =
+              grain[(plane_offset_y + i) * grain_width + plane_offset_x + 1];
+          old = noise_stripe[i * plane_width + x * 2 + 1];
+          grain_sample = old * 17 + grain_sample * 27;
+          grain_sample =
+              Clip3(RightShiftWithRounding(grain_sample, 5),
+                    GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+          noise_stripe[i * plane_width + x * 2 + 1] = grain_sample;
+          j = 2;
+        } else {
+          // Corresponds to the line in the spec:
+          // if (j == 0 && x > 0)
+          const int old = noise_stripe[i * plane_width + x];
+          grain_sample = old * 23 + grain_sample * 22;
+          grain_sample =
+              Clip3(RightShiftWithRounding(grain_sample, 5),
+                    GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+          noise_stripe[i * plane_width + x] = grain_sample;
+          j = 1;
+        }
+        // The following covers the rest of the loop over j as described in the
+        // spec.
+        //
+        // Section 7.18.3.5 says:
+        //   noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+        //   wide (a few additional samples across are actually written to
+        //   the array, but these are never read) ...
+        //
+        // Note: The warning in the parentheses also applies to
+        // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+        //
+        // Writes beyond the width of each row could happen below. To
+        // prevent those writes, we clip the number of pixels to copy against
+        // the remaining width.
+        const int copy_size =
+            std::min(kNoiseStripeHeight >> subsampling_x,
+                     plane_width - (x << (1 - subsampling_x))) -
+            j;
+        memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x)) + j],
+               &grain[(plane_offset_y + i) * grain_width + plane_offset_x + j],
+               copy_size * sizeof(noise_stripe[0]));
+      } while (++i < (kNoiseStripeHeight >> subsampling_y));
+    }
+
+    ++luma_num;
+    y += 16;
+  } while (y < half_height);
+}
+
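+// Blends one row of the current noise stripe with the overlapping row of the
+// previous stripe and writes the result to the noise image. The coefficient
+// pairs used by the callers, (17, 27), (27, 17) and (22, 23), each satisfy
+// a^2 + b^2 ~= 32^2 == 1024, so together with RightShiftWithRounding(grain, 5)
+// the crossfade approximately preserves the variance of the zero-mean noise.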
+template <int bitdepth, typename GrainType>
+inline void WriteOverlapLine_C(
+    const GrainType* LIBGAV1_RESTRICT noise_stripe_row,
+    const GrainType* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+    int grain_coeff, int old_coeff,
+    GrainType* LIBGAV1_RESTRICT noise_image_row) {
+  int x = 0;
+  do {
+    int grain = noise_stripe_row[x];
+    const int old = noise_stripe_row_prev[x];
+    grain = old * old_coeff + grain * grain_coeff;
+    grain = Clip3(RightShiftWithRounding(grain, 5), GetGrainMin<bitdepth>(),
+                  GetGrainMax<bitdepth>());
+    noise_image_row[x] = grain;
+  } while (++x < plane_width);
+}
+
+template <int bitdepth, typename GrainType>
+void ConstructNoiseImageOverlap_C(
+    const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
+    int subsampling_x, int subsampling_y,
+    void* LIBGAV1_RESTRICT noise_image_buffer) {
+  const auto* noise_stripes =
+      static_cast<const Array2DView<GrainType>*>(noise_stripes_buffer);
+  auto* noise_image = static_cast<Array2D<GrainType>*>(noise_image_buffer);
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  const int plane_height = (height + subsampling_y) >> subsampling_y;
+  const int stripe_height = 32 >> subsampling_y;
+  const int stripe_mask = stripe_height - 1;
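+  // Each noise stripe covers 32 luma rows and consecutive stripes overlap by
+  // two luma rows. Only the overlap rows are rewritten here, starting with
+  // the second stripe (y == stripe_height, luma_num == 1).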
+  int y = stripe_height;
+  int luma_num = 1;
+  if (subsampling_y == 0) {
+    // Begin complete stripes section. This is when we are guaranteed to have
+    // two overlap rows in each stripe.
+    for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+      const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+      const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      // First overlap row.
+      WriteOverlapLine_C<bitdepth>(noise_stripe,
+                                   &noise_stripe_prev[32 * plane_width],
+                                   plane_width, 17, 27, (*noise_image)[y]);
+      // Second overlap row.
+      WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+                                   &noise_stripe_prev[(32 + 1) * plane_width],
+                                   plane_width, 27, 17, (*noise_image)[y + 1]);
+    }
+    // End complete stripes section.
+
+    const int remaining_height = plane_height - y;
+    // Either one partial stripe remains (remaining_height > 0),
+    // OR the image is less than one stripe high (remaining_height < 0),
+    // OR all stripes are completed (remaining_height == 0).
+    if (remaining_height <= 0) {
+      return;
+    }
+    const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+    const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+    WriteOverlapLine_C<bitdepth>(noise_stripe,
+                                 &noise_stripe_prev[32 * plane_width],
+                                 plane_width, 17, 27, (*noise_image)[y]);
+
+    // Check if second overlap row is in the image.
+    if (remaining_height > 1) {
+      WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+                                   &noise_stripe_prev[(32 + 1) * plane_width],
+                                   plane_width, 27, 17, (*noise_image)[y + 1]);
+    }
+  } else {  // |subsampling_y| == 1
+    // No special checks needed for partial stripes, because if one exists, the
+    // first and only overlap row is guaranteed to exist.
+    for (; y < plane_height; ++luma_num, y += stripe_height) {
+      const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+      const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      WriteOverlapLine_C<bitdepth>(noise_stripe,
+                                   &noise_stripe_prev[16 * plane_width],
+                                   plane_width, 22, 23, (*noise_image)[y]);
+    }
+  }
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_C(const void* LIBGAV1_RESTRICT noise_image_ptr,
+                               int min_value, int max_luma, int scaling_shift,
+                               int width, int height, int start_height,
+                               const int16_t* scaling_lut_y,
+                               const void* source_plane_y,
+                               ptrdiff_t source_stride_y, void* dest_plane_y,
+                               ptrdiff_t dest_stride_y) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  auto* out_y = static_cast<Pixel*>(dest_plane_y);
+  dest_stride_y /= sizeof(Pixel);
+
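+  // Each noise sample is scaled by the piecewise-linear scaling function
+  // (looked up from the collocated source pixel via ScaleLut), shifted down
+  // by |scaling_shift| with rounding, added to the source, and clipped.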
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int orig = in_y[y * source_stride_y + x];
+      int noise = noise_image[kPlaneY][y + start_height][x];
+      noise = RightShiftWithRounding(
+          ScaleLut<bitdepth>(scaling_lut_y, orig) * noise, scaling_shift);
+      out_y[y * dest_stride_y + x] = Clip3(orig + noise, min_value, max_luma);
+    } while (++x < width);
+  } while (++y < height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChroma_C(
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const int16_t* scaling_lut_uv,
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+  source_stride_uv /= sizeof(Pixel);
+  auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+  dest_stride_uv /= sizeof(Pixel);
+
+  const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+  const int luma_multiplier =
+      (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+  const int multiplier =
+      (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+
+  const int scaling_shift = params.chroma_scaling;
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int luma_x = x << subsampling_x;
+      const int luma_y = y << subsampling_y;
+      const int luma_next_x = std::min(luma_x + 1, width - 1);
+      int average_luma;
+      if (subsampling_x != 0) {
+        average_luma = RightShiftWithRounding(
+            in_y[luma_y * source_stride_y + luma_x] +
+                in_y[luma_y * source_stride_y + luma_next_x],
+            1);
+      } else {
+        average_luma = in_y[luma_y * source_stride_y + luma_x];
+      }
+      const int orig = in_uv[y * source_stride_uv + x];
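+      // |luma_multiplier| and |multiplier| appear to carry 6 fractional bits
+      // (64 representing unity), hence the >> 6; |offset| is specified at
+      // 8-bit scale and is shifted up to the working bitdepth.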
+      const int combined = average_luma * luma_multiplier + orig * multiplier;
+      const int merged =
+          Clip3((combined >> 6) + LeftShift(offset, bitdepth - kBitdepth8), 0,
+                (1 << bitdepth) - 1);
+      int noise = noise_image[plane][y + start_height][x];
+      noise = RightShiftWithRounding(
+          ScaleLut<bitdepth>(scaling_lut_uv, merged) * noise, scaling_shift);
+      out_uv[y * dest_stride_uv + x] =
+          Clip3(orig + noise, min_value, max_chroma);
+    } while (++x < chroma_width);
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_C(
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const int16_t* scaling_lut, const void* source_plane_y,
+    ptrdiff_t source_stride_y, const void* source_plane_uv,
+    ptrdiff_t source_stride_uv, void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+  source_stride_uv /= sizeof(Pixel);
+  auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+  dest_stride_uv /= sizeof(Pixel);
+
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int scaling_shift = params.chroma_scaling;
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int luma_x = x << subsampling_x;
+      const int luma_y = y << subsampling_y;
+      const int luma_next_x = std::min(luma_x + 1, width - 1);
+      int average_luma;
+      if (subsampling_x != 0) {
+        average_luma = RightShiftWithRounding(
+            in_y[luma_y * source_stride_y + luma_x] +
+                in_y[luma_y * source_stride_y + luma_next_x],
+            1);
+      } else {
+        average_luma = in_y[luma_y * source_stride_y + luma_x];
+      }
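+      // With chroma_scaling_from_luma, the scaling lookup is indexed by the
+      // (averaged) luma value rather than by a merged chroma value.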
+      const int orig_uv = in_uv[y * source_stride_uv + x];
+      int noise_uv = noise_image[plane][y + start_height][x];
+      noise_uv = RightShiftWithRounding(
+          ScaleLut<bitdepth>(scaling_lut, average_luma) * noise_uv,
+          scaling_shift);
+      out_uv[y * dest_stride_uv + x] =
+          Clip3(orig_uv + noise_uv, min_value, max_chroma);
+    } while (++x < chroma_width);
+  } while (++y < chroma_height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
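+  // Index conventions for the tables below:
+  // chroma_auto_regression[use_luma][lag],
+  // construct_noise_stripes[overlap_flag] and
+  // blend_noise_chroma[chroma_scaling_from_luma].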
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  // LumaAutoRegressionFunc
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+
+  // ChromaAutoRegressionFunc
+  // Chroma autoregression should never be called when lag is 0 and use_luma is
+  // false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, true>;
+
+  // ConstructNoiseStripesFunc
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth8, int8_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth8, int8_t>;
+
+  // ConstructNoiseImageOverlapFunc
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth8, int8_t>;
+
+  // InitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth8>;
+
+  // BlendNoiseWithImageLumaFunc
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth8, int8_t, uint8_t>;
+
+  // BlendNoiseWithImageChromaFunc
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth8, int8_t, uint8_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth8, int8_t, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma
+  // Chroma autoregression should never be called when lag is 0 and use_luma is
+  // false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseStripes
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth8, int8_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth8>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth8, int8_t, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth8, int8_t, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth8, int8_t, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+  // LumaAutoRegressionFunc
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+
+  // ChromaAutoRegressionFunc
+  // Chroma autoregression should never be called when lag is 0 and use_luma is
+  // false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, true>;
+
+  // ConstructNoiseStripesFunc
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth10, int16_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth10, int16_t>;
+
+  // ConstructNoiseImageOverlapFunc
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth10, int16_t>;
+
+  // InitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth10>;
+
+  // BlendNoiseWithImageLumaFunc
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth10, int16_t, uint16_t>;
+
+  // BlendNoiseWithImageChromaFunc
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth10, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth10, int16_t, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma
+  // Chroma autoregression should never be called when lag is 0 and use_luma is
+  // false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseStripes
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth10, int16_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseImageOverlap
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth10>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth10, int16_t, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+  // LumaAutoRegressionFunc
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+
+  // ChromaAutoRegressionFunc
+  // Chroma autoregression should never be called when lag is 0 and use_luma is
+  // false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>;
+
+  // ConstructNoiseStripesFunc
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth12, int16_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>;
+
+  // ConstructNoiseImageOverlapFunc
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>;
+
+  // InitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth12>;
+
+  // BlendNoiseWithImageLumaFunc
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>;
+
+  // BlendNoiseWithImageChromaFunc
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionLuma
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionChroma
+  // Chroma autoregression should never be called when lag is 0 and use_luma is
+  // false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseStripes
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth12, int16_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseImageOverlap
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainInitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth12>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseLuma
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChroma
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChromaWithCfl
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace film_grain
+
+void FilmGrainInit_C() {
+  film_grain::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  film_grain::Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  film_grain::Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/film_grain.h b/src/dsp/film_grain.h
new file mode 100644 (file)
index 0000000..f75a354
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/film_grain_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/film_grain_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::film_grain_synthesis. This function is not thread-safe.
+void FilmGrainInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_FILM_GRAIN_H_
diff --git a/src/dsp/film_grain_common.h b/src/dsp/film_grain_common.h
new file mode 100644 (file)
index 0000000..3c8d761
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
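+// Grain samples are zero-mean and signed; they span the full range of a
+// signed |bitdepth|-bit integer.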
+template <int bitdepth>
+int GetGrainMax() {
+  return (1 << (bitdepth - 1)) - 1;
+}
+
+template <int bitdepth>
+int GetGrainMin() {
+  return -(1 << (bitdepth - 1));
+}
+
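+// Advances a 16-bit LFSR (feedback from bits 0, 1, 3 and 12) and returns its
+// top |bits| bits. For example, with *seed == 1: bit == 1, the state becomes
+// 0x8000, and for bits == 8 the function returns 0x80.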
+inline int GetFilmGrainRandomNumber(int bits, uint16_t* seed) {
+  uint16_t s = *seed;
+  uint16_t bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;
+  s = (s >> 1) | (bit << 15);
+  *seed = s;
+  return s >> (16 - bits);
+}
+
+enum {
+  kAutoRegressionBorder = 3,
+  // The width of the luma noise array.
+  kLumaWidth = 82,
+  // The height of the luma noise array.
+  kLumaHeight = 73,
+  // The two possible widths of the chroma noise array.
+  kMinChromaWidth = 44,
+  kMaxChromaWidth = 82,
+  // The two possible heights of the chroma noise array.
+  kMinChromaHeight = 38,
+  kMaxChromaHeight = 73,
+  // The standard scaling lookup table maps bytes to bytes, so it only uses
+  // 256 elements, plus one for overflow in 12bpp lookups. The size is scaled
+  // up for 10bpp.
+  kScalingLookupTableSize = 257,
+  // Padding is added to the scaling lookup table to permit overwrites by
+  // InitializeScalingLookupTable_NEON.
+  kScalingLookupTablePadding = 6,
+  // Padding is added to each row of the noise image to permit overreads by
+  // BlendNoiseWithImageLuma_NEON and overwrites by WriteOverlapLine8bpp_NEON.
+  kNoiseImagePadding = 15,
+  // Padding is added to the end of the |noise_stripes_| buffer to permit
+  // overreads by WriteOverlapLine8bpp_NEON.
+  kNoiseStripePadding = 7,
+};  // anonymous enum
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
diff --git a/src/dsp/intra_edge.cc b/src/dsp/intra_edge.cc
new file mode 100644 (file)
index 0000000..9875ef1
--- /dev/null
@@ -0,0 +1,137 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
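+// Smoothing kernels for filter strengths 1, 2 and 3. The taps of each kernel
+// sum to 16, matching the RightShiftWithRounding(sum, 4) normalization below.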
+constexpr int kKernels[3][kKernelTaps] = {
+    {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxUpsampleSize = 16;
+
+template <typename Pixel>
+void IntraEdgeFilter_C(void* buffer, int size, int strength) {
+  assert(strength > 0);
+  Pixel edge[129];
+  memcpy(edge, buffer, sizeof(edge[0]) * size);
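+  // The unfiltered samples are copied to |edge| so that filtered output never
+  // feeds back into later taps; dst_buffer[0] is left unfiltered.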
+  auto* const dst_buffer = static_cast<Pixel*>(buffer);
+  const int kernel_index = strength - 1;
+  for (int i = 1; i < size; ++i) {
+    int sum = 0;
+    for (int j = 0; j < kKernelTaps; ++j) {
+      const int k = Clip3(i + j - 2, 0, size - 1);
+      sum += kKernels[kernel_index][j] * edge[k];
+    }
+    dst_buffer[i] = RightShiftWithRounding(sum, 4);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsampler_C(void* buffer, int size) {
+  assert(size % 4 == 0 && size <= kMaxUpsampleSize);
+  auto* const pixel_buffer = static_cast<Pixel*>(buffer);
+  Pixel temp[kMaxUpsampleSize + 3];
+  temp[0] = temp[1] = pixel_buffer[-1];
+  memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
+  temp[size + 2] = pixel_buffer[size - 1];
+
+  pixel_buffer[-2] = temp[0];
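+  // Odd output positions are interpolated with the 4-tap (-1, 9, 9, -1) / 16
+  // half-sample filter; even positions copy the original samples.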
+  for (int i = 0; i < size; ++i) {
+    const int sum =
+        -temp[i] + (9 * temp[i + 1]) + (9 * temp[i + 2]) - temp[i + 3];
+    pixel_buffer[2 * i - 1] =
+        Clip3(RightShiftWithRounding(sum, 4), 0, (1 << bitdepth) - 1);
+    pixel_buffer[2 * i] = temp[i + 2];
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeFilter
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeUpsampler
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_IntraEdgeFilter
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_IntraEdgeUpsampler
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void IntraEdgeInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/intra_edge.h b/src/dsp/intra_edge.h
new file mode 100644 (file)
index 0000000..172ecbb
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+#define LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intra_edge_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intra_edge_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRA_EDGE_H_
diff --git a/src/dsp/intra_edge_test.cc b/src/dsp/intra_edge_test.cc
new file mode 100644 (file)
index 0000000..75c45be
--- /dev/null
@@ -0,0 +1,556 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+const char kIntraEdge[] = "IntraEdge";
+const char kIntraEdgeFilterName[] = "Intra Edge Filter";
+const char kIntraEdgeUpsamplerName[] = "Intra Edge Upsampler";
+
+constexpr int kIntraEdgeBufferSize = 144;  // see Tile::IntraPrediction.
+constexpr int kIntraEdgeFilterTestMaxSize = 129;
+constexpr int kIntraEdgeFilterTestFixedInput[kIntraEdgeFilterTestMaxSize] = {
+    159, 208, 54,  136, 205, 124, 125, 165, 164, 63,  171, 143, 210, 236, 253,
+    233, 139, 113, 66,  211, 133, 61,  91,  123, 187, 76,  110, 172, 61,  103,
+    239, 147, 247, 120, 18,  106, 180, 159, 208, 54,  136, 205, 124, 125, 165,
+    164, 63,  171, 143, 210, 236, 253, 233, 139, 113, 66,  211, 133, 61,  91,
+    123, 187, 76,  110, 172, 61,  103, 239, 147, 247, 120, 18,  106, 180, 159,
+    208, 54,  136, 205, 124, 125, 165, 164, 63,  171, 143, 210, 236, 253, 233,
+    139, 113, 66,  211, 133, 61,  91,  123, 187, 76,  110, 172, 61,  103, 239,
+    147, 247, 120, 18,  106, 180, 159, 208, 54,  136, 205, 124, 125, 165, 164,
+    63,  171, 143, 210, 236, 253, 233, 139, 113,
+};
+constexpr int kIntraEdgeUpsamplerTestFixedInput[] = {
+    208, 54,  136, 205, 124, 125, 165, 164, 63,
+    171, 143, 210, 236, 208, 54,  136, 205};
+
+struct EdgeFilterParams {
+  int size;
+  int strength;
+};
+
+std::ostream& operator<<(std::ostream& os, const EdgeFilterParams& param) {
+  return os << "size: " << param.size << ", strength: " << param.strength;
+}
+
+// Each size is paired with strength 1, 2, and 3.
+// In general, the size is expressible as 2^n+1, but all sizes up to 129 are
+// permissible.
+constexpr EdgeFilterParams kIntraEdgeFilterParamList[] = {
+    {1, 1},  {1, 2},  {1, 3},  {2, 1},   {2, 2},   {2, 3},  {5, 1},  {5, 2},
+    {5, 3},  {9, 1},  {9, 2},  {9, 3},   {17, 1},  {17, 2}, {17, 3}, {33, 1},
+    {33, 2}, {33, 3}, {50, 1}, {50, 2},  {50, 3},  {55, 1}, {55, 2}, {55, 3},
+    {65, 1}, {65, 2}, {65, 3}, {129, 1}, {129, 2}, {129, 3}};
+
+template <int bitdepth, typename Pixel>
+class IntraEdgeFilterTest : public testing::TestWithParam<EdgeFilterParams> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  IntraEdgeFilterTest() = default;
+  IntraEdgeFilterTest(const IntraEdgeFilterTest&) = delete;
+  IntraEdgeFilterTest& operator=(const IntraEdgeFilterTest&) = delete;
+  ~IntraEdgeFilterTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    IntraEdgeInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_intra_edge_filter_ = dsp->intra_edge_filter;
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_intra_edge_filter_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      IntraEdgeInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraEdgeInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+#if LIBGAV1_MSAN
+    // Match the behavior of Tile::IntraPrediction to prevent warnings due to
+    // assembly code (safely) overreading to fill a register.
+    memset(buffer_, 0, sizeof(buffer_));
+#endif  // LIBGAV1_MSAN
+    cur_intra_edge_filter_ = dsp->intra_edge_filter;
+  }
+
+  void TestFixedValues(const char* digest);
+  void TestRandomValues(int num_runs);
+
+  Pixel buffer_[kIntraEdgeBufferSize];
+  Pixel base_buffer_[kIntraEdgeBufferSize];
+  int strength_ = GetParam().strength;
+  int size_ = GetParam().size;
+
+  IntraEdgeFilterFunc base_intra_edge_filter_;
+  IntraEdgeFilterFunc cur_intra_edge_filter_;
+};
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeFilterTest<bitdepth, Pixel>::TestFixedValues(
+    const char* const digest) {
+  if (cur_intra_edge_filter_ == nullptr) return;
+  for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) {
+    buffer_[i] = kIntraEdgeFilterTestFixedInput[i];
+  }
+  const absl::Time start = absl::Now();
+  cur_intra_edge_filter_(buffer_, size_, strength_);
+  const absl::Duration elapsed_time = absl::Now() - start;
+  test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeFilterName, digest, buffer_,
+                             kIntraEdgeFilterTestMaxSize * sizeof(buffer_[0]),
+                             elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeFilterTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+  if (base_intra_edge_filter_ == nullptr) return;
+  if (cur_intra_edge_filter_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  absl::Duration elapsed_time;
+  absl::Duration base_elapsed_time;
+  memset(base_buffer_, 0, sizeof(base_buffer_));
+  memset(buffer_, 0, sizeof(buffer_));
+  for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+    for (int i = 0; i < size_; ++i) {
+      const Pixel val = rnd(1 << bitdepth);
+      buffer_[i] = val;
+      base_buffer_[i] = val;
+    }
+    const absl::Time base_start = absl::Now();
+    base_intra_edge_filter_(base_buffer_, size_, strength_);
+    base_elapsed_time += absl::Now() - base_start;
+    const absl::Time start = absl::Now();
+    cur_intra_edge_filter_(buffer_, size_, strength_);
+    elapsed_time += absl::Now() - start;
+  }
+  if (num_runs > 1) {
+    printf("Mode %s[%31s] Size %3d Strength %d C: %5d us SIMD: %5d us %2.2fx\n",
+           kIntraEdge, kIntraEdgeFilterName, size_, strength_,
+           static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)),
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+           absl::ToDoubleMicroseconds(base_elapsed_time) /
+               absl::ToDoubleMicroseconds(elapsed_time));
+  } else {
+    printf("Mode %s[%31s] Size %3d Strength %d\n", kIntraEdge,
+           kIntraEdgeFilterName, size_, strength_);
+  }
+  for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) {
+    EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i;
+  }
+}
+
+using IntraEdgeFilterTest8bpp = IntraEdgeFilterTest<8, uint8_t>;
+
+const char* GetIntraEdgeFilterDigest8bpp(int strength, int size) {
+  static const char* const kDigestsSize1[3] = {
+      "f7f681cf7047602fafc7fb416ecf46e1", "f7f681cf7047602fafc7fb416ecf46e1",
+      "f7f681cf7047602fafc7fb416ecf46e1"};
+  static const char* const kDigestsSize2[3] = {
+      "cb24cc54900fb75d767f3de797451e43", "380c80c89e1e8cda81ee0d3d4b29b8b7",
+      "a7eb3dba95ff35c2df45a274afbc9772"};
+  static const char* const kDigestsSize5[3] = {
+      "23380cb37688d4c3a8f70a276be65eed", "ec1e23d5b996a527ed3d45c0d552bf22",
+      "d313523d3b7646fdbb873c61ffe7a51a"};
+  static const char* const kDigestsSize9[3] = {
+      "e79597e9d62893754fc77d80ca86329a", "f7644e9748984914100e7031c6432272",
+      "bdf4f16734c86338716fb436c196ecc6"};
+  static const char* const kDigestsSize17[3] = {
+      "13ad15c833e850348eecb9fea4f3cadb", "e5988a72391250c702a8192893df40dd",
+      "8f68603598638fa33203fe1233d273b1"};
+  static const char* const kDigestsSize33[3] = {
+      "51156da8f4d527e0c011040769987dbd", "eff17eaf73a7bb7fd4c921510ade9f67",
+      "aca87680e0649d0728091c92c6de8871"};
+  static const char* const kDigestsSize50[3] = {
+      "87c1d43751125f1ea4987517a90d378d", "942a9d056231683bdfc52346b6b032c2",
+      "16a9148daf0e5f69808b9f0caa1ef110"};
+  static const char* const kDigestsSize55[3] = {
+      "833480d74957fb0356dec5b09412eefa", "a307ef31f10affc3b7fb262d05f1b80a",
+      "0318b2fde088c472215fe155f3b48d36"};
+  static const char* const kDigestsSize65[3] = {
+      "5000dada34ed2e6692bb44a4398ddf53", "8da6c776d897064ecd4a1e84aae92dd3",
+      "d7c71db339c28d33119974987b2f9d85"};
+  static const char* const kDigestsSize129[3] = {
+      "bf174d8b45b8131404fd4a4686f8c117", "e81518d6d85eed2f1b18c59424561d6b",
+      "7306715602b0f5536771724a2f0a39bc"};
+
+  switch (size) {
+    case 1:
+      return kDigestsSize1[strength - 1];
+    case 2:
+      return kDigestsSize2[strength - 1];
+    case 5:
+      return kDigestsSize5[strength - 1];
+    case 9:
+      return kDigestsSize9[strength - 1];
+    case 17:
+      return kDigestsSize17[strength - 1];
+    case 33:
+      return kDigestsSize33[strength - 1];
+    case 50:
+      return kDigestsSize50[strength - 1];
+    case 55:
+      return kDigestsSize55[strength - 1];
+    case 65:
+      return kDigestsSize65[strength - 1];
+    case 129:
+      return kDigestsSize129[strength - 1];
+    default:
+      ADD_FAILURE() << "Unknown edge size: " << size;
+      return nullptr;
+  }
+}
+
+TEST_P(IntraEdgeFilterTest8bpp, Correctness) {
+  TestFixedValues(GetIntraEdgeFilterDigest8bpp(strength_, size_));
+  TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeFilterTest8bpp, DISABLED_Speed) { TestRandomValues(1e7); }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using IntraEdgeFilterTest10bpp = IntraEdgeFilterTest<10, uint16_t>;
+
+const char* GetIntraEdgeFilterDigest10bpp(int strength, int size) {
+  static const char* const kDigestsSize1[3] = {
+      "2d2088560e3ccb5b809c97f5299bb1c0", "2d2088560e3ccb5b809c97f5299bb1c0",
+      "2d2088560e3ccb5b809c97f5299bb1c0"};
+  static const char* const kDigestsSize2[3] = {
+      "db3e785852e98fba18a1fb531f68466c", "8caea330489bc6ed0f99fbf769f53181",
+      "bcdd1b21f3baf5f6f29caea9ef93fb0c"};
+  static const char* const kDigestsSize5[3] = {
+      "326f4193a62f5a959b86d95f5204608e", "4673e453203f75eae97ef44f43f098f2",
+      "48d516b06313683aca30e975ce6a3cad"};
+  static const char* const kDigestsSize9[3] = {
+      "79217575a32e36a51d9dd40621af9c2d", "ccec1c16bc09b28ad6513c5e4c48b6d2",
+      "bb61aa9c5fa720c667a053769e7b7d08"};
+  static const char* const kDigestsSize17[3] = {
+      "46d90e99ba46e89326a5fa547bcd9361", "824aee8950aecb356d5f4a91dbc90a7d",
+      "37d44d10a2545385af1da55f8c08564f"};
+  static const char* const kDigestsSize33[3] = {
+      "c95108e06eb2aef61ecb6839af306edd", "832c695460b4dd2b85c5f8726e4470d1",
+      "994902f549eefd83fbcbf7ecb7dc5cca"};
+  static const char* const kDigestsSize50[3] = {
+      "48119ef1436c3a4fe69d275bbaafedf8", "72c221c91c3df0a324ccbc9acea35f89",
+      "84e40aadcc416ef3f51cea3cc23b30c7"};
+  static const char* const kDigestsSize55[3] = {
+      "6b68e4e0b00c4eb38a6d0d83c0f34658", "43a919f928a80379df5c9e07c9d8000d",
+      "7c320d55b11f93185b811bdaa379f2db"};
+  static const char* const kDigestsSize65[3] = {
+      "c28de89cf9f3bc5a904647ab2c64caf7", "7ce63b1b28dce0624fc7586e8fb3ab8f",
+      "d06e6b88585f7f1a1f6af5bb59ee2180"};
+  static const char* const kDigestsSize129[3] = {
+      "79160902c5c85004382d5ffa549b43cc", "3b0df95c3ca7b0b559b79234cf434738",
+      "500786d8561effec283d4f3d13886f8c"};
+
+  switch (size) {
+    case 1:
+      return kDigestsSize1[strength - 1];
+    case 2:
+      return kDigestsSize2[strength - 1];
+    case 5:
+      return kDigestsSize5[strength - 1];
+    case 9:
+      return kDigestsSize9[strength - 1];
+    case 17:
+      return kDigestsSize17[strength - 1];
+    case 33:
+      return kDigestsSize33[strength - 1];
+    case 50:
+      return kDigestsSize50[strength - 1];
+    case 55:
+      return kDigestsSize55[strength - 1];
+    case 65:
+      return kDigestsSize65[strength - 1];
+    case 129:
+      return kDigestsSize129[strength - 1];
+    default:
+      ADD_FAILURE() << "Unknown edge size: " << size;
+      return nullptr;
+  }
+}
+
+TEST_P(IntraEdgeFilterTest10bpp, FixedInput) {
+  TestFixedValues(GetIntraEdgeFilterDigest10bpp(strength_, size_));
+  TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeFilterTest10bpp, DISABLED_Speed) { TestRandomValues(1e7); }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using IntraEdgeFilterTest12bpp = IntraEdgeFilterTest<12, uint16_t>;
+
+const char* GetIntraEdgeFilterDigest12bpp(int strength, int size) {
+  return GetIntraEdgeFilterDigest10bpp(strength, size);
+}
+
+TEST_P(IntraEdgeFilterTest12bpp, FixedInput) {
+  TestFixedValues(GetIntraEdgeFilterDigest12bpp(strength_, size_));
+  TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeFilterTest12bpp, DISABLED_Speed) { TestRandomValues(1e7); }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+template <int bitdepth, typename Pixel>
+class IntraEdgeUpsamplerTest : public testing::TestWithParam<int> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  IntraEdgeUpsamplerTest() = default;
+  IntraEdgeUpsamplerTest(const IntraEdgeUpsamplerTest&) = delete;
+  IntraEdgeUpsamplerTest& operator=(const IntraEdgeUpsamplerTest&) = delete;
+  ~IntraEdgeUpsamplerTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    IntraEdgeInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_intra_edge_upsampler_ = dsp->intra_edge_upsampler;
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_intra_edge_upsampler_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      IntraEdgeInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraEdgeInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    cur_intra_edge_upsampler_ = dsp->intra_edge_upsampler;
+#if LIBGAV1_MSAN
+    // Match the behavior of Tile::IntraPrediction to prevent warnings due to
+    // assembly code (safely) overreading to fill a register.
+    memset(buffer_, 0, sizeof(buffer_));
+#endif
+  }
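+
+  // Editorial note (not upstream): the prefixes matched above come from the
+  // first argument of INSTANTIATE_TEST_SUITE_P; e.g. the NEON instantiation
+  // below yields suite names such as "NEON/IntraEdgeUpsamplerTest8bpp", so
+  // each instantiation pits the corresponding IntraEdgeInit_*() entry point
+  // against the C implementation captured in base_intra_edge_upsampler_.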
+
+  void TestFixedValues(const char* digest);
+  void TestRandomValues(int num_runs);
+
+  Pixel buffer_[128];
+  Pixel base_buffer_[128];
+  int size_ = GetParam();
+
+  IntraEdgeUpsamplerFunc base_intra_edge_upsampler_;
+  IntraEdgeUpsamplerFunc cur_intra_edge_upsampler_;
+};
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestFixedValues(
+    const char* const digest) {
+  if (cur_intra_edge_upsampler_ == nullptr) return;
+  buffer_[0] = 0;
+  for (int i = 0; i < size_ + 1; ++i) {
+    buffer_[i + 1] = kIntraEdgeUpsamplerTestFixedInput[i];
+  }
+  const absl::Time start = absl::Now();
+  cur_intra_edge_upsampler_(buffer_ + 2, size_);
+  const absl::Duration elapsed_time = absl::Now() - start;
+  test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeUpsamplerName, digest,
+                             buffer_, (size_ * 2 + 1) * sizeof(buffer_[0]),
+                             elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+  if (base_intra_edge_upsampler_ == nullptr) return;
+  if (cur_intra_edge_upsampler_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  absl::Duration base_elapsed_time;
+  absl::Duration elapsed_time;
+  for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+    // Populate what will be buffer[-2..size] when passed to the upsample
+    // function.
+    buffer_[0] = 0;
+    base_buffer_[0] = 0;
+    for (int i = 1; i < size_ + 2; ++i) {
+      const Pixel val = rnd(1 << bitdepth);
+      buffer_[i] = val;
+      base_buffer_[i] = val;
+    }
+    const absl::Time base_start = absl::Now();
+    base_intra_edge_upsampler_(base_buffer_ + 2, size_);
+    base_elapsed_time += absl::Now() - base_start;
+    const absl::Time start = absl::Now();
+    cur_intra_edge_upsampler_(buffer_ + 2, size_);
+    elapsed_time += absl::Now() - start;
+  }
+  if (num_runs > 1) {
+    printf("Mode %s[%31s] size %d C: %5d us SIMD: %5d us %2.2fx\n", kIntraEdge,
+           kIntraEdgeUpsamplerName, size_,
+           static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)),
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+           absl::ToDoubleMicroseconds(base_elapsed_time) /
+               absl::ToDoubleMicroseconds(elapsed_time));
+  } else {
+    printf("Mode %s[%31s]: size %d \n", kIntraEdge, kIntraEdgeUpsamplerName,
+           size_);
+  }
+
+  for (int i = 0; i < size_ * 2 + 1; ++i) {
+    EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i;
+  }
+}
+
+using IntraEdgeUpsamplerTest8bpp = IntraEdgeUpsamplerTest<8, uint8_t>;
+
+constexpr int kIntraEdgeUpsampleSizes[] = {4, 8, 12, 16};
+
+const char* GetIntraEdgeUpsampleDigest8bpp(int size) {
+  switch (size) {
+    case 4:
+      return "aa9002e03f8d15eb26bbee76f40bb923";
+    case 8:
+      return "cacfca86d65eff0d951eb21fc15f242a";
+    case 12:
+      return "0529e00a1fa80bc866fa7662ad2d7b9f";
+    case 16:
+      return "03e3b3e0ea438ea48ef05651c0a54986";
+    default:
+      ADD_FAILURE() << "Unknown upsample size: " << size;
+      return "";
+  }
+}
+
+TEST_P(IntraEdgeUpsamplerTest8bpp, Correctness) {
+  TestFixedValues(GetIntraEdgeUpsampleDigest8bpp(size_));
+  TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeUpsamplerTest8bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using IntraEdgeUpsamplerTest10bpp = IntraEdgeUpsamplerTest<10, uint16_t>;
+
+const char* GetIntraEdgeUpsampleDigest10bpp(int size) {
+  switch (size) {
+    case 4:
+      return "341c6bb705a02bba65b34f92d8ca83cf";
+    case 8:
+      return "fdbe4b3b341921dcb0edf00dfc4d7667";
+    case 12:
+      return "ad69a491287495ec9973d4006d5ac461";
+    case 16:
+      return "04acf32e517d80ce4c4958e711b9b890";
+    default:
+      ADD_FAILURE() << "Unknown upsample size: " << size;
+      return "";
+  }
+}
+
+TEST_P(IntraEdgeUpsamplerTest10bpp, FixedInput) {
+  TestFixedValues(GetIntraEdgeUpsampleDigest10bpp(size_));
+  TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeUpsamplerTest10bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using IntraEdgeUpsamplerTest12bpp = IntraEdgeUpsamplerTest<12, uint16_t>;
+
+const char* GetIntraEdgeUpsampleDigest12bpp(int size) {
+  return GetIntraEdgeUpsampleDigest10bpp(size);
+}
+
+TEST_P(IntraEdgeUpsamplerTest12bpp, FixedInput) {
+  TestFixedValues(GetIntraEdgeUpsampleDigest12bpp(size_));
+  TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeUpsamplerTest12bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest8bpp,
+                         testing::ValuesIn(kIntraEdgeFilterParamList));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeFilterTest8bpp,
+                         testing::ValuesIn(kIntraEdgeFilterParamList));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest8bpp,
+                         testing::ValuesIn(kIntraEdgeFilterParamList));
+#endif
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest8bpp,
+                         testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeUpsamplerTest8bpp,
+                         testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest8bpp,
+                         testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest10bpp,
+                         testing::ValuesIn(kIntraEdgeFilterParamList));
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest10bpp,
+                         testing::ValuesIn(kIntraEdgeUpsampleSizes));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest10bpp,
+                         testing::ValuesIn(kIntraEdgeFilterParamList));
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest10bpp,
+                         testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest12bpp,
+                         testing::ValuesIn(kIntraEdgeFilterParamList));
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest12bpp,
+                         testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/intrapred.cc b/src/dsp/intrapred.cc
new file mode 100644
index 0000000..3162acc
--- /dev/null
+++ b/src/dsp/intrapred.cc
@@ -0,0 +1,1985 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int block_width, int block_height, typename Pixel>
+struct IntraPredFuncs_C {
+  IntraPredFuncs_C() = delete;
+
+  static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+                    const void* left_column);
+  static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+  static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+                 const void* left_column);
+  static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+                       const void* left_column);
+  static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+                         const void* left_column);
+  static void Paeth(void* dest, ptrdiff_t stride, const void* top_row,
+                    const void* left_column);
+};
+
+// Intra-predictors that require bitdepth.
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+struct IntraPredBppFuncs_C {
+  IntraPredBppFuncs_C() = delete;
+
+  static void DcFill(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+};
+
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C DC predictors: DcTop, DcLeft, Dc
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcTop(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const void* /*left_column*/) {
+  int sum = block_width >> 1;  // rounder
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  for (int x = 0; x < block_width; ++x) sum += top[x];
+  const int dc = sum >> FloorLog2(block_width);
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, dc, block_width);
+    dst += stride;
+  }
+}
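+
+// Worked example (editorial note, not upstream): for a 4-wide block with
+// top = {1, 2, 3, 4}, sum starts at the rounder 4 >> 1 = 2 and accumulates
+// to 12, so dc = 12 >> FloorLog2(4) = 3, i.e. 10 / 4 rounded to nearest.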
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcLeft(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
+  int sum = block_height >> 1;  // rounder
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  for (int y = 0; y < block_height; ++y) sum += left[y];
+  const int dc = sum >> FloorLog2(block_height);
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, dc, block_width);
+    dst += stride;
+  }
+}
+
+// Note for square blocks the divide in the Dc() function reduces to a shift.
+// For rectangular block sizes the following multipliers can be used with the
+// corresponding shifts.
+// 8-bit
+//  1:2 (e.g., 4x8):  scale = 0x5556
+//  1:4 (e.g., 4x16): scale = 0x3334
+//  final_descale = 16
+// 10/12-bit
+//  1:2: scale = 0xaaab
+//  1:4: scale = 0x6667
+//  final_descale = 17
+//  Note these may be halved to the values used in 8-bit in all cases except
+//  when bitdepth == 12 and block_width + block_height is divisible by 5 (as
+//  opposed to 3).
+//
+// The calculation becomes:
+//  ((dc_sum >> intermediate_descale) * scale) >> final_descale
+// where intermediate_descale is:
+// sum = block_width + block_height
+// intermediate_descale =
+//     (sum <= 20) ? 2 : (sum <= 40) ? 3 : (sum <= 80) ? 4 : 5
+//
+// The constants (multiplier and shifts) for a given block size are obtained
+// as follows:
+// - Let sum = block width + block height
+// - Shift 'sum' right until we reach an odd number
+// - Let the number of shifts for that block size be called
+//   'intermediate_descale'
+//   and let the odd number be 'd' (d has only 2 possible values: d = 3 for a
+//   1:2 rectangular block and d = 5 for a 1:4 rectangular block).
+// - Find multipliers by dividing by 'd' using "Algorithm 1" in:
+//   http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
+//   by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
+//   shift will be 16, regardless of the block size.
+// TODO(jzern): the base implementation could be updated to use this method.
+
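+// Illustrative sketch (editorial, not part of the upstream source): for a
+// 4x8 block the divisor is 12, so intermediate_descale = 2 and d = 3, and
+// the multiply-and-shift form reproduces the exact division over the 8-bit
+// range. With every pixel equal to 255, sum = 12 * 255 + 6 = 3066:
+//   (3066 >> 2) = 766 and ((766 * 0x5556) >> 16) = 255 == 766 / 3.
+static_assert((((3066 >> 2) * 0x5556) >> 16) == (3066 >> 2) / 3,
+              "multiply-and-shift reproduces the 4x8 DC division");
+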
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Dc(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const int divisor = block_width + block_height;
+  int sum = divisor >> 1;  // rounder
+
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  for (int x = 0; x < block_width; ++x) sum += top[x];
+  for (int y = 0; y < block_height; ++y) sum += left[y];
+
+  const int dc = sum / divisor;
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, dc, block_width);
+    dst += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C directional predictors
+
+// IntraPredFuncs_C::Vertical -- apply top row vertically
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Vertical(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const void* /*left_column*/) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < block_height; ++y) {
+    memcpy(dst, top_row, block_width * sizeof(Pixel));
+    dst += stride;
+  }
+}
+
+// IntraPredFuncs_C::Horizontal -- apply left column horizontally
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Horizontal(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, left[y], block_width);
+    dst += stride;
+  }
+}
+
+// IntraPredFuncs_C::Paeth
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel top_left = top[-1];
+  const int top_left_x2 = top_left + top_left;
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  for (int y = 0; y < block_height; ++y) {
+    const int left_pixel = left[y];
+    for (int x = 0; x < block_width; ++x) {
+      // The Paeth filter selects the value closest to:
+      // top[x] + left[y] - top_left
+      // To calculate the absolute distance for the left value this would be:
+      // abs((top[x] + left[y] - top_left) - left[y])
+      // or, because left[y] cancels out:
+      // abs(top[x] - top_left)
+      const int left_dist = std::abs(top[x] - top_left);
+      const int top_dist = std::abs(left_pixel - top_left);
+      const int top_left_dist = std::abs(top[x] + left_pixel - top_left_x2);
+
+      // Select the closest value to the initial estimate of 'T + L - TL'.
+      if (left_dist <= top_dist && left_dist <= top_left_dist) {
+        dst[x] = left_pixel;
+      } else if (top_dist <= top_left_dist) {
+        dst[x] = top[x];
+      } else {
+        dst[x] = top_left;
+      }
+    }
+    dst += stride;
+  }
+}
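+
+// Worked example (editorial, not upstream): with top[x] = 5, left[y] = 3 and
+// top_left = 4, the base estimate T + L - TL is 4; left_dist = 1,
+// top_dist = 1 and top_left_dist = |5 + 3 - 8| = 0, so top_left (4) wins.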
+
+//------------------------------------------------------------------------------
+// IntraPredBppFuncs_C
+template <int fill, typename Pixel>
+inline void DcFill_C(void* const dest, ptrdiff_t stride, const int block_width,
+                     const int block_height) {
+  static_assert(sizeof(Pixel) == 1 || sizeof(Pixel) == 2,
+                "Only 1 & 2 byte pixels are supported");
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, fill, block_width);
+    dst += stride;
+  }
+}
+
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void IntraPredBppFuncs_C<block_width, block_height, bitdepth, Pixel>::DcFill(
+    void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+    const void* /*left_column*/) {
+  DcFill_C<0x80 << (bitdepth - 8), Pixel>(dest, stride, block_width,
+                                          block_height);
+}
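+
+// Editorial note (not upstream): 0x80 << (bitdepth - 8) is the mid-gray
+// value at each bitdepth: 128 at 8bpp, 512 at 10bpp, 2048 at 12bpp, i.e.
+// half of (max pixel value + 1).
+static_assert((0x80 << (10 - 8)) == 512, "DcFill uses the 10bpp midpoint");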
+
+// -----------------------------------------------------------------------------
+
+template <typename Pixel>
+struct IntraPredDefs {
+  IntraPredDefs() = delete;
+
+  using _4x4 = IntraPredFuncs_C<4, 4, Pixel>;
+  using _4x8 = IntraPredFuncs_C<4, 8, Pixel>;
+  using _4x16 = IntraPredFuncs_C<4, 16, Pixel>;
+  using _8x4 = IntraPredFuncs_C<8, 4, Pixel>;
+  using _8x8 = IntraPredFuncs_C<8, 8, Pixel>;
+  using _8x16 = IntraPredFuncs_C<8, 16, Pixel>;
+  using _8x32 = IntraPredFuncs_C<8, 32, Pixel>;
+  using _16x4 = IntraPredFuncs_C<16, 4, Pixel>;
+  using _16x8 = IntraPredFuncs_C<16, 8, Pixel>;
+  using _16x16 = IntraPredFuncs_C<16, 16, Pixel>;
+  using _16x32 = IntraPredFuncs_C<16, 32, Pixel>;
+  using _16x64 = IntraPredFuncs_C<16, 64, Pixel>;
+  using _32x8 = IntraPredFuncs_C<32, 8, Pixel>;
+  using _32x16 = IntraPredFuncs_C<32, 16, Pixel>;
+  using _32x32 = IntraPredFuncs_C<32, 32, Pixel>;
+  using _32x64 = IntraPredFuncs_C<32, 64, Pixel>;
+  using _64x16 = IntraPredFuncs_C<64, 16, Pixel>;
+  using _64x32 = IntraPredFuncs_C<64, 32, Pixel>;
+  using _64x64 = IntraPredFuncs_C<64, 64, Pixel>;
+};
+
+template <int bitdepth, typename Pixel>
+struct IntraPredBppDefs {
+  IntraPredBppDefs() = delete;
+
+  using _4x4 = IntraPredBppFuncs_C<4, 4, bitdepth, Pixel>;
+  using _4x8 = IntraPredBppFuncs_C<4, 8, bitdepth, Pixel>;
+  using _4x16 = IntraPredBppFuncs_C<4, 16, bitdepth, Pixel>;
+  using _8x4 = IntraPredBppFuncs_C<8, 4, bitdepth, Pixel>;
+  using _8x8 = IntraPredBppFuncs_C<8, 8, bitdepth, Pixel>;
+  using _8x16 = IntraPredBppFuncs_C<8, 16, bitdepth, Pixel>;
+  using _8x32 = IntraPredBppFuncs_C<8, 32, bitdepth, Pixel>;
+  using _16x4 = IntraPredBppFuncs_C<16, 4, bitdepth, Pixel>;
+  using _16x8 = IntraPredBppFuncs_C<16, 8, bitdepth, Pixel>;
+  using _16x16 = IntraPredBppFuncs_C<16, 16, bitdepth, Pixel>;
+  using _16x32 = IntraPredBppFuncs_C<16, 32, bitdepth, Pixel>;
+  using _16x64 = IntraPredBppFuncs_C<16, 64, bitdepth, Pixel>;
+  using _32x8 = IntraPredBppFuncs_C<32, 8, bitdepth, Pixel>;
+  using _32x16 = IntraPredBppFuncs_C<32, 16, bitdepth, Pixel>;
+  using _32x32 = IntraPredBppFuncs_C<32, 32, bitdepth, Pixel>;
+  using _32x64 = IntraPredBppFuncs_C<32, 64, bitdepth, Pixel>;
+  using _64x16 = IntraPredBppFuncs_C<64, 16, bitdepth, Pixel>;
+  using _64x32 = IntraPredBppFuncs_C<64, 32, bitdepth, Pixel>;
+  using _64x64 = IntraPredBppFuncs_C<64, 64, bitdepth, Pixel>;
+};
+
+using Defs = IntraPredDefs<uint8_t>;
+using Defs8bpp = IntraPredBppDefs<8, uint8_t>;
+
+// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS|/|DEFSBPP| of
+// the same size.
+#define INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, W, H)                         \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcFill] =     \
+      DEFSBPP::_##W##x##H::DcFill;                                            \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcTop] =      \
+      DEFS::_##W##x##H::DcTop;                                                \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcLeft] =     \
+      DEFS::_##W##x##H::DcLeft;                                               \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDc] =         \
+      DEFS::_##W##x##H::Dc;                                                   \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorVertical] =   \
+      DEFS::_##W##x##H::Vertical;                                             \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorHorizontal] = \
+      DEFS::_##W##x##H::Horizontal;                                           \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorPaeth] =      \
+      DEFS::_##W##x##H::Paeth
+
+#define INIT_INTRAPREDICTORS(DEFS, DEFSBPP)        \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 4);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 8);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 16);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 4);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 8);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 16);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 32);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 4);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 8);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 16); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 32); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 64); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 8);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 16); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 32); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 64); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 16); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 32); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 64)
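+
+// Editorial note (not upstream): when LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is
+// off, each #ifndef below tests a macro such as
+// LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc that a specialized
+// (SIMD) header may define to claim that entry; in that case the C fallback
+// is left uninstalled here.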
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_INTRAPREDICTORS(Defs, Defs8bpp);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+      Defs8bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      Defs::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      Defs::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = Defs::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+      Defs::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+      Defs::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+      Defs::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+      Defs8bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+      Defs::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+      Defs::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = Defs::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+      Defs::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+      Defs::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+      Defs::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+      Defs8bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+      Defs::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+      Defs::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+      Defs::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+      Defs::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+      Defs::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+      Defs::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+      Defs8bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+      Defs::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+      Defs::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = Defs::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+      Defs::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+      Defs::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+      Defs::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+      Defs8bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+      Defs::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+      Defs::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = Defs::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+      Defs::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+      Defs::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+      Defs::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+      Defs8bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+      Defs::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+      Defs::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+      Defs::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+      Defs::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+      Defs::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+      Defs::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+      Defs8bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+      Defs::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+      Defs::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+      Defs::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+      Defs::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+      Defs::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+      Defs::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+      Defs8bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+      Defs::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+      Defs::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+      Defs::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+      Defs::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+      Defs::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+      Defs::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+      Defs8bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+      Defs::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+      Defs::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+      Defs::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+      Defs::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+      Defs::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+      Defs::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+      Defs8bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+      Defs::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+      Defs::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+      Defs::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+      Defs::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+      Defs::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+      Defs::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+      Defs8bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+      Defs::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+      Defs::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+      Defs::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+      Defs::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+      Defs::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+      Defs::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+      Defs8bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+      Defs::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+      Defs::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+      Defs::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+      Defs::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+      Defs::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+      Defs::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+      Defs8bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+      Defs::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+      Defs::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+      Defs::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+      Defs::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+      Defs::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+      Defs::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+      Defs8bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+      Defs::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+      Defs::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+      Defs::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+      Defs::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+      Defs::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+      Defs::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+      Defs8bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+      Defs::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+      Defs::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+      Defs::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+      Defs::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+      Defs::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+      Defs::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+      Defs8bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+      Defs::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+      Defs::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+      Defs::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+      Defs::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+      Defs::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+      Defs::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+      Defs8bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+      Defs::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+      Defs::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+      Defs::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+      Defs::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+      Defs::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+      Defs::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+      Defs8bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+      Defs::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+      Defs::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+      Defs::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+      Defs::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+      Defs::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+      Defs::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+      Defs8bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+      Defs::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+      Defs::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+      Defs::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+      Defs::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+      Defs::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+      Defs::_64x64::Paeth;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using DefsHbd = IntraPredDefs<uint16_t>;
+using Defs10bpp = IntraPredBppDefs<10, uint16_t>;
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_INTRAPREDICTORS(DefsHbd, Defs10bpp);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+      Defs10bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      DefsHbd::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      DefsHbd::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+      DefsHbd::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+      DefsHbd::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+      DefsHbd::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+      DefsHbd::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+      Defs10bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+      DefsHbd::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+      DefsHbd::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+      DefsHbd::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+      DefsHbd::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+      DefsHbd::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+      DefsHbd::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+      Defs10bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+      DefsHbd::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+      DefsHbd::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+      DefsHbd::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+      DefsHbd::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+      DefsHbd::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+      DefsHbd::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+      Defs10bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+      DefsHbd::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+      DefsHbd::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+      DefsHbd::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+      DefsHbd::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+      DefsHbd::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+      DefsHbd::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+      Defs10bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+      DefsHbd::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+      DefsHbd::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+      DefsHbd::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+      DefsHbd::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+      DefsHbd::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+      DefsHbd::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+      Defs10bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+      DefsHbd::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+      DefsHbd::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+      DefsHbd::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+      DefsHbd::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+      DefsHbd::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+      DefsHbd::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+      Defs10bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+      DefsHbd::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+      DefsHbd::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+      DefsHbd::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+      DefsHbd::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+      DefsHbd::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+      DefsHbd::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+      Defs10bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+      DefsHbd::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+      DefsHbd::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+      DefsHbd::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+      DefsHbd::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+      DefsHbd::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+      DefsHbd::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+      Defs10bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+      DefsHbd::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+      DefsHbd::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+      DefsHbd::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+      DefsHbd::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+      DefsHbd::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+      DefsHbd::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+      Defs10bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+      DefsHbd::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+      DefsHbd::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+      DefsHbd::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+      DefsHbd::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+      DefsHbd::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+      DefsHbd::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+      Defs10bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+      DefsHbd::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+      DefsHbd::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+      DefsHbd::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+      DefsHbd::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+      DefsHbd::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+      DefsHbd::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+      Defs10bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+      DefsHbd::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+      DefsHbd::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+      DefsHbd::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+      DefsHbd::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+      DefsHbd::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+      DefsHbd::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+      Defs10bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+      DefsHbd::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+      DefsHbd::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+      DefsHbd::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+      DefsHbd::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+      DefsHbd::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+      DefsHbd::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+      Defs10bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+      DefsHbd::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+      DefsHbd::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+      DefsHbd::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+      DefsHbd::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+      DefsHbd::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+      DefsHbd::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+      Defs10bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+      DefsHbd::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+      DefsHbd::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+      DefsHbd::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+      DefsHbd::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+      DefsHbd::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+      DefsHbd::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+      Defs10bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+      DefsHbd::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+      DefsHbd::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+      DefsHbd::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+      DefsHbd::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+      DefsHbd::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+      DefsHbd::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+      Defs10bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+      DefsHbd::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+      DefsHbd::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+      DefsHbd::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+      DefsHbd::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+      DefsHbd::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+      DefsHbd::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+      Defs10bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+      DefsHbd::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+      DefsHbd::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+      DefsHbd::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+      DefsHbd::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+      DefsHbd::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+      DefsHbd::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+      Defs10bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+      DefsHbd::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+      DefsHbd::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+      DefsHbd::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+      DefsHbd::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+      DefsHbd::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+      DefsHbd::_64x64::Paeth;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using Defs12bpp = IntraPredBppDefs<12, uint16_t>;
+
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_INTRAPREDICTORS(DefsHbd, Defs12bpp);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+      Defs12bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      DefsHbd::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      DefsHbd::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+      DefsHbd::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+      DefsHbd::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+      DefsHbd::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+      DefsHbd::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+      Defs12bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+      DefsHbd::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+      DefsHbd::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+      DefsHbd::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+      DefsHbd::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+      DefsHbd::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+      DefsHbd::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+      Defs12bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+      DefsHbd::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+      DefsHbd::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+      DefsHbd::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+      DefsHbd::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+      DefsHbd::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+      DefsHbd::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+      Defs12bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+      DefsHbd::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+      DefsHbd::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+      DefsHbd::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+      DefsHbd::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+      DefsHbd::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+      DefsHbd::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+      Defs12bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+      DefsHbd::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+      DefsHbd::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+      DefsHbd::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+      DefsHbd::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+      DefsHbd::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+      DefsHbd::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+      Defs12bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+      DefsHbd::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+      DefsHbd::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+      DefsHbd::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+      DefsHbd::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+      DefsHbd::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+      DefsHbd::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+      Defs12bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+      DefsHbd::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+      DefsHbd::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+      DefsHbd::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+      DefsHbd::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+      DefsHbd::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+      DefsHbd::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+      Defs12bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+      DefsHbd::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+      DefsHbd::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+      DefsHbd::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+      DefsHbd::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+      DefsHbd::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+      DefsHbd::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+      Defs12bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+      DefsHbd::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+      DefsHbd::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+      DefsHbd::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+      DefsHbd::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+      DefsHbd::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+      DefsHbd::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+      Defs12bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+      DefsHbd::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+      DefsHbd::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+      DefsHbd::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+      DefsHbd::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+      DefsHbd::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+      DefsHbd::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+      Defs12bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+      DefsHbd::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+      DefsHbd::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+      DefsHbd::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+      DefsHbd::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+      DefsHbd::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+      DefsHbd::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+      Defs12bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+      DefsHbd::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+      DefsHbd::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+      DefsHbd::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+      DefsHbd::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+      DefsHbd::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+      DefsHbd::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+      Defs12bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+      DefsHbd::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+      DefsHbd::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+      DefsHbd::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+      DefsHbd::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+      DefsHbd::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+      DefsHbd::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+      Defs12bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+      DefsHbd::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+      DefsHbd::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+      DefsHbd::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+      DefsHbd::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+      DefsHbd::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+      DefsHbd::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+      Defs12bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+      DefsHbd::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+      DefsHbd::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+      DefsHbd::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+      DefsHbd::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+      DefsHbd::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+      DefsHbd::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+      Defs12bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+      DefsHbd::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+      DefsHbd::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+      DefsHbd::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+      DefsHbd::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+      DefsHbd::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+      DefsHbd::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+      Defs12bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+      DefsHbd::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+      DefsHbd::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+      DefsHbd::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+      DefsHbd::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+      DefsHbd::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+      DefsHbd::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+      Defs12bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+      DefsHbd::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+      DefsHbd::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+      DefsHbd::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+      DefsHbd::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+      DefsHbd::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+      DefsHbd::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+      Defs12bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+      DefsHbd::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+      DefsHbd::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+      DefsHbd::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+      DefsHbd::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+      DefsHbd::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+      DefsHbd::_64x64::Paeth;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+#undef INIT_INTRAPREDICTORS_WxH
+#undef INIT_INTRAPREDICTORS
+}  // namespace
+
+void IntraPredInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/intrapred.h b/src/dsp/intrapred.h
new file mode 100644
index 0000000..2cb625d
--- /dev/null
+++ b/src/dsp/intrapred.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
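+//
+// Illustrative sketch of the mechanism (the specific define below is an
+// example, not text from this header): an optimized header such as
+// src/dsp/arm/intrapred_neon.h may claim a function with a define along the
+// lines of
+//   #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+// and the matching C fallback assignment in intrapred.cc is then skipped by
+// its #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc guard, as
+// seen throughout the Init8bpp()/Init10bpp()/Init12bpp() functions.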
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors. This function is not thread-safe.
+void IntraPredInit_C();
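+
+// Call-site sketch (an assumption about the surrounding wiring, not a
+// guarantee made by this header): the library-wide DspInit() is expected to
+// run IntraPredInit_C() exactly once, e.g. guarded by a std::call_once,
+// which is why this function itself does not need to be thread-safe.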
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_H_
diff --git a/src/dsp/intrapred_cfl.cc b/src/dsp/intrapred_cfl.cc
new file mode 100644
index 0000000..798bb73
--- /dev/null
+++ b/src/dsp/intrapred_cfl.cc
@@ -0,0 +1,915 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr TransformSize kTransformSizesLargerThan32x32[] = {
+    kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
+    kTransformSize64x32, kTransformSize64x64};
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_C
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void CflIntraPredictor_C(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<Pixel*>(dest);
+  const int dc = dst[0];
+  stride /= sizeof(Pixel);
+  const int max_value = (1 << bitdepth) - 1;
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
+      assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
+      dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
+                     0, max_value);
+    }
+    dst += stride;
+  }
+}
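+
+// Worked example of the formula above (illustrative values only): with
+// bitdepth 8, dc = 128, alpha = 4 and luma[y][x] = 80,
+//   alpha * luma[y][x] = 320,
+//   RightShiftWithRoundingSigned(320, 6) = (320 + 32) >> 6 = 5,
+//   dst[x] = Clip3(128 + 5, 0, 255) = 133.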
+
+//------------------------------------------------------------------------------
+// CflSubsampler_C
+
+template <int block_width, int block_height, int bitdepth, typename Pixel,
+          int subsampling_x, int subsampling_y>
+void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+                     const int max_luma_width, const int max_luma_height,
+                     const void* LIBGAV1_RESTRICT const source,
+                     ptrdiff_t stride) {
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const auto* src = static_cast<const Pixel*>(source);
+  stride /= sizeof(Pixel);
+  int sum = 0;
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      const ptrdiff_t luma_x =
+          std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
+      const ptrdiff_t luma_x_next = luma_x + stride;
+      luma[y][x] =
+          (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
+           ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
+                                 : 0))
+          << (3 - subsampling_x - subsampling_y);
+      sum += luma[y][x];
+    }
+    if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
+      src += stride << subsampling_y;
+    }
+  }
+  const int average = RightShiftWithRounding(
+      sum, FloorLog2(block_width) + FloorLog2(block_height));
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      luma[y][x] -= average;
+    }
+  }
+}
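+
+// Scaling note (restating the arithmetic above): the
+// (3 - subsampling_x - subsampling_y) shift keeps every subsampling mode at
+// the same fixed-point scale. For 4:2:0 (1, 1) a sum of four pixels is
+// shifted left by 1, for 4:2:2 (1, 0) a sum of two pixels is shifted left by
+// 2, and for 4:4:4 (0, 0) a single pixel is shifted left by 3; in each case
+// the result is 8x a local luma average. After the block mean is subtracted,
+// the values stay within the +/-(((1 << bitdepth) - 1) << 3) bound documented
+// above CflIntraPredictor_C.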
+
+//------------------------------------------------------------------------------
+
+// Initializes dsp entries for kTransformSize|W|x|H|.
+#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL)             \
+  dsp->cfl_intra_predictors[kTransformSize##W##x##H] =                 \
+      CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>;                      \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>;                    \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>;                    \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
+
+#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL)       \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_CFL_INTRAPREDICTORS(8, uint8_t);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x4] =
+      CflIntraPredictor_C<4, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+      CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x8] =
+      CflIntraPredictor_C<4, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+      CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor_C<4, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+      CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x4] =
+      CflIntraPredictor_C<8, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+      CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x8] =
+      CflIntraPredictor_C<8, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+      CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor_C<8, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+      CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor_C<8, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+      CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor_C<16, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+      CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor_C<16, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+      CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor_C<16, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+      CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor_C<16, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+      CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor_C<32, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+      CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor_C<32, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+      CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor_C<32, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+      CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  // Cfl predictors are available only for transform sizes with max(width,
+  // height) <= 32. Set all others to nullptr.
+  for (const auto i : kTransformSizesLargerThan32x32) {
+    dsp->cfl_intra_predictors[i] = nullptr;
+    for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+      dsp->cfl_subsamplers[i][j] = nullptr;
+    }
+  }
+}  // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_CFL_INTRAPREDICTORS(10, uint16_t);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x4] =
+      CflIntraPredictor_C<4, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+      CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x8] =
+      CflIntraPredictor_C<4, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+      CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor_C<4, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+      CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x4] =
+      CflIntraPredictor_C<8, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+      CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x8] =
+      CflIntraPredictor_C<8, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+      CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor_C<8, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+      CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor_C<8, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+      CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor_C<16, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+      CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor_C<16, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+      CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor_C<16, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+      CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor_C<16, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+      CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor_C<32, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+      CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor_C<32, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+      CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor_C<32, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+      CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  // Cfl predictors are available only for transform sizes with max(width,
+  // height) <= 32. Set all others to nullptr.
+  for (const auto i : kTransformSizesLargerThan32x32) {
+    dsp->cfl_intra_predictors[i] = nullptr;
+    for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+      dsp->cfl_subsamplers[i][j] = nullptr;
+    }
+  }
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_CFL_INTRAPREDICTORS(12, uint16_t);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x4] =
+      CflIntraPredictor_C<4, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler_C<4, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+      CflSubsampler_C<4, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler_C<4, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x8] =
+      CflIntraPredictor_C<4, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler_C<4, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+      CflSubsampler_C<4, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler_C<4, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor_C<4, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler_C<4, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+      CflSubsampler_C<4, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler_C<4, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x4] =
+      CflIntraPredictor_C<8, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler_C<8, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+      CflSubsampler_C<8, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler_C<8, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x8] =
+      CflIntraPredictor_C<8, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler_C<8, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+      CflSubsampler_C<8, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler_C<8, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor_C<8, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler_C<8, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+      CflSubsampler_C<8, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler_C<8, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor_C<8, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler_C<8, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+      CflSubsampler_C<8, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler_C<8, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor_C<16, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler_C<16, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+      CflSubsampler_C<16, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler_C<16, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor_C<16, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler_C<16, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+      CflSubsampler_C<16, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler_C<16, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor_C<16, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler_C<16, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+      CflSubsampler_C<16, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler_C<16, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor_C<16, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler_C<16, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+      CflSubsampler_C<16, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler_C<16, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor_C<32, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler_C<32, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+      CflSubsampler_C<32, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler_C<32, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor_C<32, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler_C<32, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+      CflSubsampler_C<32, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler_C<32, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor_C<32, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler_C<32, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+      CflSubsampler_C<32, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler_C<32, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  // Cfl predictors are available only for transform sizes with max(width,
+  // height) <= 32. Set all others to nullptr.
+  for (const auto i : kTransformSizesLargerThan32x32) {
+    dsp->cfl_intra_predictors[i] = nullptr;
+    for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+      dsp->cfl_subsamplers[i][j] = nullptr;
+    }
+  }
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+#undef INIT_CFL_INTRAPREDICTOR_WxH
+#undef INIT_CFL_INTRAPREDICTORS
+
+}  // namespace
+
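+// Installs the C implementations selected above into the per-bitdepth Dsp
+// tables. Invoked once during DSP initialization (see src/dsp/dsp.cc).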
+void IntraPredCflInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/intrapred_cfl.h b/src/dsp/intrapred_cfl.h
new file mode 100644
index 0000000..4e8a11f
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting values let each module determine at
+// compile time whether an implementation is needed.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_cfl_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_cfl_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
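+//
+// As an illustration (the exact value is arch-specific), a SIMD header such
+// as intrapred_cfl_neon.h claims a function with a define along the lines of
+//   #define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+// which disables the matching #ifndef block in intrapred_cfl.cc.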
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers.
+// This function is not thread-safe.
+void IntraPredCflInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
diff --git a/src/dsp/intrapred_cfl_test.cc b/src/dsp/intrapred_cfl_test.cc
new file mode 100644
index 0000000..53f3075
--- /dev/null
@@ -0,0 +1,1167 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+const char* const kCflIntraPredName = "kCflIntraPredictor";
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+                          public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  IntraPredTestBase() {
+    switch (tx_size_) {
+      case kNumTransformSizes:
+        EXPECT_NE(tx_size_, kNumTransformSizes);
+        break;
+      default:
+        block_width_ = kTransformWidth[tx_size_];
+        block_height_ = kTransformHeight[tx_size_];
+        break;
+    }
+  }
+
+  IntraPredTestBase(const IntraPredTestBase&) = delete;
+  IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+  ~IntraPredTestBase() override = default;
+
+ protected:
+  struct IntraPredMem {
+    void Reset(libvpx_test::ACMRandom* rnd) {
+      ASSERT_NE(rnd, nullptr);
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      const int mask = (1 << bitdepth) - 1;
+      for (auto& r : ref_src) r = rnd->Rand16() & mask;
+      for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+      for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+      // Some directional predictors require the top-right and bottom-left
+      // pixels.
+      for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = rnd->Rand16() & mask;
+        top[i] = rnd->Rand16() & mask;
+      }
+      // TODO(jzern): reorder this and regenerate the digests after switching
+      // random number generators.
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      left[-1] = rnd->Rand16() & mask;
+      left[-2] = rnd->Rand16() & mask;
+      top[-2] = rnd->Rand16() & mask;
+      memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+      memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+      memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+             sizeof(top_mem[0]) * kTopMemPadding);
+    }
+
+    // Set ref_src, top-left, top and left to |pixel|.
+    void Set(const Pixel pixel) {
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      for (auto& r : ref_src) r = pixel;
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = top[i] = pixel;
+      }
+    }
+
+    // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+    static constexpr int kTopMemPadding = 7;
+    alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+    alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
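+    // left/top are addressed as left_mem + 16 and top_mem + 16 above; the
+    // extra 16 pixels leave headroom for negative indices such as [-2].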
+    alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+    alignas(
+        kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+  };
+
+  void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+  const TransformSize tx_size_ = GetParam();
+  int block_width_;
+  int block_height_;
+  IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// CflIntraPredTest
+
+template <int bitdepth, typename Pixel>
+class CflIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  CflIntraPredTest() = default;
+  CflIntraPredTest(const CflIntraPredTest&) = delete;
+  CflIntraPredTest& operator=(const CflIntraPredTest&) = delete;
+  ~CflIntraPredTest() override = default;
+
+ protected:
+  using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+  using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+  void SetUp() override {
+    IntraPredTestBase<bitdepth, Pixel>::SetUp();
+    IntraPredCflInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
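+    // Capture the C implementation before any arch-specific init overwrites
+    // the table; it serves as the reference implementation below.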
+    base_cfl_intra_pred_ = dsp->cfl_intra_predictors[tx_size_];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_cfl_intra_pred_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraPredCflInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      IntraPredCflInit_SSE4_1();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    cur_cfl_intra_pred_ = dsp->cfl_intra_predictors[tx_size_];
+
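+    // If the arch-specific init did not replace the C implementation, null
+    // out the current pointer so the speed test skips redundant C runs.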
+    if (cur_cfl_intra_pred_ == base_cfl_intra_pred_) {
+      cur_cfl_intra_pred_ = nullptr;
+    }
+  }
+
+  // This test modifies intra_pred_mem_.
+  void TestSpeed(const char* digest, int num_runs);
+  void TestSaturatedValues();
+  void TestRandomValues();
+
+  CflIntraPredictorFunc base_cfl_intra_pred_;
+  CflIntraPredictorFunc cur_cfl_intra_pred_;
+};
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestSpeed(const char* const digest,
+                                                  const int num_runs) {
+  if (cur_cfl_intra_pred_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+  const int alpha = rnd(33) - 16;
+  const int dc = rnd(1 << bitdepth);
+  const int max_luma = ((1 << bitdepth) - 1) << 3;
+  for (int i = 0; i < block_height_; ++i) {
+    for (int j = 0; j < block_width_; ++j) {
+      if (i < kCflLumaBufferStride && j < kCflLumaBufferStride) {
+        luma[i][j] = max_luma - rnd(max_luma << 1);
+      }
+    }
+  }
+  for (auto& r : intra_pred_mem_.ref_src) r = dc;
+
+  absl::Duration elapsed_time;
+  for (int run = 0; run < num_runs; ++run) {
+    const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const absl::Time start = absl::Now();
+    cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma, alpha);
+    elapsed_time += absl::Now() - start;
+  }
+  test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest,
+                             intra_pred_mem_.dst, sizeof(intra_pred_mem_.dst),
+                             elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+  // Skip the 'C' test case as this is used as the reference.
+  if (base_cfl_intra_pred_ == nullptr) return;
+
+  int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+  for (auto& line : luma_buffer) {
+    for (auto& luma : line) luma = ((1 << bitdepth) - 1) << 3;
+  }
+
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  static constexpr int kSaturatedAlpha[] = {-16, 16};
+  for (const int alpha : kSaturatedAlpha) {
+    for (auto& r : intra_pred_mem_.ref_src) r = (1 << bitdepth) - 1;
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+    base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha);
+    cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha);
+    if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+                                   block_width_, block_height_, kMaxBlockSize,
+                                   kMaxBlockSize, true)) {
+      ADD_FAILURE() << "Result from optimized version of CFL with alpha "
+                    << alpha << " differs from reference.";
+      break;
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+  // Skip the 'C' test case as this is used as the reference.
+  if (base_cfl_intra_pred_ == nullptr) return;
+  int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+
+  const int max_luma = ((1 << bitdepth) - 1) << 3;
+  // Use an alternate seed to differentiate this test from TestSpeed().
+  libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+  for (auto& line : luma_buffer) {
+    for (auto& luma : line) luma = max_luma - rnd(max_luma << 1);
+  }
+  const int dc = rnd(1 << bitdepth);
+  for (auto& r : intra_pred_mem_.ref_src) r = dc;
+  static constexpr int kSaturatedAlpha[] = {-16, 16};
+  for (const int alpha : kSaturatedAlpha) {
+    intra_pred_mem_.Reset(&rnd);
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+    base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha);
+    cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha);
+    if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+                                   block_width_, block_height_, kMaxBlockSize,
+                                   kMaxBlockSize, true)) {
+      ADD_FAILURE() << "Result from optimized version of CFL with alpha "
+                    << alpha << " differs from reference.";
+      break;
+    }
+  }
+}
+
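+// Exercises the luma subsamplers for a fixed SubsamplingType, using the same
+// base/current comparison scheme as CflIntraPredTest above.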
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+class CflSubsamplerTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  CflSubsamplerTest() = default;
+  CflSubsamplerTest(const CflSubsamplerTest&) = delete;
+  CflSubsamplerTest& operator=(const CflSubsamplerTest&) = delete;
+  ~CflSubsamplerTest() override = default;
+
+ protected:
+  using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+  using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+  void SetUp() override {
+    IntraPredTestBase<bitdepth, Pixel>::SetUp();
+    IntraPredCflInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_cfl_subsampler_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraPredCflInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      IntraPredCflInit_SSE4_1();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    cur_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type];
+  }
+
+  // This test modifies intra_pred_mem_.
+  void TestSpeed(const char* digest, int num_runs);
+  void TestSaturatedValues();
+  void TestRandomValues();
+
+  enum SubsamplingType SubsamplingType() const { return subsampling_type; }
+
+  CflSubsamplerFunc base_cfl_subsampler_;
+  CflSubsamplerFunc cur_cfl_subsampler_;
+};
+
+// When a dimension is subsampled, the luma source and the chroma output never
+// both have the minimum extent in that dimension, so the luma dimension is
+// doubled when the chroma block's extent is 4.
+int GetLumaWidth(int block_width, SubsamplingType subsampling_type) {
+  if (block_width == 4) {
+    const int width_shift =
+        static_cast<int>(subsampling_type != kSubsamplingType444);
+    return block_width << width_shift;
+  }
+  return block_width;
+}
+
+int GetLumaHeight(int block_height, SubsamplingType subsampling_type) {
+  if (block_height == 4) {
+    const int height_shift =
+        static_cast<int>(subsampling_type == kSubsamplingType420);
+    return block_height << height_shift;
+  }
+  return block_height;
+}
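+// e.g. with 4:2:0 subsampling a 4x4 chroma block reads from an 8x8 luma
+// region; with 4:2:2 only the width is doubled (8x4 luma).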
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestSpeed(
+    const char* const digest, const int num_runs) {
+  // When assembly implementations are compiled in, the C init leaves these
+  // table entries unset, so the plain C run has nothing to measure here.
+  if (cur_cfl_subsampler_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+
+  const int width = GetLumaWidth(block_width_, subsampling_type);
+  const int height = GetLumaHeight(block_height_, subsampling_type);
+  Pixel* src = intra_pred_mem_.ref_src;
+#if LIBGAV1_MSAN
+  // Quiet 10bpp CflSubsampler420_NEON() msan warning.
+  memset(src, 0, sizeof(intra_pred_mem_.ref_src));
+#endif
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      src[j] = rnd.RandRange(1 << bitdepth);
+    }
+    src += kMaxBlockSize;
+  }
+  const absl::Time start = absl::Now();
+  int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+  const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+  for (int run = 0; run < num_runs; ++run) {
+    cur_cfl_subsampler_(luma, width, height, intra_pred_mem_.ref_src, stride);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest,
+                             luma, sizeof(luma), elapsed_time);
+}
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel,
+                       subsampling_type>::TestSaturatedValues() {
+  if (base_cfl_subsampler_ == nullptr) return;
+  const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
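+  // Sweep max_luma_width/max_luma_height downward to exercise the paths where
+  // the visible luma area is smaller than the block being predicted.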
+  for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0;
+       width -= 8) {
+    for (int height = GetLumaHeight(block_height_, subsampling_type);
+         height > 0; height -= 8) {
+      Pixel* src = intra_pred_mem_.ref_src;
+      for (int y = 0; y < height; ++y) {
+        Memset(src, (1 << bitdepth) - 1, width);
+        Memset(src + width, 0, kMaxBlockSize - width);
+        src += kMaxBlockSize;
+      }
+      Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0,
+             kMaxBlockSize * (kMaxBlockSize - height));
+
+      int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+      int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+      base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src,
+                           stride);
+      cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src,
+                          stride);
+      if (!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]),
+                                     reinterpret_cast<uint16_t*>(luma_base[0]),
+                                     block_width_, block_height_,
+                                     kCflLumaBufferStride, kCflLumaBufferStride,
+                                     true)) {
+        FAIL() << "Result from optimized version of CFL subsampler"
+               << " differs from reference. max_luma_width: " << width
+               << " max_luma_height: " << height;
+      }
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestRandomValues() {
+  if (base_cfl_subsampler_ == nullptr) return;
+  const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+  // Use an alternate seed to differentiate this test from TestSpeed().
+  libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+  for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0;
+       width -= 8) {
+    for (int height = GetLumaHeight(block_height_, subsampling_type);
+         height > 0; height -= 8) {
+      Pixel* src = intra_pred_mem_.ref_src;
+      for (int i = 0; i < height; ++i) {
+        for (int j = 0; j < width; ++j) {
+          src[j] = rnd.RandRange(1 << bitdepth);
+        }
+        Memset(src + width, 0, kMaxBlockSize - width);
+        src += kMaxBlockSize;
+      }
+      Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0,
+             kMaxBlockSize * (kMaxBlockSize - height));
+
+      int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+      int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+      base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src,
+                           stride);
+      cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src,
+                          stride);
+      if (!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]),
+                                     reinterpret_cast<uint16_t*>(luma_base[0]),
+                                     block_width_, block_height_,
+                                     kCflLumaBufferStride, kCflLumaBufferStride,
+                                     true)) {
+        FAIL() << "Result from optimized version of CFL subsampler"
+               << " differs from reference. max_luma_width: " << width
+               << " max_luma_height: " << height;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
+using CflIntraPredTest8bpp = CflIntraPredTest<8, uint8_t>;
+
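+// Golden MD5 digests of the 8bpp predictor output; FixedInput and the speed
+// test check their results against these.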
+const char* GetCflIntraPredDigest8bpp(TransformSize tx_size) {
+  static const char* const kDigest4x4 = "9ea7088e082867fd5ae394ca549fe1ed";
+  static const char* const kDigest4x8 = "323b0b4784b6658da781398e61f2da3d";
+  static const char* const kDigest4x16 = "99eb9c65f227ca7f71dcac24645a4fec";
+  static const char* const kDigest8x4 = "e8e782e31c94f3974b87b93d455262d8";
+  static const char* const kDigest8x8 = "23ab9fb65e7bbbdb985709e115115eb5";
+  static const char* const kDigest8x16 = "52f5add2fc4bbb2ff893148645e95b9c";
+  static const char* const kDigest8x32 = "283fdee9af8afdb76f72dd7339c92c3c";
+  static const char* const kDigest16x4 = "eead35f515b1aa8b5175b283192b86e6";
+  static const char* const kDigest16x8 = "5778e934254eaab04230bc370f64f778";
+  static const char* const kDigest16x16 = "4e8ed38ccba0d62f1213171da2212ed3";
+  static const char* const kDigest16x32 = "61a29bd7699e18ca6ea5641d1d023bfd";
+  static const char* const kDigest32x8 = "7f31607bd4f9ec879aa47f4daf9c7bb0";
+  static const char* const kDigest32x16 = "eb84dfab900fa6a90e132b186b4c6c36";
+  static const char* const kDigest32x32 = "e0ff35d407cb214578d61ef419c94237";
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigest4x4;
+    case kTransformSize4x8:
+      return kDigest4x8;
+    case kTransformSize4x16:
+      return kDigest4x16;
+    case kTransformSize8x4:
+      return kDigest8x4;
+    case kTransformSize8x8:
+      return kDigest8x8;
+    case kTransformSize8x16:
+      return kDigest8x16;
+    case kTransformSize8x32:
+      return kDigest8x32;
+    case kTransformSize16x4:
+      return kDigest16x4;
+    case kTransformSize16x8:
+      return kDigest16x8;
+    case kTransformSize16x16:
+      return kDigest16x16;
+    case kTransformSize16x32:
+      return kDigest16x32;
+    case kTransformSize32x8:
+      return kDigest32x8;
+    case kTransformSize32x16:
+      return kDigest32x16;
+    case kTransformSize32x32:
+      return kDigest32x32;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(CflIntraPredTest8bpp, DISABLED_Speed) {
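+  // Scale the run count inversely with the block area so each transform size
+  // performs a comparable amount of total work.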
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), num_runs);
+}
+
+TEST_P(CflIntraPredTest8bpp, FixedInput) {
+  TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), 1);
+}
+
+TEST_P(CflIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflIntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+using CflSubsamplerTest8bpp444 =
+    CflSubsamplerTest<8, uint8_t, kSubsamplingType444>;
+using CflSubsamplerTest8bpp422 =
+    CflSubsamplerTest<8, uint8_t, kSubsamplingType422>;
+using CflSubsamplerTest8bpp420 =
+    CflSubsamplerTest<8, uint8_t, kSubsamplingType420>;
+
+const char* GetCflSubsamplerDigest8bpp(TransformSize tx_size,
+                                       SubsamplingType subsampling_type) {
+  static const char* const kDigests4x4[3] = {
+      "a8fa98d76cc3ccffcffc0d02dfae052c", "929cf2c23d926b500616797f8b1baf5b",
+      "1d03f091956838e7f2b113aabd8b9da9"};
+  static const char* const kDigests4x8[3] = {
+      "717b84f867f413c87c90a7c5d0125c8c", "6ccd9f48842b1a802e128b46b8f4885d",
+      "68a334f5d2abecbc78562b3280b5fb0c"};
+  static const char* const kDigests4x16[3] = {
+      "ecd1340b7e065dd8807fd9861abb7d99", "042c3fee17df7ef8fb8cef616f212a91",
+      "b0600f0bc3fbfc374bb3628360dcae5c"};
+  static const char* const kDigests8x4[3] = {
+      "4ea5617f4ed8e9edc2fff88d0ab8e53f", "b02288905f218c9f54ce4a472ec7b22e",
+      "3522d3a4dd3839d1a86fb39b31a86d52"};
+  static const char* const kDigests8x8[3] = {
+      "a0488493e6bcdb868713a95f9b4a0091", "ff6c1ac1d94fce63c282ba49186529bf",
+      "082e34ba04d04d7cd6fe408823987602"};
+  static const char* const kDigests8x16[3] = {
+      "e01dd4bb21daaa6e991cd5b1e6f30300", "2a1b13f932e39cc5f561afea9956f47a",
+      "d8d266282cb7123f780bd7266e8f5913"};
+  static const char* const kDigests8x32[3] = {
+      "0fc95e4ab798b95ccd2966ff75028b03", "6bc6e45ef2f664134449342fe76006ff",
+      "d294fb6399edaa267aa167407c0ebccb"};
+  static const char* const kDigests16x4[3] = {
+      "4798c2cf649b786bd153ad88353d52aa", "43a4bfa3b8caf4b72f58c6a1d1054f64",
+      "a928ebbec2db1508c8831a440d82eb98"};
+  static const char* const kDigests16x8[3] = {
+      "736b7f5b603cb34abcbe1b7e69b6ce93", "90422000ab20ecb519e4d277a9b3ea2b",
+      "c8e71c2fddbb850c5a50592ee5975368"};
+  static const char* const kDigests16x16[3] = {
+      "4f15a694966ee50a9e987e9a0aa2423b", "9e31e2f5a7ce7bef738b135755e25dcd",
+      "2ffeed4d592a0455f6d888913969827f"};
+  static const char* const kDigests16x32[3] = {
+      "3a10438bfe17ea39efad20608a0520eb", "79e8e8732a6ffc29dfbb0b3fc29c2883",
+      "185ca976ccbef7fb5f3f8c6aa22d5a79"};
+  static const char* const kDigests32x8[3] = {
+      "683704f08839a15e42603e4977a3e815", "13d311635372aee8998fca1758e75e20",
+      "9847d88eaaa57c086a2e6aed583048d3"};
+  static const char* const kDigests32x16[3] = {
+      "14b6761bf9f1156cf2496f532512aa99", "ee57bb7f0aa2302d29cdc1bfce72d5fc",
+      "a4189655fe714b82eb88cb5092c0ad76"};
+  static const char* const kDigests32x32[3] = {
+      "dcfbe71b70a37418ccb90dbf27f04226", "c578556a584019c1bdc2d0c3b9fd0c88",
+      "db200bc8ccbeacd6a42d6b8e5ad1d931"};
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4[subsampling_type];
+    case kTransformSize4x8:
+      return kDigests4x8[subsampling_type];
+    case kTransformSize4x16:
+      return kDigests4x16[subsampling_type];
+    case kTransformSize8x4:
+      return kDigests8x4[subsampling_type];
+    case kTransformSize8x8:
+      return kDigests8x8[subsampling_type];
+    case kTransformSize8x16:
+      return kDigests8x16[subsampling_type];
+    case kTransformSize8x32:
+      return kDigests8x32[subsampling_type];
+    case kTransformSize16x4:
+      return kDigests16x4[subsampling_type];
+    case kTransformSize16x8:
+      return kDigests16x8[subsampling_type];
+    case kTransformSize16x16:
+      return kDigests16x16[subsampling_type];
+    case kTransformSize16x32:
+      return kDigests16x32[subsampling_type];
+    case kTransformSize32x8:
+      return kDigests32x8[subsampling_type];
+    case kTransformSize32x16:
+      return kDigests32x16[subsampling_type];
+    case kTransformSize32x32:
+      return kDigests32x32[subsampling_type];
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(CflSubsamplerTest8bpp444, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp444, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp444, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp444, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest8bpp422, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp422, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp422, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp422, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest8bpp420, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp420, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp420, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp420, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using CflIntraPredTest10bpp = CflIntraPredTest<10, uint16_t>;
+
+const char* GetCflIntraPredDigest10bpp(TransformSize tx_size) {
+  static const char* const kDigest4x4 = "b4ca5f6fbb643a94eb05d59976d44c5d";
+  static const char* const kDigest4x8 = "040139b76ee22af05c56baf887d3d43b";
+  static const char* const kDigest4x16 = "4a1d59ace84ff07e68a0d30e9b1cebdd";
+  static const char* const kDigest8x4 = "c2c149cea5fdcd18bfe5c19ec2a8aa90";
+  static const char* const kDigest8x8 = "68ad90bd6f409548fa5551496b7cb0d0";
+  static const char* const kDigest8x16 = "bdc54eff4de8c5d597b03afaa705d3fe";
+  static const char* const kDigest8x32 = "362aebc6d68ff0d312d55dcd6a8a927d";
+  static const char* const kDigest16x4 = "349e813aedd211581c5e64ba1938eaa7";
+  static const char* const kDigest16x8 = "35c64f6da17f836618b5804185cf3eef";
+  static const char* const kDigest16x16 = "95be0c78dbd8dda793c62c6635b4bfb7";
+  static const char* const kDigest16x32 = "4752b9eda069854d3f5c56d3f2057e79";
+  static const char* const kDigest32x8 = "dafc5e973e4b6a55861f4586a11b7dd1";
+  static const char* const kDigest32x16 = "1e177ed3914a165183916aca1d01bb74";
+  static const char* const kDigest32x32 = "4c9ab3cf9baa27bb34e29729dabc1ea6";
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigest4x4;
+    case kTransformSize4x8:
+      return kDigest4x8;
+    case kTransformSize4x16:
+      return kDigest4x16;
+    case kTransformSize8x4:
+      return kDigest8x4;
+    case kTransformSize8x8:
+      return kDigest8x8;
+    case kTransformSize8x16:
+      return kDigest8x16;
+    case kTransformSize8x32:
+      return kDigest8x32;
+    case kTransformSize16x4:
+      return kDigest16x4;
+    case kTransformSize16x8:
+      return kDigest16x8;
+    case kTransformSize16x16:
+      return kDigest16x16;
+    case kTransformSize16x32:
+      return kDigest16x32;
+    case kTransformSize32x8:
+      return kDigest32x8;
+    case kTransformSize32x16:
+      return kDigest32x16;
+    case kTransformSize32x32:
+      return kDigest32x32;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(CflIntraPredTest10bpp, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), num_runs);
+}
+
+TEST_P(CflIntraPredTest10bpp, FixedInput) {
+  TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), 1);
+}
+
+TEST_P(CflIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflIntraPredTest10bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+using CflSubsamplerTest10bpp444 =
+    CflSubsamplerTest<10, uint16_t, kSubsamplingType444>;
+using CflSubsamplerTest10bpp422 =
+    CflSubsamplerTest<10, uint16_t, kSubsamplingType422>;
+using CflSubsamplerTest10bpp420 =
+    CflSubsamplerTest<10, uint16_t, kSubsamplingType420>;
+
+const char* GetCflSubsamplerDigest10bpp(TransformSize tx_size,
+                                        SubsamplingType subsampling_type) {
+  static const char* const kDigests4x4[3] = {
+      "a8abcad9a6c9b046a100689135a108cb", "01081c2a0d0c15dabdbc725be5660451",
+      "93d1d9df2861240d88f5618e42178654"};
+  static const char* const kDigests4x8[3] = {
+      "d1fd8cd0709ca6634ad85f3e331672e1", "0d603fcc910aca3db41fc7f64e826c27",
+      "cf88b6d1b7b025cfa0082361775aeb75"};
+  static const char* const kDigests4x16[3] = {
+      "ce2e036a950388a564d8637b1416a6c6", "6c36c46cd72057a6b36bc12188b6d22c",
+      "0884a0e53384cd5173035ad8966d8f2f"};
+  static const char* const kDigests8x4[3] = {
+      "174e961983ed71fb105ed71aa3f9daf5", "330946cc369a534618a1014b4e3f6f18",
+      "8070668aa389c1d09f8aaf43c1223e8c"};
+  static const char* const kDigests8x8[3] = {
+      "86884feb35217010f73ccdbadecb635e", "b8cbc646e1bf1352e5b4b599eaef1193",
+      "4a1110382e56b42d3b7a4132bccc01ee"};
+  static const char* const kDigests8x16[3] = {
+      "a694c4e1f89648ffb49efd6a1d35b300", "864b9da67d23a2f8284b28b2a1e5aa30",
+      "bd012ca1cea256dd02c231339a4cf200"};
+  static const char* const kDigests8x32[3] = {
+      "60c42201bc24e518c1a3b3b6306d8125", "4d530e47c2b7555d5f311ee910d61842",
+      "71888b17b832ef55c0cd9449c0e6b077"};
+  static const char* const kDigests16x4[3] = {
+      "6b6d5ae4cc294c070ce65ab31c5a7d4f", "0fbecee20d294939e7a0183c2b4a0b96",
+      "917cd884923139d5c05a11000722e3b6"};
+  static const char* const kDigests16x8[3] = {
+      "688c41726d9ac35fb5b18c57bca76b9c", "d439a2e0a60d672b644cd1189e2858b9",
+      "edded6d166a77a6c3ff46fddc13f372f"};
+  static const char* const kDigests16x16[3] = {
+      "feb2bad9f6bb3f60eaeaf6c1bfd89ca5", "d65cabce5fcd9a29d1dfc530e4764f3a",
+      "2f1a91898812d2c9320c7506b3a72eb4"};
+  static const char* const kDigests16x32[3] = {
+      "6f23b1851444d29633e62ce77bf09559", "4a449fd078bd0c9657cdc24b709c0796",
+      "e44e18cb8bda2d34b52c96d5b6b510be"};
+  static const char* const kDigests32x8[3] = {
+      "77bf9ba56f7e1d2f04068a8a00b139da", "a85a1dea82963dedab9a2f7ad4169b5f",
+      "d12746071bee96ddc075c6368bc9fbaf"};
+  static const char* const kDigests32x16[3] = {
+      "cce3422f7f8cf57145f979359ac92f98", "1c18738d40bfa91296e5fdb7230bf9a7",
+      "02513142d109aee10f081cacfb33d1c5"};
+  static const char* const kDigests32x32[3] = {
+      "789008e49d0276de186af968196dd4a7", "b8848b00968a7ba4787765b7214da05f",
+      "12d13828db57605b00ce99469489651d"};
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4[subsampling_type];
+    case kTransformSize4x8:
+      return kDigests4x8[subsampling_type];
+    case kTransformSize4x16:
+      return kDigests4x16[subsampling_type];
+    case kTransformSize8x4:
+      return kDigests8x4[subsampling_type];
+    case kTransformSize8x8:
+      return kDigests8x8[subsampling_type];
+    case kTransformSize8x16:
+      return kDigests8x16[subsampling_type];
+    case kTransformSize8x32:
+      return kDigests8x32[subsampling_type];
+    case kTransformSize16x4:
+      return kDigests16x4[subsampling_type];
+    case kTransformSize16x8:
+      return kDigests16x8[subsampling_type];
+    case kTransformSize16x16:
+      return kDigests16x16[subsampling_type];
+    case kTransformSize16x32:
+      return kDigests16x32[subsampling_type];
+    case kTransformSize32x8:
+      return kDigests32x8[subsampling_type];
+    case kTransformSize32x16:
+      return kDigests32x16[subsampling_type];
+    case kTransformSize32x32:
+      return kDigests32x32[subsampling_type];
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(CflSubsamplerTest10bpp444, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp444, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp444, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp444, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest10bpp422, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp422, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp422, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp422, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest10bpp420, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp420, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp420, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp420, Random) { TestRandomValues(); }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using CflIntraPredTest12bpp = CflIntraPredTest<12, uint16_t>;
+
+const char* GetCflIntraPredDigest12bpp(TransformSize tx_size) {
+  static const char* const kDigest4x4 = "1d92a681a58f99396f22acd8b3154e2b";
+  static const char* const kDigest4x8 = "cf6833ebc64c9ae45f192ee384ef4aa3";
+  static const char* const kDigest4x16 = "06a4fbb8590aca98a045c902ed15c777";
+  static const char* const kDigest8x4 = "ad5944c7455f731ae8dd28b2b25a1b9f";
+  static const char* const kDigest8x8 = "c19621e42ca2bc184d5065131d27be2c";
+  static const char* const kDigest8x16 = "8faa7c95e8c3c18621168ed6759c1ac1";
+  static const char* const kDigest8x32 = "502699ef7a8c7aebc8c3bc653e733703";
+  static const char* const kDigest16x4 = "7f30bb038217967336fb8548a6f7df45";
+  static const char* const kDigest16x8 = "b70943098d0fb256c2943e2ebdbe6d34";
+  static const char* const kDigest16x16 = "4c34f5669880ab78d648b16b68ea0c24";
+  static const char* const kDigest16x32 = "5d85daf690020ed235617870a1a179b1";
+  static const char* const kDigest32x8 = "f8eec12e58c469ffb698fc60b13b927c";
+  static const char* const kDigest32x16 = "f272bb7e5d2df333aa63d806c95e6748";
+  static const char* const kDigest32x32 = "c737987c0a5414b03e6014f145dd999c";
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigest4x4;
+    case kTransformSize4x8:
+      return kDigest4x8;
+    case kTransformSize4x16:
+      return kDigest4x16;
+    case kTransformSize8x4:
+      return kDigest8x4;
+    case kTransformSize8x8:
+      return kDigest8x8;
+    case kTransformSize8x16:
+      return kDigest8x16;
+    case kTransformSize8x32:
+      return kDigest8x32;
+    case kTransformSize16x4:
+      return kDigest16x4;
+    case kTransformSize16x8:
+      return kDigest16x8;
+    case kTransformSize16x16:
+      return kDigest16x16;
+    case kTransformSize16x32:
+      return kDigest16x32;
+    case kTransformSize32x8:
+      return kDigest32x8;
+    case kTransformSize32x16:
+      return kDigest32x16;
+    case kTransformSize32x32:
+      return kDigest32x32;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(CflIntraPredTest12bpp, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflIntraPredDigest12bpp(tx_size_), num_runs);
+}
+
+TEST_P(CflIntraPredTest12bpp, FixedInput) {
+  TestSpeed(GetCflIntraPredDigest12bpp(tx_size_), 1);
+}
+
+TEST_P(CflIntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflIntraPredTest12bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+using CflSubsamplerTest12bpp444 =
+    CflSubsamplerTest<12, uint16_t, kSubsamplingType444>;
+using CflSubsamplerTest12bpp422 =
+    CflSubsamplerTest<12, uint16_t, kSubsamplingType422>;
+using CflSubsamplerTest12bpp420 =
+    CflSubsamplerTest<12, uint16_t, kSubsamplingType420>;
+
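+// Reading aid: each digest array below is indexed by SubsamplingType, so
+// entry [0] is 4:4:4, [1] is 4:2:2 and [2] is 4:2:0, matching the order of
+// the test class aliases above.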
+const char* GetCflSubsamplerDigest12bpp(TransformSize tx_size,
+                                        SubsamplingType subsampling_type) {
+  static const char* const kDigests4x4[3] = {
+      "44af37c60e9ccaacea004b57d5dea4cf",
+      "e29dd1d93f23b23778ed8cd85910d987",
+      "81e5dac2fd4c90f872ab814ed0f76ae5",
+  };
+  static const char* const kDigests4x8[3] = {
+      "bfc04aed9fe41ec07b0462a219652d16",
+      "693dd064636a0aa3be7aa098e867c512",
+      "0636c25d88aacd85d63e56011e7c5d15",
+  };
+  static const char* const kDigests4x16[3] = {
+      "6479ab30377288e75a78068d47c7e194",
+      "7d6f9b8b3eb85e73626118fc9210e622",
+      "1f3d474cd7c86899da90e515b8b7a906",
+  };
+  static const char* const kDigests8x4[3] = {
+      "7da5a2029bcdab159225c475fdff02da",
+      "096bfef24caa0670d2cd7b0bb63a7ba6",
+      "f749310dfc8a6129ed438dbc845470c0",
+  };
+  static const char* const kDigests8x8[3] = {
+      "08494051a7ff50718313a79ec7c51f92",
+      "637efad0630e253f7cce11af1a0af456",
+      "b220faf7dfedef860d59079dcf201757",
+  };
+  static const char* const kDigests8x16[3] = {
+      "19f027af516e88d3b9e613e578deb126",
+      "4f3bb155d70f9ea76d05b2f41b297a0c",
+      "b7504347eeda1e59ba8e36385c219e40",
+  };
+  static const char* const kDigests8x32[3] = {
+      "b8f1ef01c5672c87ee1004bb3cd7b8bc",
+      "b3e3318b050eb1c165d1e320ef622fa7",
+      "67754f7c5ae84dc23bb76ffaa2fa848e",
+  };
+  static const char* const kDigests16x4[3] = {
+      "f687fb4e22d8a1446eeb4915036874f4",
+      "7b5ef3d393a98dfe0ba49a0db2083465",
+      "840bbb6edaa50e9f7d391033a3dda2d9",
+  };
+  static const char* const kDigests16x8[3] = {
+      "dd9aed11d115a028035f0cee5b90d433",
+      "340d5d0784356ea199d3d751f4d6ed5e",
+      "e55f6fb5f34d829727e9dc2068098933",
+  };
+  static const char* const kDigests16x16[3] = {
+      "1df36a20d76a405c6273b88b38693cf9",
+      "2a7590d01df60b4bc6f10bfdb07b7a65",
+      "510ee31a5bd609e8f4542bb817539668",
+  };
+  static const char* const kDigests16x32[3] = {
+      "bdbc13b9fb7c3c50d25fda57f86f5ad9",
+      "7c138c568794b3d0c8aabff2edc07efd",
+      "581bef267c2a66e4c2fb079968440dbe",
+  };
+  static const char* const kDigests32x8[3] = {
+      "26f62743793811475e2afe1414c5fee1",
+      "6e6bf1678a04f2f727f0679564fb3630",
+      "a4c15562c26dbcfa43fe03a2b6e728b5",
+  };
+  static const char* const kDigests32x16[3] = {
+      "791f0713bbf032081da8ec08e58b9cd3",
+      "5dc7a673e92767186ae86996f4a30691",
+      "651f09d1244c817d92d1baa094c86f56",
+  };
+  static const char* const kDigests32x32[3] = {
+      "543a9d76e7238d88ba86218ec47c1f49",
+      "b0f2b29aae4858c1f09c27fc4344fd15",
+      "1d45083875fed14c4e5f149384a3cd2d",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4[subsampling_type];
+    case kTransformSize4x8:
+      return kDigests4x8[subsampling_type];
+    case kTransformSize4x16:
+      return kDigests4x16[subsampling_type];
+    case kTransformSize8x4:
+      return kDigests8x4[subsampling_type];
+    case kTransformSize8x8:
+      return kDigests8x8[subsampling_type];
+    case kTransformSize8x16:
+      return kDigests8x16[subsampling_type];
+    case kTransformSize8x32:
+      return kDigests8x32[subsampling_type];
+    case kTransformSize16x4:
+      return kDigests16x4[subsampling_type];
+    case kTransformSize16x8:
+      return kDigests16x8[subsampling_type];
+    case kTransformSize16x16:
+      return kDigests16x16[subsampling_type];
+    case kTransformSize16x32:
+      return kDigests16x32[subsampling_type];
+    case kTransformSize32x8:
+      return kDigests32x8[subsampling_type];
+    case kTransformSize32x16:
+      return kDigests32x16[subsampling_type];
+    case kTransformSize32x32:
+      return kDigests32x32[subsampling_type];
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(CflSubsamplerTest12bpp444, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest12bpp444, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest12bpp444, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest12bpp444, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest12bpp422, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest12bpp422, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest12bpp422, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest12bpp422, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest12bpp420, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest12bpp420, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest12bpp420, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest12bpp420, Random) { TestRandomValues(); }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+// Cfl predictors are available only for transform sizes with
+// max(width, height) <= 32.
+constexpr TransformSize kTransformSizesSmallerThan32x32[] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize32x8,
+    kTransformSize32x16, kTransformSize32x32};
+
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp422,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp422,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest12bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp422,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+  return os << ToString(tx_size);
+}
+
+}  // namespace libgav1
diff --git a/src/dsp/intrapred_directional.cc b/src/dsp/intrapred_directional.cc
new file mode 100644 (file)
index 0000000..9146074
--- /dev/null
@@ -0,0 +1,290 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone1_C(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const int width,
+    const int height, const int xstep, const bool upsampled_top) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  assert(xstep > 0);
+
+  // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to
+  // |top[top_base_x]|. This corresponds to a 45 degree prediction.
+  if (xstep == 64) {
+    // 7.11.2.10. Intra edge upsample selection process
+    // if ( d <= 0 || d >= 40 ) useUpsample = 0
+    // For |upsampled_top| the delta is |predictor_angle - 90|. Since the
+    // |predictor_angle| is 45 the delta is also 45.
+    assert(!upsampled_top);
+    const Pixel* top_ptr = top + 1;
+    for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
+      memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
+    }
+    return;
+  }
+
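+  // Illustrative walk-through of the general path below (the numbers are
+  // only an example): with xstep == 73 and no upsampling, row 0 has
+  // top_x = 73, so top_base_x = 73 >> 6 = 1 and
+  // shift = (73 & 0x3F) >> 1 = 4, giving
+  // dst[0] = (top[1] * 28 + top[2] * 4 + 16) >> 5.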
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+  const int scale_bits = 6 - upsample_shift;
+  const int base_step = 1 << upsample_shift;
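+  // When the top edge is upsampled it holds twice as many samples, so the
+  // integer position advances by 2 per column (|base_step|) and one less
+  // fractional bit is available (|scale_bits| drops from 6 to 5).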
+  int top_x = xstep;
+  int y = 0;
+  do {
+    int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        Memset(dst, top[max_base_x], width);
+        dst += stride;
+      }
+      return;
+    }
+
+    const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+    int x = 0;
+    do {
+      if (top_base_x >= max_base_x) {
+        Memset(dst + x, top[max_base_x], width - x);
+        break;
+      }
+
+      const int val =
+          top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+      dst[x] = RightShiftWithRounding(val, 5 /*log2(32)*/);
+      top_base_x += base_step;
+    } while (++x < width);
+
+    dst += stride;
+    top_x += xstep;
+  } while (++y < height);
+}
+
+// clang 14.0.0 produces incorrect code with LIBGAV1_RESTRICT.
+// https://github.com/llvm/llvm-project/issues/54427
+#if defined(__clang__) && __clang_major__ == 14
+#define LOCAL_RESTRICT
+#else
+#define LOCAL_RESTRICT LIBGAV1_RESTRICT
+#endif
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone2_C(
+    void* LOCAL_RESTRICT const dest, ptrdiff_t stride,
+    const void* LOCAL_RESTRICT const top_row,
+    const void* LOCAL_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  assert(xstep > 0);
+  assert(ystep > 0);
+
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int scale_bits_x = 6 - upsample_top_shift;
+  const int scale_bits_y = 6 - upsample_left_shift;
+  const int min_base_x = -(1 << upsample_top_shift);
+  const int base_step_x = 1 << upsample_top_shift;
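+  // Zone 2 covers angles between 90 and 180 degrees and so blends from both
+  // edges: while the projected position |top_base_x| stays at or beyond
+  // |min_base_x| the pixel is interpolated from the top row; otherwise the
+  // projection |left_y| selects a pair of left-column samples.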
+  int y = 0;
+  int top_x = -xstep;
+  do {
+    int top_base_x = top_x >> scale_bits_x;
+    int left_y = (y << 6) - ystep;
+    int x = 0;
+    do {
+      int val;
+      if (top_base_x >= min_base_x) {
+        const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
+        val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+      } else {
+        // Note this assumes an arithmetic shift to handle negative values.
+        const int left_base_y = left_y >> scale_bits_y;
+        const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
+        assert(left_base_y >= -(1 << upsample_left_shift));
+        val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+      }
+      dst[x] = RightShiftWithRounding(val, 5);
+      top_base_x += base_step_x;
+      left_y -= ystep;
+    } while (++x < width);
+
+    top_x -= xstep;
+    dst += stride;
+  } while (++y < height);
+}
+
+#undef LOCAL_RESTRICT
+
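+// Zone 3 covers angles between 180 and 270 degrees and predicts from the
+// left column only. Note the traversal is column-major: |dst| is re-derived
+// from |dest| for every x because the reference index advances with y.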
+template <typename Pixel>
+void DirectionalIntraPredictorZone3_C(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int ystep, const bool upsampled_left) {
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  stride /= sizeof(Pixel);
+
+  assert(ystep > 0);
+
+  const int upsample_shift = static_cast<int>(upsampled_left);
+  const int scale_bits = 6 - upsample_shift;
+  const int base_step = 1 << upsample_shift;
+  // Zone3 never runs out of left_column values.
+  assert((width + height - 1) << upsample_shift >  // max_base_y
+         ((ystep * width) >> scale_bits) +
+             base_step * (height - 1));  // left_base_y
+
+  int left_y = ystep;
+  int x = 0;
+  do {
+    auto* dst = static_cast<Pixel*>(dest);
+
+    int left_base_y = left_y >> scale_bits;
+    int y = 0;
+    do {
+      const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
+      const int val =
+          left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+      dst[x] = RightShiftWithRounding(val, 5);
+      dst += stride;
+      left_base_y += base_step;
+    } while (++y < height);
+
+    left_y += ystep;
+  } while (++x < width);
+}
+
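+// Registers the C implementations in the dispatch table. When
+// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is set the C versions are installed
+// unconditionally; otherwise each slot is filled only if no specialized
+// version was reported via the LIBGAV1_Dsp8bpp_* defines pulled in from the
+// per-architecture headers.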
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint8_t>;
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint8_t>;
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone1
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone2
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone3
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void IntraPredDirectionalInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/intrapred_directional.h b/src/dsp/intrapred_directional.h
new file mode 100644 (file)
index 0000000..bcd1bc1
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_directional_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_directional_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*. This function is not
+// thread-safe.
+void IntraPredDirectionalInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
diff --git a/src/dsp/intrapred_directional_test.cc b/src/dsp/intrapred_directional_test.cc
new file mode 100644 (file)
index 0000000..2c81b27
--- /dev/null
@@ -0,0 +1,1125 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+constexpr int kNumDirectionalIntraPredictors = 3;
+
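+// Base angles of the eight AV1 directional prediction modes (D45, D67, D90,
+// D113, D135, D157, D180, D203); the tests refine each by an angle delta of
+// up to +/-9 degrees in steps of 3.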
+constexpr int kBaseAngles[] = {45, 67, 90, 113, 135, 157, 180, 203};
+
+const char* const kDirectionalPredNames[kNumDirectionalIntraPredictors] = {
+    "kDirectionalIntraPredictorZone1", "kDirectionalIntraPredictorZone2",
+    "kDirectionalIntraPredictorZone3"};
+
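+// Maps an angle between 3 and 87 degrees to the fixed-point slope used by
+// the predictors, in 1/64th-pel units; e.g. a 45 degree angle yields 64,
+// matching the xstep == 64 fast path in the C Zone1 predictor.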
+int16_t GetDirectionalIntraPredictorDerivative(const int angle) {
+  EXPECT_GE(angle, 3);
+  EXPECT_LE(angle, 87);
+  return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1];
+}
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+                          public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  IntraPredTestBase() {
+    switch (tx_size_) {
+      case kNumTransformSizes:
+        EXPECT_NE(tx_size_, kNumTransformSizes);
+        break;
+      default:
+        block_width_ = kTransformWidth[tx_size_];
+        block_height_ = kTransformHeight[tx_size_];
+        break;
+    }
+  }
+
+  IntraPredTestBase(const IntraPredTestBase&) = delete;
+  IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+  ~IntraPredTestBase() override = default;
+
+ protected:
+  struct IntraPredMem {
+    void Reset(libvpx_test::ACMRandom* rnd) {
+      ASSERT_NE(rnd, nullptr);
+#if LIBGAV1_MSAN
+      // Match the behavior of Tile::IntraPrediction to prevent warnings due to
+      // assembly code (safely) overreading to fill a register.
+      memset(left_mem, 0, sizeof(left_mem));
+      memset(top_mem, 0, sizeof(top_mem));
+#endif  // LIBGAV1_MSAN
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      const int mask = (1 << bitdepth) - 1;
+      for (auto& r : ref_src) r = rnd->Rand16() & mask;
+      for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+      for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+      // Some directional predictors require top-right, bottom-left.
+      for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = rnd->Rand16() & mask;
+        top[i] = rnd->Rand16() & mask;
+      }
+      // TODO(jzern): reorder this and regenerate the digests after switching
+      // random number generators.
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      left[-1] = rnd->Rand16() & mask;
+      left[-2] = rnd->Rand16() & mask;
+      top[-2] = rnd->Rand16() & mask;
+      memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+      memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+      memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+             sizeof(top_mem[0]) * kTopMemPadding);
+    }
+
+    // Set ref_src, top-left, top and left to |pixel|.
+    void Set(const Pixel pixel) {
+#if LIBGAV1_MSAN
+      // Match the behavior of Tile::IntraPrediction to prevent warnings due to
+      // assembly code (safely) overreading to fill a register.
+      memset(left_mem, 0, sizeof(left_mem));
+      memset(top_mem, 0, sizeof(top_mem));
+#endif  // LIBGAV1_MSAN
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      for (auto& r : ref_src) r = pixel;
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = top[i] = pixel;
+      }
+    }
+
+    // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+    static constexpr int kTopMemPadding = 7;
+    alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+    alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+    alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+    alignas(
+        kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
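+    // Layout: |left| and |top| are taken 16 pixels into these arrays so the
+    // [-1]/[-2] entries read by the upsampling paths stay in bounds, while
+    // the second kMaxBlockSize pixels (plus kTopMemPadding for |top_mem|)
+    // absorb top-right/bottom-left reads and the overread noted above.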
+  };
+
+  void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+  const TransformSize tx_size_ = GetParam();
+  int block_width_;
+  int block_height_;
+  IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// DirectionalIntraPredTest
+
+template <int bitdepth, typename Pixel>
+class DirectionalIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  DirectionalIntraPredTest() = default;
+  DirectionalIntraPredTest(const DirectionalIntraPredTest&) = delete;
+  DirectionalIntraPredTest& operator=(const DirectionalIntraPredTest&) = delete;
+  ~DirectionalIntraPredTest() override = default;
+
+ protected:
+  using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+  using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+  enum Zone { kZone1, kZone2, kZone3, kNumZones };
+
+  enum { kAngleDeltaStart = -9, kAngleDeltaStop = 9, kAngleDeltaStep = 3 };
+
+  void SetUp() override {
+    IntraPredTestBase<bitdepth, Pixel>::SetUp();
+    IntraPredDirectionalInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1;
+    base_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2;
+    base_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3;
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_directional_intra_pred_zone1_ = nullptr;
+      base_directional_intra_pred_zone2_ = nullptr;
+      base_directional_intra_pred_zone3_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraPredDirectionalInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      IntraPredDirectionalInit_SSE4_1();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    cur_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1;
+    cur_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2;
+    cur_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3;
+
+    // Skip functions that haven't been specialized for this particular
+    // architecture.
+    if (cur_directional_intra_pred_zone1_ ==
+        base_directional_intra_pred_zone1_) {
+      cur_directional_intra_pred_zone1_ = nullptr;
+    }
+    if (cur_directional_intra_pred_zone2_ ==
+        base_directional_intra_pred_zone2_) {
+      cur_directional_intra_pred_zone2_ = nullptr;
+    }
+    if (cur_directional_intra_pred_zone3_ ==
+        base_directional_intra_pred_zone3_) {
+      cur_directional_intra_pred_zone3_ = nullptr;
+    }
+  }
+
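+  // Mirrors the edge upsample selection of section 7.11.2.10: upsampling is
+  // used only for small blocks and small angle deltas. For example, a 4x4
+  // block (block_wh == 8) with filter_type 0 and predictor angle 93 has a
+  // top-edge delta of 3, so the top edge is upsampled.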
+  bool IsEdgeUpsampled(int delta, const int filter_type) const {
+    delta = std::abs(delta);
+    if (delta == 0 || delta >= 40) return false;
+    const int block_wh = block_width_ + block_height_;
+    return (filter_type == 1) ? block_wh <= 8 : block_wh <= 16;
+  }
+
+  // Returns the minimum and maximum range of angles that the predictor
+  // should be applied to. TestSpeed() treats the bounds as inclusive;
+  // TestSaturatedValues() and TestRandomValues() treat them as exclusive.
+  void GetZoneAngleRange(const Zone zone, int* const min_angle,
+                         int* const max_angle) const {
+    ASSERT_NE(min_angle, nullptr);
+    ASSERT_NE(max_angle, nullptr);
+    switch (zone) {
+        // The overall minimum angle comes from mode D45_PRED, yielding:
+        // min_angle = 45-(MAX_ANGLE_DELTA*ANGLE_STEP) = 36
+        // The overall maximum angle comes from mode D203_PRED, yielding:
+        // max_angle = 203+(MAX_ANGLE_DELTA*ANGLE_STEP) = 212
+        // The angles 180 and 90 are not permitted because they correspond to
+        // V_PRED and H_PRED, which are handled in distinct functions.
+      case kZone1:
+        *min_angle = 36;
+        *max_angle = 87;
+        break;
+      case kZone2:
+        *min_angle = 93;
+        *max_angle = 177;
+        break;
+      case kZone3:
+        *min_angle = 183;
+        *max_angle = 212;
+        break;
+      case kNumZones:
+        FAIL() << "Invalid zone value: " << zone;
+        break;
+    }
+  }
+
+  // These tests modify intra_pred_mem_.
+  void TestSpeed(const char* const digests[kNumDirectionalIntraPredictors],
+                 Zone zone, int num_runs);
+  void TestSaturatedValues();
+  void TestRandomValues();
+
+  DirectionalIntraPredictorZone1Func base_directional_intra_pred_zone1_;
+  DirectionalIntraPredictorZone2Func base_directional_intra_pred_zone2_;
+  DirectionalIntraPredictorZone3Func base_directional_intra_pred_zone3_;
+  DirectionalIntraPredictorZone1Func cur_directional_intra_pred_zone1_;
+  DirectionalIntraPredictorZone2Func cur_directional_intra_pred_zone2_;
+  DirectionalIntraPredictorZone3Func cur_directional_intra_pred_zone3_;
+};
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestSpeed(
+    const char* const digests[kNumDirectionalIntraPredictors], const Zone zone,
+    const int num_runs) {
+  switch (zone) {
+    case kZone1:
+      if (cur_directional_intra_pred_zone1_ == nullptr) return;
+      break;
+    case kZone2:
+      if (cur_directional_intra_pred_zone2_ == nullptr) return;
+      break;
+    case kZone3:
+      if (cur_directional_intra_pred_zone3_ == nullptr) return;
+      break;
+    case kNumZones:
+      FAIL() << "Invalid zone value: " << zone;
+      break;
+  }
+  ASSERT_NE(digests, nullptr);
+  const Pixel* const left = intra_pred_mem_.left_mem + 16;
+  const Pixel* const top = intra_pred_mem_.top_mem + 16;
+
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  intra_pred_mem_.Reset(&rnd);
+
+  // Allocate separate blocks for each angle + filter + upsampled combination.
+  // Add a 1 pixel right border to test for overwrites.
+  static constexpr int kMaxZoneAngles = 27;  // zone 2
+  static constexpr int kMaxFilterTypes = 2;
+  static constexpr int kBlockBorder = 1;
+  static constexpr int kBorderSize =
+      kBlockBorder * kMaxZoneAngles * kMaxFilterTypes;
+  const int ref_stride =
+      kMaxZoneAngles * kMaxFilterTypes * block_width_ + kBorderSize;
+  const size_t ref_alloc_size = sizeof(Pixel) * ref_stride * block_height_;
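+  // kMaxZoneAngles = 27 is reached in zone 2 (93..177 degrees): base angles
+  // 113, 135 and 157 contribute 7 deltas each, and base angles 90 and 180
+  // contribute 3 each (93/96/99 and 171/174/177), i.e. 21 + 3 + 3 = 27.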
+
+  using AlignedPtr = std::unique_ptr<Pixel[], decltype(&AlignedFree)>;
+  AlignedPtr ref_src(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)),
+                     &AlignedFree);
+  AlignedPtr dest(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)),
+                  &AlignedFree);
+  ASSERT_NE(ref_src, nullptr);
+  ASSERT_NE(dest, nullptr);
+
+  const int mask = (1 << bitdepth) - 1;
+  for (size_t i = 0; i < ref_alloc_size / sizeof(ref_src[0]); ++i) {
+    ref_src[i] = rnd.Rand16() & mask;
+  }
+
+  int min_angle = 0, max_angle = 0;
+  ASSERT_NO_FATAL_FAILURE(GetZoneAngleRange(zone, &min_angle, &max_angle));
+
+  absl::Duration elapsed_time;
+  for (int run = 0; run < num_runs; ++run) {
+    Pixel* dst = dest.get();
+    memcpy(dst, ref_src.get(), ref_alloc_size);
+    for (const auto& base_angle : kBaseAngles) {
+      for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+        for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop;
+             angle_delta += kAngleDeltaStep) {
+          const int predictor_angle = base_angle + angle_delta;
+          if (predictor_angle < min_angle || predictor_angle > max_angle) {
+            continue;
+          }
+
+          ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+                                        << " angle_delta: " << angle_delta;
+          const bool upsampled_left =
+              IsEdgeUpsampled(predictor_angle - 180, filter_type);
+          const bool upsampled_top =
+              IsEdgeUpsampled(predictor_angle - 90, filter_type);
+          const ptrdiff_t stride = ref_stride * sizeof(ref_src[0]);
+          if (predictor_angle < 90) {
+            ASSERT_EQ(zone, kZone1);
+            const int xstep =
+                GetDirectionalIntraPredictorDerivative(predictor_angle);
+            const absl::Time start = absl::Now();
+            cur_directional_intra_pred_zone1_(dst, stride, top, block_width_,
+                                              block_height_, xstep,
+                                              upsampled_top);
+            elapsed_time += absl::Now() - start;
+          } else if (predictor_angle < 180) {
+            ASSERT_EQ(zone, kZone2);
+            const int xstep =
+                GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+            const int ystep =
+                GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+            const absl::Time start = absl::Now();
+            cur_directional_intra_pred_zone2_(
+                dst, stride, top, left, block_width_, block_height_, xstep,
+                ystep, upsampled_top, upsampled_left);
+            elapsed_time += absl::Now() - start;
+          } else {
+            ASSERT_EQ(zone, kZone3);
+            ASSERT_LT(predictor_angle, 270);
+            const int ystep =
+                GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+            const absl::Time start = absl::Now();
+            cur_directional_intra_pred_zone3_(dst, stride, left, block_width_,
+                                              block_height_, ystep,
+                                              upsampled_left);
+            elapsed_time += absl::Now() - start;
+          }
+          dst += block_width_ + kBlockBorder;
+        }
+      }
+    }
+  }
+
+  test_utils::CheckMd5Digest(ToString(tx_size_), kDirectionalPredNames[zone],
+                             digests[zone], dest.get(), ref_alloc_size,
+                             elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+  const Pixel* const left = intra_pred_mem_.left_mem + 16;
+  const Pixel* const top = intra_pred_mem_.top_mem + 16;
+  const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1);
+  intra_pred_mem_.Set(kMaxPixel);
+
+  for (int i = kZone1; i < kNumZones; ++i) {
+    switch (i) {
+      case kZone1:
+        if (cur_directional_intra_pred_zone1_ == nullptr) continue;
+        break;
+      case kZone2:
+        if (cur_directional_intra_pred_zone2_ == nullptr) continue;
+        break;
+      case kZone3:
+        if (cur_directional_intra_pred_zone3_ == nullptr) continue;
+        break;
+      case kNumZones:
+        FAIL() << "Invalid zone value: " << i;
+        break;
+    }
+    int min_angle = 0, max_angle = 0;
+    ASSERT_NO_FATAL_FAILURE(
+        GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle));
+
+    for (const auto& base_angle : kBaseAngles) {
+      for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+        for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop;
+             angle_delta += kAngleDeltaStep) {
+          const int predictor_angle = base_angle + angle_delta;
+          if (predictor_angle <= min_angle || predictor_angle >= max_angle) {
+            continue;
+          }
+          ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+                                        << " angle_delta: " << angle_delta;
+
+          memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+                 sizeof(intra_pred_mem_.dst));
+
+          const bool upsampled_left =
+              IsEdgeUpsampled(predictor_angle - 180, filter_type);
+          const bool upsampled_top =
+              IsEdgeUpsampled(predictor_angle - 90, filter_type);
+          const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+          if (predictor_angle < 90) {
+            const int xstep =
+                GetDirectionalIntraPredictorDerivative(predictor_angle);
+            cur_directional_intra_pred_zone1_(intra_pred_mem_.dst, stride, top,
+                                              block_width_, block_height_,
+                                              xstep, upsampled_top);
+          } else if (predictor_angle < 180) {
+            const int xstep =
+                GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+            const int ystep =
+                GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+            cur_directional_intra_pred_zone2_(
+                intra_pred_mem_.dst, stride, top, left, block_width_,
+                block_height_, xstep, ystep, upsampled_top, upsampled_left);
+          } else {
+            ASSERT_LT(predictor_angle, 270);
+            const int ystep =
+                GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+            cur_directional_intra_pred_zone3_(intra_pred_mem_.dst, stride, left,
+                                              block_width_, block_height_,
+                                              ystep, upsampled_left);
+          }
+
+          if (!test_utils::CompareBlocks(
+                  intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+                  block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+            ADD_FAILURE() << "Expected " << kDirectionalPredNames[i]
+                          << " (angle: " << predictor_angle
+                          << " filter type: " << filter_type
+                          << ") to produce a block containing '"
+                          << static_cast<int>(kMaxPixel) << "'";
+            return;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+  const Pixel* const left = intra_pred_mem_.left_mem + 16;
+  const Pixel* const top = intra_pred_mem_.top_mem + 16;
+  // Use an alternate seed to differentiate this test from TestSpeed().
+  libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+
+  for (int i = kZone1; i < kNumZones; ++i) {
+    // Only run when there is a reference version (base) and a different
+    // optimized version (cur).
+    switch (i) {
+      case kZone1:
+        if (base_directional_intra_pred_zone1_ == nullptr ||
+            cur_directional_intra_pred_zone1_ == nullptr) {
+          continue;
+        }
+        break;
+      case kZone2:
+        if (base_directional_intra_pred_zone2_ == nullptr ||
+            cur_directional_intra_pred_zone2_ == nullptr) {
+          continue;
+        }
+        break;
+      case kZone3:
+        if (base_directional_intra_pred_zone3_ == nullptr ||
+            cur_directional_intra_pred_zone3_ == nullptr) {
+          continue;
+        }
+        break;
+      case kNumZones:
+        FAIL() << "Invalid zone value: " << i;
+        break;
+    }
+    int min_angle = 0, max_angle = 0;
+    ASSERT_NO_FATAL_FAILURE(
+        GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle));
+
+    for (const auto& base_angle : kBaseAngles) {
+      for (int n = 0; n < 1000; ++n) {
+        for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+          for (int angle_delta = kAngleDeltaStart;
+               angle_delta <= kAngleDeltaStop; angle_delta += kAngleDeltaStep) {
+            const int predictor_angle = base_angle + angle_delta;
+            if (predictor_angle <= min_angle || predictor_angle >= max_angle) {
+              continue;
+            }
+            ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+                                          << " angle_delta: " << angle_delta;
+
+            intra_pred_mem_.Reset(&rnd);
+            memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+                   sizeof(intra_pred_mem_.dst));
+
+            const bool upsampled_left =
+                IsEdgeUpsampled(predictor_angle - 180, filter_type);
+            const bool upsampled_top =
+                IsEdgeUpsampled(predictor_angle - 90, filter_type);
+            const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+            if (predictor_angle < 90) {
+              const int xstep =
+                  GetDirectionalIntraPredictorDerivative(predictor_angle);
+              base_directional_intra_pred_zone1_(
+                  intra_pred_mem_.ref_src, stride, top, block_width_,
+                  block_height_, xstep, upsampled_top);
+              cur_directional_intra_pred_zone1_(
+                  intra_pred_mem_.dst, stride, top, block_width_, block_height_,
+                  xstep, upsampled_top);
+            } else if (predictor_angle < 180) {
+              const int xstep =
+                  GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+              const int ystep =
+                  GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+              base_directional_intra_pred_zone2_(
+                  intra_pred_mem_.ref_src, stride, top, left, block_width_,
+                  block_height_, xstep, ystep, upsampled_top, upsampled_left);
+              cur_directional_intra_pred_zone2_(
+                  intra_pred_mem_.dst, stride, top, left, block_width_,
+                  block_height_, xstep, ystep, upsampled_top, upsampled_left);
+            } else {
+              ASSERT_LT(predictor_angle, 270);
+              const int ystep =
+                  GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+              base_directional_intra_pred_zone3_(
+                  intra_pred_mem_.ref_src, stride, left, block_width_,
+                  block_height_, ystep, upsampled_left);
+              cur_directional_intra_pred_zone3_(
+                  intra_pred_mem_.dst, stride, left, block_width_,
+                  block_height_, ystep, upsampled_left);
+            }
+
+            if (!test_utils::CompareBlocks(
+                    intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+                    block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+              ADD_FAILURE() << "Result from optimized version of "
+                            << kDirectionalPredNames[i]
+                            << " differs from reference at angle "
+                            << predictor_angle << " with filter type "
+                            << filter_type << " in iteration #" << n;
+              return;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+using DirectionalIntraPredTest8bpp = DirectionalIntraPredTest<8, uint8_t>;
+
+const char* const* GetDirectionalIntraPredDigests8bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+      "9cfc1da729ad08682e165826c29b280b",
+      "bb73539c7afbda7bddd2184723b932d6",
+      "9d2882800ffe948196e984a26a2da72c",
+  };
+  static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+      "090efe6f83cc6fa301f65d3bbd5c38d2",
+      "d0fba4cdfb90f8bd293a94cae9db1a15",
+      "f7ad0eeab4389d0baa485d30fec87617",
+  };
+  static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+      "1d32b33c75fe85248c48cdc8caa78d84",
+      "7000e18159443d366129a6cc6ef8fcee",
+      "06c02fac5f8575f687abb3f634eb0b4c",
+  };
+  static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+      "1b591799685bc135982114b731293f78",
+      "5cd9099acb9f7b2618dafa6712666580",
+      "d023883efede88f99c19d006044d9fa1",
+  };
+  static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+      "f1e46ecf62a2516852f30c5025adb7ea",
+      "864442a209c16998065af28d8cdd839a",
+      "411a6e554868982af577de69e53f12e8",
+  };
+  static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+      "89278302be913a85cfb06feaea339459",
+      "6c42f1a9493490cd4529fd40729cec3c",
+      "2516b5e1c681e5dcb1acedd5f3d41106",
+  };
+  static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+      "aea7078f3eeaa8afbfe6c959c9e676f1",
+      "cad30babf12729dda5010362223ba65c",
+      "ff384ebdc832007775af418a2aae1463",
+  };
+  static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+      "964a821c313c831e12f4d32e616c0b55",
+      "adf6dad3a84ab4d16c16eea218bec57a",
+      "a54fa008d43895e523474686c48a81c2",
+  };
+  static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = {
+      "fe2851b4e4f9fcf924cf17d50415a4c0",
+      "50a0e279c481437ff315d08eb904c733",
+      "0682065c8fb6cbf9be4949316c87c9e5",
+  };
+  static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = {
+      "ef15503b1943642e7a0bace1616c0e11",
+      "bf1a4d3f855f1072a902a88ec6ce0350",
+      "7e87a03e29cd7fd843fd71b729a18f3f",
+  };
+  static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = {
+      "f7b636615d2e5bf289b5db452a6f188d",
+      "e95858c532c10d00b0ce7a02a02121dd",
+      "34a18ccf58ef490f32268e85ce8c7de4",
+  };
+  static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = {
+      "b250099986c2fab9670748598058846b",
+      "f25d80af4da862a9b6b72979f1e17cb4",
+      "5347dc7bc346733b4887f6c8ad5e0898",
+  };
+  static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = {
+      "72e4c9f8af043b1cb1263490351818ab",
+      "1fc010d2df011b9e4e3d0957107c78df",
+      "f4cbfa3ca941ef08b972a68d7e7bafc4",
+  };
+  static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = {
+      "37e5a1aaf7549d2bce08eece9d20f0f6",
+      "6a2794025d0aca414ab17baa3cf8251a",
+      "63dd37a6efdc91eeefef166c99ce2db1",
+  };
+  static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = {
+      "198aabc958992eb49cceab97d1acb43e",
+      "aee88b6c8bacfcf38799fe338e6c66e7",
+      "01e8f8f96696636f6d79d33951907a16",
+  };
+  static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = {
+      "0611390202c4f90f7add7aec763ded58",
+      "960240c7ceda2ccfac7c90b71460578a",
+      "7e7d97594aab8ad56e8c01c340335607",
+  };
+  static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = {
+      "7e1f567e7fc510757f2d89d638bc826f",
+      "c929d687352ce40a58670be2ce3c8c90",
+      "f6881e6a9ba3c3d3d730b425732656b1",
+  };
+  static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = {
+      "27b4c2a7081d4139f22003ba8b6dfdf2",
+      "301e82740866b9274108a04c872fa848",
+      "98d3aa4fef838f4abf00dac33806659f",
+  };
+  static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = {
+      "b31816db8fade3accfd975b21aa264c7",
+      "2adce01a03b9452633d5830e1a9b4e23",
+      "7b988fadba8b07c36e88d7be6b270494",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize16x64:
+      return kDigests16x64;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    case kTransformSize32x64:
+      return kDigests32x64;
+    case kTransformSize64x16:
+      return kDigests64x16;
+    case kTransformSize64x32:
+      return kDigests64x32;
+    case kTransformSize64x64:
+      return kDigests64x64;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, DISABLED_Speed) {
+#if LIBGAV1_ENABLE_NEON
+  const auto num_runs = static_cast<int>(2e5 / (block_width_ * block_height_));
+#else
+  const auto num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
+#endif
+  for (int i = kZone1; i < kNumZones; ++i) {
+    TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_),
+              static_cast<Zone>(i), num_runs);
+  }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, FixedInput) {
+  for (int i = kZone1; i < kNumZones; ++i) {
+    TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_),
+              static_cast<Zone>(i), 1);
+  }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(DirectionalIntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using DirectionalIntraPredTest10bpp = DirectionalIntraPredTest<10, uint16_t>;
+
+const char* const* GetDirectionalIntraPredDigests10bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+      "a683f4d7ccd978737615f61ecb4d638d",
+      "90c94374eaf7e9501f197863937b8639",
+      "0d3969cd081523ac6a906eecc7980c43",
+  };
+  static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+      "c3ffa2979b325644e4a56c882fe27347",
+      "1f61f5ee413a9a3b8d1d93869ec2aee0",
+      "4795ea944779ec4a783408769394d874",
+  };
+  static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+      "45c3282c9aa51024c1d64a40f230aa45",
+      "5cd47dd69f8bd0b15365a0c5cfc0a49a",
+      "06336c507b05f98c1d6a21abc43e6182",
+  };
+  static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+      "7370476ff0abbdc5e92f811b8879c861",
+      "a239a50adb28a4791b52a0dfff3bee06",
+      "4779a17f958a9ca04e8ec08c5aba1d36",
+  };
+  static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+      "305463f346c376594f82aad8304e0362",
+      "0cd481e5bda286c87a645417569fd948",
+      "48c7899dc9b7163b0b1f61b3a2b4b73e",
+  };
+  static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+      "5c18fd5339be90628c82b1fb6af50d5e",
+      "35eaa566ebd3bb7c903cfead5dc9ac78",
+      "9fdb0e790e5965810d02c02713c84071",
+  };
+  static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+      "2168d6cc858c704748b7b343ced2ac3a",
+      "1d3ce273107447faafd2e55877e48ffb",
+      "d344164049d1fe9b65a3ae8764bbbd37",
+  };
+  static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+      "dcef2cf51abe3fe150f388a14c762d30",
+      "6a810b289b1c14f8eab8ca1274e91ecd",
+      "c94da7c11f3fb11963d85c8804fce2d9",
+  };
+  static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = {
+      "50a0d08b0d99b7a574bad2cfb36efc39",
+      "2dcb55874db39da70c8ca1318559f9fe",
+      "6390bcd30ff3bc389ecc0a0952bea531",
+  };
+  static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = {
+      "7146c83c2620935606d49f3cb5876f41",
+      "2318ddf30c070a53c9b9cf199cd1b2c5",
+      "e9042e2124925aa7c1b6110617cb10e8",
+  };
+  static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = {
+      "c970f401de7b7c5bb4e3ad447fcbef8f",
+      "a18cc70730eecdaa31dbcf4306ff490f",
+      "32c1528ad4a576a2210399d6b4ccd46e",
+  };
+  static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = {
+      "00b3f0007da2e5d01380594a3d7162d5",
+      "1971af519e4a18967b7311f93efdd1b8",
+      "e6139769ce5a9c4982cfab9363004516",
+  };
+  static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = {
+      "08107ad971179cc9f465ae5966bd4901",
+      "b215212a3c0dfe9182c4f2e903d731f7",
+      "791274416a0da87c674e1ae318b3ce09",
+  };
+  static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = {
+      "94ea6cccae35b5d08799aa003ac08ccf",
+      "ae105e20e63fb55d4fd9d9e59dc62dde",
+      "973d0b2358ea585e4f486e7e645c5310",
+  };
+  static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = {
+      "d14c695c4853ddf5e5d8256bc1d1ed60",
+      "6bd0ebeb53adecc11442b1218b870cb7",
+      "e03bc402a9999aba8272275dce93e89f",
+  };
+  static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = {
+      "b21a8a8723758392ee659eeeae518a1e",
+      "e50285454896210ce44d6f04dfde05a7",
+      "f0f8ea0c6c2acc8d7d390927c3a90370",
+  };
+  static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = {
+      "ce51db16fd4fa56e601631397b098c89",
+      "aa87a8635e02c1e91d13158c61e443f6",
+      "4c1ee3afd46ef34bd711a34d0bf86f13",
+  };
+  static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = {
+      "25aaf5971e24e543e3e69a47254af777",
+      "eb6f444b3df127d69460778ab5bf8fc1",
+      "2f846cc0d506f90c0a58438600819817",
+  };
+  static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = {
+      "b26ce5b5f4b5d4a438b52e5987877fb8",
+      "35721a00a70938111939cf69988d928e",
+      "0af7ec35939483fac82c246a13845806",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize16x64:
+      return kDigests16x64;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    case kTransformSize32x64:
+      return kDigests32x64;
+    case kTransformSize64x16:
+      return kDigests64x16;
+    case kTransformSize64x32:
+      return kDigests64x32;
+    case kTransformSize64x64:
+      return kDigests64x64;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(DirectionalIntraPredTest10bpp, DISABLED_Speed) {
+#if LIBGAV1_ENABLE_NEON
+  const int num_runs = static_cast<int>(2e5 / (block_width_ * block_height_));
+#else
+  const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
+#endif
+  for (int i = kZone1; i < kNumZones; ++i) {
+    TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_),
+              static_cast<Zone>(i), num_runs);
+  }
+}
+
+TEST_P(DirectionalIntraPredTest10bpp, FixedInput) {
+  for (int i = kZone1; i < kNumZones; ++i) {
+    TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_),
+              static_cast<Zone>(i), 1);
+  }
+}
+
+TEST_P(DirectionalIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(DirectionalIntraPredTest10bpp, Random) { TestRandomValues(); }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using DirectionalIntraPredTest12bpp = DirectionalIntraPredTest<12, uint16_t>;
+
+const char* const* GetDirectionalIntraPredDigests12bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+      "78f3297743f75e928e755b6ffa2d3050",
+      "7315da39861c6e3ef2e47c913e3be349",
+      "5609cb40b575f24d05880df202a60bd3",
+  };
+  static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+      "efb2363d3c25427abe198806c8ba4d6b",
+      "b5aaa41665a10e7e7944fb7fc90fd59a",
+      "5a85610342339ca3109d775fa18dc25c",
+  };
+  static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+      "9045679914980ea1f579d84509397b6e",
+      "f9f50bdc9f81a93095fd9d6998174aa7",
+      "46c1f82e85b8ba5b03bab41a2f561483",
+  };
+  static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+      "a0ae0956b2b667c528b7803d733d49da",
+      "5d9f60ef8904c4faedb6cfc19e54418a",
+      "4ffdcbbbcb23bca8286f1c286b9cb3e8",
+  };
+  static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+      "086116c6b116613b8b47a086726566ea",
+      "141dca7fcae0e4d4b88887a618271ea1",
+      "3575a34278aa0fb1eed934290982f4a7",
+  };
+  static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+      "7922f40216c78a40abaf675667e79493",
+      "55d20588240171df2e24d105ee1563ad",
+      "674b4d8f4dbf514d22e21cc4baeda1d3",
+  };
+  static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+      "32d4d7e256d3b304026ddb5430cf6a09",
+      "72f4be2569f4e067c252d51ff4030de3",
+      "6779a132e1bac0ac43c2373f56553ed8",
+  };
+  static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+      "1be2e0efc1403f9e22cfb8aeb28763d9",
+      "558c8a5418ac91d21a5839c454a9391f",
+      "7693ebef9b86416ebd6e78e98fcafba7",
+  };
+  static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = {
+      "e6217ed1c673ae42e84f8757316b580d",
+      "028aa582c11a9733f0cd693211a067c5",
+      "082de9fc7c4bc80a8ec8522b5a5cb52c",
+  };
+  static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = {
+      "e3b293c09bdc9c5c543ad046a3f0d64f",
+      "2de5803a6ed497c1039c8e6d675c1dd3",
+      "05742f807560f5d5206e54b70097dc4a",
+  };
+  static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = {
+      "57f2ca4ba56be253eff7e6b73df5003d",
+      "ef8bea00437e01fb798a22cda59f0191",
+      "989ff38c96600c2f108d6e6fa381fd13",
+  };
+  static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = {
+      "f5540f4874c02aa2222a3ba75106f841",
+      "17e5d20f798a96c39abc8a81e7aa7bc6",
+      "0fe9ea14c9dcae466b4a36f1c7db6978",
+  };
+  static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = {
+      "aff9429951ab1885c0d9ed29aa1b6a9f",
+      "4b686e2a879bf0b4aadd06b412e0eb48",
+      "39325d71cddc272bfa1dd2dc80d09ffe",
+  };
+  static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = {
+      "b83dffdf8bad2b7c3808925b6138ca1e",
+      "3656b58c7aaf2025979b4a3ed8a2841e",
+      "cfcc0c6ae3fa5e7d45dec581479459f6",
+  };
+  static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = {
+      "3c91b3b9e2df73ffb718e0bf53c5a5c2",
+      "0dbe27603e111158e70d99e181befb83",
+      "edecbffb32ae1e49b66b6e55ad0af6c6",
+  };
+  static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = {
+      "a3290917f755c7ccdc7b77eb3c6c89a7",
+      "42f89db41fbb366ddb78ef79a043f3e3",
+      "7f7bcbe33aa003b166677c68d12490e9",
+  };
+  static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = {
+      "d4f4c6b70a82695f843e9227bd7d9cc8",
+      "550a0bd87936801651d552e229b683e9",
+      "a4c730ad71f566a930c5672e1b2f48f1",
+  };
+  static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = {
+      "2087c9264c4c5fea9a6fe20dcedbe2b9",
+      "d4dd51d9578a3fc2eb75086fba867c22",
+      "6121a67d63e40107e780d0938aeb3d21",
+  };
+  static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = {
+      "09c3818a07bc54467634c2bfce66f58f",
+      "8da453b8d72d73d71ba15a14ddd59db4",
+      "9bc939aa54445722469b120b8a505cb3",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize16x64:
+      return kDigests16x64;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    case kTransformSize32x64:
+      return kDigests32x64;
+    case kTransformSize64x16:
+      return kDigests64x16;
+    case kTransformSize64x32:
+      return kDigests64x32;
+    case kTransformSize64x64:
+      return kDigests64x64;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, DISABLED_Speed) {
+#if LIBGAV1_ENABLE_NEON
+  const int num_runs = static_cast<int>(2e5 / (block_width_ * block_height_));
+#else
+  const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
+#endif
+  for (int i = kZone1; i < kNumZones; ++i) {
+    TestSpeed(GetDirectionalIntraPredDigests12bpp(tx_size_),
+              static_cast<Zone>(i), num_runs);
+  }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, FixedInput) {
+  for (int i = kZone1; i < kNumZones; ++i) {
+    TestSpeed(GetDirectionalIntraPredDigests12bpp(tx_size_),
+              static_cast<Zone>(i), 1);
+  }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(DirectionalIntraPredTest12bpp, Random) { TestRandomValues(); }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+constexpr TransformSize kTransformSizes[] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+    kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_NEON
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest12bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+  return os << ToString(tx_size);
+}
+
+}  // namespace libgav1
diff --git a/src/dsp/intrapred_filter.cc b/src/dsp/intrapred_filter.cc
new file mode 100644 (file)
index 0000000..2d183cf
--- /dev/null
@@ -0,0 +1,162 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_C
+
+// The recursive filter applies a different 7-tap filter (covering the
+// top-left pixel, 4 top pixels, and 2 left pixels) to produce each pixel in a
+// 4x2 sub-block. Each successive 4x2 uses the
+// prediction output of the blocks above and to the left, unless they are
+// adjacent to the |top_row| or |left_column|. The set of 8 filters is selected
+// according to |pred|.
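+// For example, an 8x4 block is produced as four 4x2 sub-blocks: columns 0-3
+// and then 4-7 of rows 0-1, followed by columns 0-3 and 4-7 of rows 2-3. Each
+// pass of the outer loop below emits two output rows.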
+template <int bitdepth, typename Pixel>
+void FilterIntraPredictor_C(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                            const void* LIBGAV1_RESTRICT const top_row,
+                            const void* LIBGAV1_RESTRICT const left_column,
+                            const FilterIntraPredictor pred, const int width,
+                            const int height) {
+  const int kMaxPixel = (1 << bitdepth) - 1;
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+
+  assert(width <= 32 && height <= 32);
+
+  Pixel buffer[3][33];  // cache 2 rows + top & left boundaries
+  memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0]));
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  int row0 = 0, row2 = 2;
+  int ystep = 1;
+  int y = 0;
+  do {
+    buffer[1][0] = left[y];
+    buffer[row2][0] = left[y + 1];
+    int x = 1;
+    do {
+      const Pixel p0 = buffer[row0][x - 1];  // top-left
+      const Pixel p1 = buffer[row0][x + 0];  // top 0
+      const Pixel p2 = buffer[row0][x + 1];  // top 1
+      const Pixel p3 = buffer[row0][x + 2];  // top 2
+      const Pixel p4 = buffer[row0][x + 3];  // top 3
+      const Pixel p5 = buffer[1][x - 1];     // left 0
+      const Pixel p6 = buffer[row2][x - 1];  // left 1
+      for (int i = 0; i < 8; ++i) {
+        const int xoffset = i & 0x03;
+        const int yoffset = (i >> 2) * ystep;
+        const int value = kFilterIntraTaps[pred][i][0] * p0 +
+                          kFilterIntraTaps[pred][i][1] * p1 +
+                          kFilterIntraTaps[pred][i][2] * p2 +
+                          kFilterIntraTaps[pred][i][3] * p3 +
+                          kFilterIntraTaps[pred][i][4] * p4 +
+                          kFilterIntraTaps[pred][i][5] * p5 +
+                          kFilterIntraTaps[pred][i][6] * p6;
+        // Section 7.11.2.3 specifies the right-hand side of the assignment as
+        //   Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ).
+        // Since Clip1() clips a negative value to 0, it is safe to replace
+        // Round2Signed() with Round2().
+        buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>(
+            Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel));
+      }
+      x += 4;
+    } while (x < width);
+    memcpy(dst, &buffer[1][1], width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, &buffer[row2][1], width * sizeof(dst[0]));
+    dst += stride;
+
+    // The final row becomes the top for the next pass.
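+    // Negating ystep keeps 1 + yoffset addressing buffer[1] and buffer[row2]
+    // once row0 and row2 have swapped.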
+    row0 ^= 2;
+    row2 ^= 2;
+    ystep = -ystep;
+    y += 2;
+  } while (y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_FilterIntraPredictor
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void IntraPredFilterInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/intrapred_filter.h b/src/dsp/intrapred_filter.h
new file mode 100644 (file)
index 0000000..8146b82
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_filter_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_filter_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor. This function is not thread-safe.
+void IntraPredFilterInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
diff --git a/src/dsp/intrapred_filter_test.cc b/src/dsp/intrapred_filter_test.cc
new file mode 100644 (file)
index 0000000..d5694f6
--- /dev/null
@@ -0,0 +1,691 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+const char* const kFilterIntraPredNames[kNumFilterIntraPredictors] = {
+    "kFilterIntraPredictorDc",         "kFilterIntraPredictorVertical",
+    "kFilterIntraPredictorHorizontal", "kFilterIntraPredictorD157",
+    "kFilterIntraPredictorPaeth",
+};
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+                          public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  IntraPredTestBase() {
+    switch (tx_size_) {
+      case kNumTransformSizes:
+        EXPECT_NE(tx_size_, kNumTransformSizes);
+        break;
+      default:
+        block_width_ = kTransformWidth[tx_size_];
+        block_height_ = kTransformHeight[tx_size_];
+        break;
+    }
+  }
+
+  IntraPredTestBase(const IntraPredTestBase&) = delete;
+  IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+  ~IntraPredTestBase() override = default;
+
+ protected:
+  struct IntraPredMem {
+    void Reset(libvpx_test::ACMRandom* rnd) {
+      ASSERT_NE(rnd, nullptr);
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      const int mask = (1 << bitdepth) - 1;
+      for (auto& r : ref_src) r = rnd->Rand16() & mask;
+      for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+      for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+      // Some directional predictors require top-right, bottom-left.
+      for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = rnd->Rand16() & mask;
+        top[i] = rnd->Rand16() & mask;
+      }
+      // TODO(jzern): reorder this and regenerate the digests after switching
+      // random number generators.
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      left[-1] = rnd->Rand16() & mask;
+      left[-2] = rnd->Rand16() & mask;
+      top[-2] = rnd->Rand16() & mask;
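+      // Zero the slots below [-2] and the over-read padding past the top row
+      // so that reads outside the initialized range are deterministic.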
+      memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+      memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+      memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+             sizeof(top_mem[0]) * kTopMemPadding);
+    }
+
+    // Set ref_src, top-left, top and left to |pixel|.
+    void Set(const Pixel pixel) {
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      for (auto& r : ref_src) r = pixel;
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = top[i] = pixel;
+      }
+    }
+
+    // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+    static constexpr int kTopMemPadding = 7;
+    alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+    alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+    alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+    alignas(
+        kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+  };
+
+  void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+  const TransformSize tx_size_ = GetParam();
+  int block_width_;
+  int block_height_;
+  IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// FilterIntraPredTest
+
+template <int bitdepth, typename Pixel>
+class FilterIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  FilterIntraPredTest() = default;
+  FilterIntraPredTest(const FilterIntraPredTest&) = delete;
+  FilterIntraPredTest& operator=(const FilterIntraPredTest&) = delete;
+  ~FilterIntraPredTest() override = default;
+
+ protected:
+  using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+  using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+  void SetUp() override {
+    IntraPredTestBase<bitdepth, Pixel>::SetUp();
+    IntraPredFilterInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_filter_intra_pred_ = dsp->filter_intra_predictor;
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      // No need to compare C with itself.
+      base_filter_intra_pred_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      IntraPredFilterInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraPredFilterInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    // Put the current architecture-specific implementation up for testing and
+    // comparison against C version.
+    cur_filter_intra_pred_ = dsp->filter_intra_predictor;
+  }
+
+  // These tests modify intra_pred_mem_.
+  void TestSpeed(const char* const digests[kNumFilterIntraPredictors],
+                 int num_runs);
+  void TestSaturatedValues();
+  void TestRandomValues();
+
+  FilterIntraPredictorFunc base_filter_intra_pred_;
+  FilterIntraPredictorFunc cur_filter_intra_pred_;
+};
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredTest<bitdepth, Pixel>::TestSpeed(
+    const char* const digests[kNumFilterIntraPredictors], const int num_runs) {
+  ASSERT_NE(digests, nullptr);
+  const Pixel* const left = intra_pred_mem_.left_mem + 16;
+  const Pixel* const top = intra_pred_mem_.top_mem + 16;
+
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  intra_pred_mem_.Reset(&rnd);
+
+  // IntraPredFilterInit_C() may leave the filter function empty when an
+  // optimized version is available.
+  if (cur_filter_intra_pred_ == nullptr) return;
+  for (int i = 0; i < kNumFilterIntraPredictors; ++i) {
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const absl::Time start = absl::Now();
+    for (int run = 0; run < num_runs; ++run) {
+      const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+      cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left,
+                             static_cast<FilterIntraPredictor>(i), block_width_,
+                             block_height_);
+    }
+    const absl::Duration elapsed_time = absl::Now() - start;
+    test_utils::CheckMd5Digest(ToString(tx_size_), kFilterIntraPredNames[i],
+                               digests[i], intra_pred_mem_.dst,
+                               sizeof(intra_pred_mem_.dst), elapsed_time);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+  Pixel* const left = intra_pred_mem_.left_mem + 16;
+  Pixel* const top = intra_pred_mem_.top_mem + 16;
+  const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1);
+  intra_pred_mem_.Set(kMaxPixel);
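+  // With every reference pixel saturated, an intermediate overflow in the
+  // predictor would surface as an output pixel that differs from kMaxPixel.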
+
+  // IntraPredFilterInit_C() may leave the filter function empty when an
+  // optimized version is available.
+  if (cur_filter_intra_pred_ == nullptr) return;
+  for (int i = 0; i < kNumFilterIntraPredictors; ++i) {
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+    cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left,
+                           static_cast<FilterIntraPredictor>(i), block_width_,
+                           block_height_);
+    if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+                                   block_width_, block_height_, kMaxBlockSize,
+                                   kMaxBlockSize, true)) {
+      ADD_FAILURE() << "Expected " << kFilterIntraPredNames[i]
+                    << " to produce a block containing '"
+                    << static_cast<int>(kMaxPixel) << "'";
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+  // Skip the 'C' test case as this is used as the reference.
+  if (base_filter_intra_pred_ == nullptr) return;
+
+  // Use an alternate seed to differentiate this test from TestSpeed().
+  libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+  for (int i = 0; i < kNumFilterIntraPredictors; ++i) {
+    // It may be worthwhile to temporarily increase this loop size when testing
+    // changes that specifically affect this test.
+    for (int n = 0; n < 10000; ++n) {
+      intra_pred_mem_.Reset(&rnd);
+
+      memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+             sizeof(intra_pred_mem_.dst));
+      const Pixel* const top = intra_pred_mem_.top_mem + 16;
+      const Pixel* const left = intra_pred_mem_.left_mem + 16;
+      const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+      base_filter_intra_pred_(intra_pred_mem_.ref_src, stride, top, left,
+                              static_cast<FilterIntraPredictor>(i),
+                              block_width_, block_height_);
+      cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left,
+                             static_cast<FilterIntraPredictor>(i), block_width_,
+                             block_height_);
+      if (!test_utils::CompareBlocks(
+              intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+              block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+        ADD_FAILURE() << "Result from optimized version of "
+                      << kFilterIntraPredNames[i]
+                      << " differs from reference in iteration #" << n;
+        break;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+using FilterIntraPredTest8bpp = FilterIntraPredTest<8, uint8_t>;
+
+const char* const* GetFilterIntraPredDigests8bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumFilterIntraPredictors] = {
+      "a2486efcfb351d60a8941203073e89c6", "240716ae5ecaedc19edae1bdef49e05d",
+      "dacf4af66a966aca7c75abe24cd9ba99", "311888773676f3c2ae3334c4e0f141e5",
+      "2d3711616c8d8798f608e313cb07a72a",
+  };
+  static const char* const kDigests4x8[kNumFilterIntraPredictors] = {
+      "1cb74ba1abc68d936e87c13511ed5fbf", "d64c2c08586a762dbdfa8e1150bede06",
+      "73e9d1a9b6fa3e96fbd65c7dce507529", "e3ae17d9338e5aa3420d31d0e2d7ee87",
+      "750dbfe3bc5508b7031957a1d315b8bc",
+  };
+  static const char* const kDigests4x16[kNumFilterIntraPredictors] = {
+      "48a1060701bf68ec6342d6e24c10ef17", "0c91ff7988814d192ed95e840a87b4bf",
+      "efe586b891c8828c4116c9fbf50850cc", "a3bfa10be2b155826f107e9256ac3ba1",
+      "976273745b94a561fd52f5aa96fb280f",
+  };
+  static const char* const kDigests8x4[kNumFilterIntraPredictors] = {
+      "73f82633aeb28db1d254d077edefd8a9", "8eee505cdb5828e33b67ff5572445dac",
+      "9b0f101c28c66a916079fe5ed33b4021", "47fd44a7e5a5b55f067908192698e25c",
+      "eab59a3710d9bdeca8fa03a15d3f95d6",
+  };
+  static const char* const kDigests8x8[kNumFilterIntraPredictors] = {
+      "aa07b7a007c4c1d494ddb44a23c27bcd", "d27eee43f15dfcfe4c46cd46b681983b",
+      "1015d26022cf57acfdb11fd3f6b9ccb0", "4f0e00ef556fbcac2fb31e3b18869070",
+      "918c2553635763a0756b20154096bca6",
+  };
+  static const char* const kDigests8x16[kNumFilterIntraPredictors] = {
+      "a8ac58b2efb02092035cca206dbf5fbe", "0b22b000b7f124b32545bc86dd9f0142",
+      "cd6a08e023cad301c084b6ec2999da63", "c017f5f4fa5c05e7638ae4db98512b13",
+      "893e6995522e23ed3d613ef3797ca580",
+  };
+  static const char* const kDigests8x32[kNumFilterIntraPredictors] = {
+      "b3d5d4f09b778ae2b8cc0e9014c22320", "e473874a1e65228707489be9ca6477aa",
+      "91bda5a2d32780af345bb3d49324732f", "20f2ff26f004f02e8e2be49e6cadc32f",
+      "00c909b749e36142b133a7357271e83e",
+  };
+  static const char* const kDigests16x4[kNumFilterIntraPredictors] = {
+      "ef252f074fc3f5367748436e676e78ca", "cd436d8803ea40db3a849e7c869855c7",
+      "9cd8601b5d66e61fd002f8b11bfa58d9", "b982f17ee36ef0d1c2cfea20197d5666",
+      "9e350d1cd65d520194281633f566810d",
+  };
+  static const char* const kDigests16x8[kNumFilterIntraPredictors] = {
+      "9a7e0cf9b023a89ee619ee672ba2a219", "c20186bc642912ecd4d48bc4924a79b1",
+      "77de044f4c7f717f947a36fc0aa17946", "3f2fc68f11e6ee0220adb8d1ee085c8e",
+      "2f37e586769dfb88d9d4116b9c28c5ab",
+  };
+  static const char* const kDigests16x16[kNumFilterIntraPredictors] = {
+      "36c5b85b9a6b1d2e8f44f09c81adfe9c", "78494ce3a6a78aa2879ad2e24d43a005",
+      "aa30cd29a74407dbec80161745161eb2", "ae2a0975ef166e05e5e8c3701bd19e93",
+      "6322fba6f3bcb1f6c8e78160d200809c",
+  };
+  static const char* const kDigests16x32[kNumFilterIntraPredictors] = {
+      "82d54732c37424946bc73f5a78f64641", "071773c82869bb103c31e05f14ed3c2f",
+      "3a0094c150bd6e21ce1f17243b21e76b", "998ffef26fc65333ae407bbe9d41a252",
+      "6491add6b665aafc364c8c104a6a233d",
+  };
+  static const char* const kDigests32x8[kNumFilterIntraPredictors] = {
+      "c60062105dd727e94f744c35f0d2156e", "36a9e4d543701c4c546016e35e9c4337",
+      "05a8d07fe271023e63febfb44814d114", "0a28606925519d1ed067d64761619dc8",
+      "bb8c34b143910ba49b01d13e94d936ac",
+  };
+  static const char* const kDigests32x16[kNumFilterIntraPredictors] = {
+      "60e6caeec9194fcb409469e6e1393128", "5d764ead046443eb14f76822a569b056",
+      "b1bf22fcc282614354166fa1eb6e5f8b", "4b188e729fe49ae24100b3ddd8f17313",
+      "75f430fdea0b7b5b66866fd68a795a6a",
+  };
+  static const char* const kDigests32x32[kNumFilterIntraPredictors] = {
+      "5bb91a37b1979866eb23b59dd352229d", "589aa983109500749609d7be1cb79711",
+      "5e8fb1927cdbe21143494b56b5d400f6", "9e28f741d19c64b2a0577d83546d32d9",
+      "73c73237a5d891096066b186abf96854",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(FilterIntraPredTest8bpp, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.5e8 / (block_width_ * block_height_));
+  TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), num_runs);
+}
+
+TEST_P(FilterIntraPredTest8bpp, FixedInput) {
+  TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), 1);
+}
+
+TEST_P(FilterIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(FilterIntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using FilterIntraPredTest10bpp = FilterIntraPredTest<10, uint16_t>;
+
+const char* const* GetFilterIntraPredDigests10bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumFilterIntraPredictors] = {
+      "13a9014d9e255cde8e3e85abf6ef5151", "aee33aa3f3baec87a8c019743fff40f1",
+      "fdd8ca2be424501f51fcdb603c2e757c", "aed00c082d1980d4bab45e9318b939f0",
+      "1b363db246aa5400f49479b7d5d41799",
+  };
+  static const char* const kDigests4x8[kNumFilterIntraPredictors] = {
+      "e718b9e31ba3da0392fd4b6cfba5d882", "31ba22989cdc3bb80749685f42c6c697",
+      "6bc5b3a55b94018117569cfdced17bf9", "ec29979fb4936116493dfa1cfc93901c",
+      "c6bcf564e63c42148d9917f089566432",
+  };
+  static const char* const kDigests4x16[kNumFilterIntraPredictors] = {
+      "404bddd88dff2c0414b5398287e54f18", "ff4fb3039cec6c9ffed6d259cbbfd854",
+      "7d6fa3ed9e728ff056a73c40bb6edeb6", "82845d942ad8048578e0037336905146",
+      "f3c07ea65db08c639136a5a9270f95ff",
+  };
+  static const char* const kDigests8x4[kNumFilterIntraPredictors] = {
+      "2008981638f27ba9123973a733e46c3d", "47efecf1f7628cbd8c22e168fcceb5ce",
+      "04c857ffbd1edd6e2788b17410a4a39c", "deb0236c4277b4d7b174fba407e1c9d7",
+      "5b58567f94ae9fa930f700c68c17399d",
+  };
+  static const char* const kDigests8x8[kNumFilterIntraPredictors] = {
+      "d9bab44a6d1373e758bfa0ee88239093", "29b10ddb32d9de2ff0cad6126f010ff6",
+      "1a03f9a18bdbab0811138cd969bf1f93", "e3273c24e77095ffa033a073f5bbcf7b",
+      "5187bb3df943d154cb01fb2f244ff86f",
+  };
+  static const char* const kDigests8x16[kNumFilterIntraPredictors] = {
+      "a2199f792634a56f1c4e88510e408773", "8fd8a98969d19832975ee7131cca9dbb",
+      "d897380941f75b04b1327e63f136d7d6", "d36f52a157027d53b15b7c02a7983436",
+      "0a8c23047b0364f5687b62b01f043359",
+  };
+  static const char* const kDigests8x32[kNumFilterIntraPredictors] = {
+      "5b74ea8e4f60151cf2db9b23d803a2e2", "e0d6bb5fa7d181589c31fcf2755d7c0b",
+      "42e590ffc88b8940b7aade22e13bbb6a", "e47c39ec1761aa7b5a9b1368ede7cfdc",
+      "6e963a89beac6f3a362c269d1017f9a8",
+  };
+  static const char* const kDigests16x4[kNumFilterIntraPredictors] = {
+      "9eaa079622b5dd95ad3a8feb68fa9bbb", "17e3aa6a0034e9eedcfc65b8ce6e7205",
+      "eac5a5337dbaf9bcbc3d320745c8e190", "c6ba9a7e518be04f725bc1dbd399c204",
+      "19020b82ce8bb49a511820c7e1d58e99",
+  };
+  static const char* const kDigests16x8[kNumFilterIntraPredictors] = {
+      "2d2c3255d5dfc1479a5d82a7d5a0d42e", "0fbb4ee851b4ee58c6d30dd820d19e38",
+      "fa77a1b056e8dc8efb702c7832531b32", "186269ca219dc663ad9b4a53e011a54b",
+      "c12180a6dcde0c3579befbb5304ff70b",
+  };
+  static const char* const kDigests16x16[kNumFilterIntraPredictors] = {
+      "dbb81d7ee7d3c83c271400d0160b2e83", "4da656a3ef238d90bb8339471a6fdb7e",
+      "d95006bf299b84a1b04e38d5fa8fb4f7", "742a03331f0fbd66c57df0ae31104aca",
+      "4d20aa440e38b6b7ac83c8c54d313169",
+  };
+  static const char* const kDigests16x32[kNumFilterIntraPredictors] = {
+      "6247730c93789cc25bcb837781dfa05b", "9a93e14b06dd145e35ab21a0353bdebe",
+      "6c5866353e30296a67d9bd7a65d6998d", "389d7f038d7997871745bb1305156ff9",
+      "e7640d81f891e1d06e7da75c6ae74d93",
+  };
+  static const char* const kDigests32x8[kNumFilterIntraPredictors] = {
+      "68f3a603b7c25dd78deffe91aef22834", "48c735e4aa951d6333d99e571bfeadc8",
+      "35239df0993a429fc599a3037c731e4b", "ba7dd72e04af1a1fc1b30784c11df783",
+      "78e9017f7434665d32ec59795aed0012",
+  };
+  static const char* const kDigests32x16[kNumFilterIntraPredictors] = {
+      "8cf2f11f7f77901cb0c522ad191eb998", "204c76d68c5117b89b5c3a05d5548883",
+      "f3751e41e7a595f43d8aaf9a40644e05", "81ea1a7d608d7b91dd3ede0f87e750ee",
+      "b5951334dfbe6229d828e03cd2d98538",
+  };
+  static const char* const kDigests32x32[kNumFilterIntraPredictors] = {
+      "9d8630188c3d1a4f28a6106e343c9380", "c6c92e059faa17163522409b7bf93230",
+      "62e4c959cb06ec661d98769981fbd555", "01e61673f11011571246668e36cc61c5",
+      "4530222ea1de546e202630fcf43f4526",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(FilterIntraPredTest10bpp, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.5e8 / (block_width_ * block_height_));
+  TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), num_runs);
+}
+
+TEST_P(FilterIntraPredTest10bpp, FixedInput) {
+  TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), 1);
+}
+
+TEST_P(FilterIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using FilterIntraPredTest12bpp = FilterIntraPredTest<12, uint16_t>;
+
+const char* const* GetFilterIntraPredDigests12bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumFilterIntraPredictors] = {
+      "27682e2763f742e0c7156a263af54fe1", "f6fe9b73d8a2024b3125d25a42028be3",
+      "8a232b8caa41f8c4f0b547f0aa072fd7", "411b24dc872e91de3a607f18b51c4e34",
+      "9a106b70ca2df5317afc90aba0316a98",
+  };
+  static const char* const kDigests4x8[kNumFilterIntraPredictors] = {
+      "a0d3f3a8f498727af0844a6df90da971", "bb02998e3d5d7b4643db616a5ce75c51",
+      "eaa39425427c155dea1836c37fc14f7e", "747cc4fa0c9e3418f4a15ded9f846599",
+      "c1a2aeaa01dd3edac4c26f74e01d8d57",
+  };
+  static const char* const kDigests4x16[kNumFilterIntraPredictors] = {
+      "80c01fdef14e3db28987e323801c998e", "de5a2f59384a096324eebe843d4b8ba5",
+      "f85e18efc9297793392607cdd84d8bc4", "d84bf2d9d4996c2f7fd82b6bbd52577b",
+      "9d73771de09c17bd494f1f5f75ab1111",
+  };
+  static const char* const kDigests8x4[kNumFilterIntraPredictors] = {
+      "7df2b038c4d816eb4949de6b933f0632", "0f1c45dd6e8d5534de0c9a279087ea8b",
+      "1b79f3b10facd9ffc404cbafdd73aa43", "e19adec4f14d72c5157f9faf7fc9b23e",
+      "a30ed988ea6ed797d4bf0945ffe7e330",
+  };
+  static const char* const kDigests8x8[kNumFilterIntraPredictors] = {
+      "097a0c14d89ece69e779fa755a2b75c0", "ebadfc559b20246dcd8d74413ff4d088",
+      "097c91bedc1e703b3eb54361d94df59a", "765bbad37b91e644292beac5f06811be",
+      "f3c809461fa3325f0d33087ca79c47d0",
+  };
+  static const char* const kDigests8x16[kNumFilterIntraPredictors] = {
+      "36464af48b38005b61f7f528a0b0c8ba", "47fa0868224c71d28d3cdcf247282c13",
+      "ca34bb57a37ee3e5428814ec63f52117", "420bdca6b643f4421d465345cc264167",
+      "339c124c07a611a65952dc9996ba6e12",
+  };
+  static const char* const kDigests8x32[kNumFilterIntraPredictors] = {
+      "99ca0d3b3fbdd4661a2c07bdb2752a70", "6fedae1dbfe721210b65e08dc77847dd",
+      "956810089f81dc9334103111afec2fbb", "ede4f0bee06def6d8a2037939415d845",
+      "ca146dfe0edbdac3066a0ca387fb6277",
+  };
+  static const char* const kDigests16x4[kNumFilterIntraPredictors] = {
+      "b0f7d5dbf7f9aa3f0ab13273de80dc9d", "a3537f2b60426e9f83aeef973161fcfd",
+      "d4f868f793ab232bee17b49afcfc28a0", "fc43429761d10723b5f377eb6513e59a",
+      "f59aabb06574ce24e1d1113753edb098",
+  };
+  static const char* const kDigests16x8[kNumFilterIntraPredictors] = {
+      "0b539f1e2ecf0300bf3838ab1d80952c", "44f01a4324cda8d27ea44a8bd3620526",
+      "a57819a22b422e7da9d85f09504a2c57", "dbff6a417a8f3606575acb3c98efe091",
+      "534e8e8cd4b73cb4f6ec22f903727efa",
+  };
+  static const char* const kDigests16x16[kNumFilterIntraPredictors] = {
+      "247192bd6a5c2821b8694e4669361103", "1935044a6220ac6315a58b402465b6da",
+      "bdce29a3e988b804d429da1446a34c2a", "4697132c20395fabac2662cb8b1ce35a",
+      "3d07a7beaff6925175fcd9a8e69542e6",
+  };
+  static const char* const kDigests16x32[kNumFilterIntraPredictors] = {
+      "3429b83b7ba723bdd2e3e368979b51b0", "cd099d0eb7f4a20547f91d9402e3394a",
+      "a6a7cc4e0f8ed34424264107b3657fb8", "0125ace62bec7c7ff7240bf5b6f689c5",
+      "a0722dba921b078a6d569ecb81777bf8",
+  };
+  static const char* const kDigests32x8[kNumFilterIntraPredictors] = {
+      "44b1b086ee37a93406e5db95dca825d7", "fdeed5c4644dc288f6dcc148e8d2867a",
+      "b241d112f6fa7a24c44706fb76e49132", "a782dcf01a16231276dbd20121bad640",
+      "4da9c0efd0bcb31f911af52779317fb9",
+  };
+  static const char* const kDigests32x16[kNumFilterIntraPredictors] = {
+      "bf9704995a0a868c45280cac3415c0a7", "373626072ade7c8d709ab732149fd3ae",
+      "9e4a2062aa86ac8dc5164002c953c7ca", "62eede30996d0e55afcf513fe9ad3c58",
+      "a5f3bb32688d5189341304d12e4e6449",
+  };
+  static const char* const kDigests32x32[kNumFilterIntraPredictors] = {
+      "bd93c4ddbe0f06e3f12be25ce490f68c", "bfe772b203b83c982f35a8ed0682cd16",
+      "d357ae05ce215f4c5af650ae82909081", "bd640d3c511edaac1753b64c81afb75d",
+      "4d05d67e02a7c4af7ae981b0eb8a4d7b",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(FilterIntraPredTest12bpp, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.5e8 / (block_width_ * block_height_));
+  TestSpeed(GetFilterIntraPredDigests12bpp(tx_size_), num_runs);
+}
+
+TEST_P(FilterIntraPredTest12bpp, FixedInput) {
+  TestSpeed(GetFilterIntraPredDigests12bpp(tx_size_), 1);
+}
+
+TEST_P(FilterIntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+// Filter-intra and Cfl predictors are available only for transform sizes
+// with max(width, height) <= 32.
+constexpr TransformSize kTransformSizesSmallerThan32x32[] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize32x8,
+    kTransformSize32x16, kTransformSize32x32};
+
+INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, FilterIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilterIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilterIntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_ENABLE_NEON
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest12bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+  return os << ToString(tx_size);
+}
+
+}  // namespace libgav1
diff --git a/src/dsp/intrapred_smooth.cc b/src/dsp/intrapred_smooth.cc
new file mode 100644 (file)
index 0000000..16b8274
--- /dev/null
@@ -0,0 +1,992 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_smooth.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int block_width, int block_height, typename Pixel>
+struct SmoothFuncs_C {
+  SmoothFuncs_C() = delete;
+
+  static void Smooth(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+  static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row,
+                             const void* left_column);
+  static void SmoothHorizontal(void* dest, ptrdiff_t stride,
+                               const void* top_row, const void* left_column);
+};
+
+constexpr uint8_t kSmoothWeights[] = {
+#include "src/dsp/smooth_weights.inc"
+};
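+
+// kSmoothWeights concatenates the weight tables for block dimensions 4, 8, 16,
+// 32, and 64. Since 4 + 8 + ... + N/2 == N - 4, the table for dimension N
+// starts at offset N - 4; see the weights_x/weights_y pointers below.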
+
+// SmoothFuncs_C::Smooth
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::Smooth(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel top_right = top[block_width - 1];
+  const Pixel bottom_left = left[block_height - 1];
+  static_assert(
+      block_width >= 4 && block_height >= 4,
+      "Weights for smooth predictor undefined for block width/height < 4");
+  const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+  const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+  const uint16_t scale_value = (1 << kSmoothWeightScale);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
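+  // Each output pixel blends a vertical interpolation between top[x] and
+  // bottom_left with a horizontal interpolation between left[y] and top_right;
+  // the extra bit in the final shift averages the two interpolations.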
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]);
+      uint32_t pred = weights_y[y] * top[x];
+      pred += weights_x[x] * left[y];
+      pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+      pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+      // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1)
+      // + 256. With the descale there's no need for saturation.
+      dst[x] = static_cast<Pixel>(
+          RightShiftWithRounding(pred, kSmoothWeightScale + 1));
+    }
+    dst += stride;
+  }
+}
+
+// SmoothFuncs_C::SmoothVertical
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel bottom_left = left[block_height - 1];
+  static_assert(block_height >= 4,
+                "Weights for smooth predictor undefined for block height < 4");
+  const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+  const uint16_t scale_value = (1 << kSmoothWeightScale);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(scale_value >= weights_y[y]);
+      uint32_t pred = weights_y[y] * top[x];
+      pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+      dst[x] =
+          static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+    }
+    dst += stride;
+  }
+}
+
+// SmoothFuncs_C::SmoothHorizontal
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel top_right = top[block_width - 1];
+  static_assert(block_width >= 4,
+                "Weights for smooth predictor undefined for block width < 4");
+  const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+  const uint16_t scale_value = (1 << kSmoothWeightScale);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(scale_value >= weights_x[x]);
+      uint32_t pred = weights_x[x] * left[y];
+      pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+      dst[x] =
+          static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+    }
+    dst += stride;
+  }
+}
+
+// -----------------------------------------------------------------------------
+
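+// Groups the SmoothFuncs_C instantiations for each supported transform size
+// so the INIT_* macros below can refer to them as DEFS::_WxH.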
+template <typename Pixel>
+struct SmoothDefs {
+  SmoothDefs() = delete;
+
+  using _4x4 = SmoothFuncs_C<4, 4, Pixel>;
+  using _4x8 = SmoothFuncs_C<4, 8, Pixel>;
+  using _4x16 = SmoothFuncs_C<4, 16, Pixel>;
+  using _8x4 = SmoothFuncs_C<8, 4, Pixel>;
+  using _8x8 = SmoothFuncs_C<8, 8, Pixel>;
+  using _8x16 = SmoothFuncs_C<8, 16, Pixel>;
+  using _8x32 = SmoothFuncs_C<8, 32, Pixel>;
+  using _16x4 = SmoothFuncs_C<16, 4, Pixel>;
+  using _16x8 = SmoothFuncs_C<16, 8, Pixel>;
+  using _16x16 = SmoothFuncs_C<16, 16, Pixel>;
+  using _16x32 = SmoothFuncs_C<16, 32, Pixel>;
+  using _16x64 = SmoothFuncs_C<16, 64, Pixel>;
+  using _32x8 = SmoothFuncs_C<32, 8, Pixel>;
+  using _32x16 = SmoothFuncs_C<32, 16, Pixel>;
+  using _32x32 = SmoothFuncs_C<32, 32, Pixel>;
+  using _32x64 = SmoothFuncs_C<32, 64, Pixel>;
+  using _64x16 = SmoothFuncs_C<64, 16, Pixel>;
+  using _64x32 = SmoothFuncs_C<64, 32, Pixel>;
+  using _64x64 = SmoothFuncs_C<64, 64, Pixel>;
+};
+
+using Defs = SmoothDefs<uint8_t>;
+
+// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS| of
+// the same size.
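+// For example, INIT_SMOOTH_WxH(Defs, 4, 4) assigns Defs::_4x4::Smooth,
+// Defs::_4x4::SmoothVertical and Defs::_4x4::SmoothHorizontal to the
+// kTransformSize4x4 entries, mirroring the per-size blocks further below.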
+#define INIT_SMOOTH_WxH(DEFS, W, H)                                       \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \
+      DEFS::_##W##x##H::Smooth;                                           \
+  dsp->intra_predictors[kTransformSize##W##x##H]                          \
+                       [kIntraPredictorSmoothVertical] =                  \
+      DEFS::_##W##x##H::SmoothVertical;                                   \
+  dsp->intra_predictors[kTransformSize##W##x##H]                          \
+                       [kIntraPredictorSmoothHorizontal] =                \
+      DEFS::_##W##x##H::SmoothHorizontal
+
+#define INIT_SMOOTH(DEFS)        \
+  INIT_SMOOTH_WxH(DEFS, 4, 4);   \
+  INIT_SMOOTH_WxH(DEFS, 4, 8);   \
+  INIT_SMOOTH_WxH(DEFS, 4, 16);  \
+  INIT_SMOOTH_WxH(DEFS, 8, 4);   \
+  INIT_SMOOTH_WxH(DEFS, 8, 8);   \
+  INIT_SMOOTH_WxH(DEFS, 8, 16);  \
+  INIT_SMOOTH_WxH(DEFS, 8, 32);  \
+  INIT_SMOOTH_WxH(DEFS, 16, 4);  \
+  INIT_SMOOTH_WxH(DEFS, 16, 8);  \
+  INIT_SMOOTH_WxH(DEFS, 16, 16); \
+  INIT_SMOOTH_WxH(DEFS, 16, 32); \
+  INIT_SMOOTH_WxH(DEFS, 16, 64); \
+  INIT_SMOOTH_WxH(DEFS, 32, 8);  \
+  INIT_SMOOTH_WxH(DEFS, 32, 16); \
+  INIT_SMOOTH_WxH(DEFS, 32, 32); \
+  INIT_SMOOTH_WxH(DEFS, 32, 64); \
+  INIT_SMOOTH_WxH(DEFS, 64, 16); \
+  INIT_SMOOTH_WxH(DEFS, 64, 32); \
+  INIT_SMOOTH_WxH(DEFS, 64, 64)
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_SMOOTH(Defs);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      Defs::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      Defs::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      Defs::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      Defs::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      Defs::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      Defs::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      Defs::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      Defs::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      Defs::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      Defs::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      Defs::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      Defs::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      Defs::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      Defs::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      Defs::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      Defs::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      Defs::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      Defs::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      Defs::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      Defs::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      Defs::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      Defs::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      Defs::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      Defs::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      Defs::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      Defs::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      Defs::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      Defs::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      Defs::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      Defs::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      Defs::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      Defs::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      Defs::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      Defs::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      Defs::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      Defs::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      Defs::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      Defs::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      Defs::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      Defs::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      Defs::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      Defs::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      Defs::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      Defs::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      Defs::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      Defs::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      Defs::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      Defs::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      Defs::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      Defs::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      Defs::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      Defs::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      Defs::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      Defs::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      Defs::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      Defs::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      Defs::_64x64::SmoothHorizontal;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using DefsHbd = SmoothDefs<uint16_t>;
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_SMOOTH(DefsHbd);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x64::SmoothHorizontal;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using DefsHbd = SmoothDefs<uint16_t>;
+
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_SMOOTH(DefsHbd);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x64::SmoothHorizontal;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+#undef INIT_SMOOTH_WxH
+#undef INIT_SMOOTH
+}  // namespace
+
+void IntraPredSmoothInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/intrapred_smooth.h b/src/dsp/intrapred_smooth.h
new file mode 100644 (file)
index 0000000..06454af
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each define lets a module
+// determine at compile time whether an implementation is needed.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_smooth_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_smooth_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+enum {
+  // Weights are quadratic from '1' to '1 / block_size', scaled by
+  // 2^kSmoothWeightScale.
+  kSmoothWeightScale = 8,
+};
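+// With kSmoothWeightScale = 8 this means the weights run from 255 down to
+// 256 / block_size (e.g. 64 for a 4-pixel edge); the exact values live in
+// src/dsp/smooth_weights.inc.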
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
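+// Callers are expected to invoke it once during single-threaded startup,
+// before any other thread reads the dsp table.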
+void IntraPredSmoothInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
diff --git a/src/dsp/intrapred_test.cc b/src/dsp/intrapred_test.cc
new file mode 100644 (file)
index 0000000..5753817
--- /dev/null
@@ -0,0 +1,913 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/intrapred_smooth.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+                          public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  IntraPredTestBase() {
+    switch (tx_size_) {
+      case kNumTransformSizes:
+        EXPECT_NE(tx_size_, kNumTransformSizes);
+        break;
+      default:
+        block_width_ = kTransformWidth[tx_size_];
+        block_height_ = kTransformHeight[tx_size_];
+        break;
+    }
+  }
+
+  IntraPredTestBase(const IntraPredTestBase&) = delete;
+  IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+  ~IntraPredTestBase() override = default;
+
+ protected:
+  struct IntraPredMem {
+    void Reset(libvpx_test::ACMRandom* rnd) {
+      ASSERT_NE(rnd, nullptr);
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      const int mask = (1 << bitdepth) - 1;
+      for (auto& r : ref_src) r = rnd->Rand16() & mask;
+      for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+      for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+      // Some directional predictors require top-right, bottom-left.
+      for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = rnd->Rand16() & mask;
+        top[i] = rnd->Rand16() & mask;
+      }
+      // TODO(jzern): reorder this and regenerate the digests after switching
+      // random number generators.
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      left[-1] = rnd->Rand16() & mask;
+      left[-2] = rnd->Rand16() & mask;
+      top[-2] = rnd->Rand16() & mask;
+      memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+      memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+      memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+             sizeof(top_mem[0]) * kTopMemPadding);
+    }
+
+    // Set ref_src, top-left, top and left to |pixel|.
+    void Set(const Pixel pixel) {
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      for (auto& r : ref_src) r = pixel;
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = top[i] = pixel;
+      }
+    }
+
+    // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+    static constexpr int kTopMemPadding = 7;
+    alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+    alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+    alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+    alignas(
+        kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+  };
+
+  void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+  const TransformSize tx_size_ = GetParam();
+  int block_width_;
+  int block_height_;
+  IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// IntraPredTest
+
+template <int bitdepth, typename Pixel>
+class IntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  IntraPredTest() = default;
+  IntraPredTest(const IntraPredTest&) = delete;
+  IntraPredTest& operator=(const IntraPredTest&) = delete;
+  ~IntraPredTest() override = default;
+
+ protected:
+  using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+  using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+  void SetUp() override {
+    IntraPredTestBase<bitdepth, Pixel>::SetUp();
+    IntraPredInit_C();
+    IntraPredSmoothInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    memcpy(base_intrapreds_, dsp->intra_predictors[tx_size_],
+           sizeof(base_intrapreds_));
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      memset(base_intrapreds_, 0, sizeof(base_intrapreds_));
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      IntraPredInit_SSE4_1();
+      IntraPredSmoothInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraPredInit_NEON();
+      IntraPredSmoothInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    memcpy(cur_intrapreds_, dsp->intra_predictors[tx_size_],
+           sizeof(cur_intrapreds_));
+
+    for (int i = 0; i < kNumIntraPredictors; ++i) {
+      // Skip functions that haven't been specialized for this particular
+      // architecture.
+      if (cur_intrapreds_[i] == base_intrapreds_[i]) {
+        cur_intrapreds_[i] = nullptr;
+      }
+    }
+  }
+
+  // These tests modify intra_pred_mem_.
+  void TestSpeed(const char* const digests[kNumIntraPredictors], int num_runs);
+  void TestSaturatedValues();
+  void TestRandomValues();
+
+  IntraPredictorFunc base_intrapreds_[kNumIntraPredictors];
+  IntraPredictorFunc cur_intrapreds_[kNumIntraPredictors];
+};
+
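+// Runs each available predictor |num_runs| times over deterministic
+// pseudo-random input and validates an MD5 digest of the result, so this
+// doubles as a correctness check alongside the timing measurement.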
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestSpeed(
+    const char* const digests[kNumIntraPredictors], const int num_runs) {
+  ASSERT_NE(digests, nullptr);
+  const auto* const left =
+      reinterpret_cast<const uint8_t*>(intra_pred_mem_.left_mem + 16);
+  const auto* const top =
+      reinterpret_cast<const uint8_t*>(intra_pred_mem_.top_mem + 16);
+
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  intra_pred_mem_.Reset(&rnd);
+
+  for (int i = 0; i < kNumIntraPredictors; ++i) {
+    if (cur_intrapreds_[i] == nullptr) continue;
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const absl::Time start = absl::Now();
+    for (int run = 0; run < num_runs; ++run) {
+      const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+      cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left);
+    }
+    const absl::Duration elapsed_time = absl::Now() - start;
+    test_utils::CheckMd5Digest(ToString(tx_size_),
+                               ToString(static_cast<IntraPredictor>(i)),
+                               digests[i], intra_pred_mem_.dst,
+                               sizeof(intra_pred_mem_.dst), elapsed_time);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+  Pixel* const left = intra_pred_mem_.left_mem + 16;
+  Pixel* const top = intra_pred_mem_.top_mem + 16;
+  const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1);
+  intra_pred_mem_.Set(kMaxPixel);
+
+  // Skip DcFill.
+  for (int i = 1; i < kNumIntraPredictors; ++i) {
+    if (cur_intrapreds_[i] == nullptr) continue;
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+    cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left);
+    if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+                                   block_width_, block_height_, kMaxBlockSize,
+                                   kMaxBlockSize, true)) {
+      ADD_FAILURE() << "Expected " << ToString(static_cast<IntraPredictor>(i))
+                    << " to produce a block containing '"
+                    << static_cast<int>(kMaxPixel) << "'";
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+  // Use an alternate seed to differentiate this test from TestSpeed().
+  libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+  for (int i = 0; i < kNumIntraPredictors; ++i) {
+    // Skip the 'C' test case as this is used as the reference.
+    if (base_intrapreds_[i] == nullptr) continue;
+    if (cur_intrapreds_[i] == nullptr) continue;
+    // It may be worthwhile to temporarily increase this loop size when testing
+    // changes that specifically affect this test.
+    for (int n = 0; n < 10000; ++n) {
+      intra_pred_mem_.Reset(&rnd);
+
+      memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+             sizeof(intra_pred_mem_.dst));
+      const Pixel* const top = intra_pred_mem_.top_mem + 16;
+      const Pixel* const left = intra_pred_mem_.left_mem + 16;
+      const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+      base_intrapreds_[i](intra_pred_mem_.ref_src, stride, top, left);
+      cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left);
+      if (!test_utils::CompareBlocks(
+              intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+              block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+        ADD_FAILURE() << "Result from optimized version of "
+                      << ToString(static_cast<IntraPredictor>(i))
+                      << " differs from reference in iteration #" << n;
+        break;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
+using IntraPredTest8bpp = IntraPredTest<8, uint8_t>;
+
+const char* const* GetIntraPredDigests8bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumIntraPredictors] = {
+      "7b1c762e28747f885d2b7d83cb8aa75c", "73353f179207f1432d40a132809e3a50",
+      "80c9237c838b0ec0674ccb070df633d5", "1cd79116b41fda884e7fa047f5eb14df",
+      "33211425772ee539a59981a2e9dc10c1", "d6f5f65a267f0e9a2752e8151cc1dcd7",
+      "7ff8c762cb766eb0665682152102ce4b", "2276b861ae4599de15938651961907ec",
+      "766982bc69f4aaaa8e71014c2dc219bc", "e2c31b5fd2199c49e17c31610339ab3f",
+  };
+  static const char* const kDigests4x8[kNumIntraPredictors] = {
+      "0a0d8641ecfa0e82f541acdc894d5574", "1a40371af6cff9c278c5b0def9e4b3e7",
+      "3631a7a99569663b514f15b590523822", "646c7b592136285bd31501494e7393e7",
+      "ecbe89cc64dc2688123d3cfe865b5237", "79048e70ecbb7d43a4703f62718588c0",
+      "f3de11bf1198a00675d806d29c41d676", "32bb6cd018f6e871c342fcc21c7180cf",
+      "6f076a1e5ab3d69cf08811d62293e4be", "2a84460a8b189b4589824cf6b3b39954",
+  };
+  static const char* const kDigests4x16[kNumIntraPredictors] = {
+      "cb8240be98444ede5ae98ca94afc1557", "460acbcf825a1fa0d8f2aa6bf2d6a21c",
+      "7896fdbbfe538dce1dc3a5b0873d74b0", "504aea29c6b27f21555d5516b8de2d8a",
+      "c5738e7fa82b91ea0e39232120da56ea", "19abbd934c243a6d9df7585d81332dd5",
+      "9e42b7b342e45c842dfa8aedaddbdfaa", "0e9eb07a89f8bf96bc219d5d1c3d9f6d",
+      "659393c31633e0f498bae384c9df5c7b", "bee3a28312da99dd550ec309ae4fff25",
+  };
+  static const char* const kDigests8x4[kNumIntraPredictors] = {
+      "5950744064518f77867c8e14ebd8b5d7", "46b6cbdc76efd03f4ac77870d54739f7",
+      "efe21fd1b98cb1663950e0bf49483b3b", "3c647b64760b298092cbb8e2f5c06bfd",
+      "c3595929687ffb04c59b128d56e2632f", "d89ad2ddf8a74a520fdd1d7019fd75b4",
+      "53907cb70ad597ee5885f6c58201f98b", "09d2282a29008b7fb47eb60ed6653d06",
+      "e341fc1c910d7cb2dac5dbc58b9c9af9", "a8fabd4c259b607a90a2e4d18cae49de",
+  };
+  static const char* const kDigests8x8[kNumIntraPredictors] = {
+      "06fb7cb52719855a38b4883b4b241749", "2013aafd42a4303efb553e42264ab8b0",
+      "2f070511d5680c12ca73a20e47fd6e23", "9923705af63e454392625794d5459fe0",
+      "04007a0d39778621266e2208a22c4fac", "2d296c202d36b4a53f1eaddda274e4a1",
+      "c87806c220d125c7563c2928e836fbbd", "339b49710a0099087e51ab5afc8d8713",
+      "c90fbc020afd9327bf35dccae099bf77", "95b356a7c346334d29294a5e2d13cfd9",
+  };
+  static const char* const kDigests8x16[kNumIntraPredictors] = {
+      "3c5a4574d96b5bb1013429636554e761", "8cf56b17c52d25eb785685f2ab48b194",
+      "7911e2e02abfbe226f17529ac5db08fc", "064e509948982f66a14293f406d88d42",
+      "5c443aa713891406d5be3af4b3cf67c6", "5d2cb98e532822ca701110cda9ada968",
+      "3d58836e17918b8890012dd96b95bb9d", "20e8d61ddc451b9e553a294073349ffd",
+      "a9aa6cf9d0dcf1977a1853ccc264e40b", "103859f85750153f47b81f68ab7881f2",
+  };
+  static const char* const kDigests8x32[kNumIntraPredictors] = {
+      "b393a2db7a76acaccc39e04d9dc3e8ac", "bbda713ee075a7ef095f0f479b5a1f82",
+      "f337dce3980f70730d6f6c2c756e3b62", "796189b05dc026e865c9e95491b255d1",
+      "ea932c21e7189eeb215c1990491320ab", "a9fffdf9455eba5e3b01317cae140289",
+      "9525dbfdbf5fba61ef9c7aa5fe887503", "8c6a7e3717ff8a459f415c79bb17341c",
+      "3761071bfaa2363a315fe07223f95a2d", "0e5aeb9b3f485b90df750469f60c15aa",
+  };
+  static const char* const kDigests16x4[kNumIntraPredictors] = {
+      "1c0a950b3ac500def73b165b6a38467c", "95e7f7300f19da280c6a506e40304462",
+      "28a6af15e31f76d3ff189012475d78f5", "e330d67b859bceef62b96fc9e1f49a34",
+      "36eca3b8083ce2fb5f7e6227dfc34e71", "08f567d2abaa8e83e4d9b33b3f709538",
+      "dc2d0ba13aa9369446932f03b53dc77d", "9ab342944c4b1357aa79d39d7bebdd3a",
+      "77ec278c5086c88b91d68eef561ed517", "60fbe11bfe216c182aaacdec326c4dae",
+  };
+  static const char* const kDigests16x8[kNumIntraPredictors] = {
+      "053a2bc4b5b7287fee524af4e77f077a", "619b720b13f14f32391a99ea7ff550d5",
+      "728d61c11b06baf7fe77881003a918b9", "889997b89a44c9976cb34f573e2b1eea",
+      "b43bfc31d1c770bb9ca5ca158c9beec4", "9d3fe9f762e0c6e4f114042147c50c7f",
+      "c74fdd7c9938603b01e7ecf9fdf08d61", "870c7336db1102f80f74526bd5a7cf4e",
+      "3fd5354a6190903d6a0b661fe177daf6", "409ca6b0b2558aeadf5ef2b8a887e67a",
+  };
+  static const char* const kDigests16x16[kNumIntraPredictors] = {
+      "1fa9e2086f6594bda60c30384fbf1635", "2098d2a030cd7c6be613edc74dc2faf8",
+      "f3c72b0c8e73f1ddca04d14f52d194d8", "6b31f2ee24cf88d3844a2fc67e1f39f3",
+      "d91a22a83575e9359c5e4871ab30ddca", "24c32a0d38b4413d2ef9bf1f842c8634",
+      "6e9e47bf9da9b2b9ae293e0bbd8ff086", "968b82804b5200b074bcdba9718140d4",
+      "4e6d7e612c5ae0bbdcc51a453cd1db3f", "ce763a41977647d072f33e277d69c7b9",
+  };
+  static const char* const kDigests16x32[kNumIntraPredictors] = {
+      "01afd04432026ff56327d6226b720be2", "a6e7be906cc6f1e7a520151bfa7c303d",
+      "bc05c46f18d0638f0228f1de64f07cd5", "204e613e429935f721a5b29cec7d44bb",
+      "aa0a7c9a7482dfc06d9685072fc5bafd", "ffb60f090d83c624bb4f7dc3a630ac4f",
+      "36bcb9ca9bb5eac520b050409de25da5", "34d9a5dd3363668391bc3bd05b468182",
+      "1e149c28db8b234e43931c347a523794", "6e8aff02470f177c3ff4416db79fc508",
+  };
+  static const char* const kDigests16x64[kNumIntraPredictors] = {
+      "727797ef15ccd8d325476fe8f12006a3", "f77c544ac8035e01920deae40cee7b07",
+      "12b0c69595328c465e0b25e0c9e3e9fc", "3b2a053ee8b05a8ac35ad23b0422a151",
+      "f3be77c0fe67eb5d9d515e92bec21eb7", "f1ece6409e01e9dd98b800d49628247d",
+      "efd2ec9bfbbd4fd1f6604ea369df1894", "ec703de918422b9e03197ba0ed60a199",
+      "739418efb89c07f700895deaa5d0b3e3", "9943ae1bbeeebfe1d3a92dc39e049d63",
+  };
+  static const char* const kDigests32x8[kNumIntraPredictors] = {
+      "4da55401331ed98acec0c516d7307513", "0ae6f3974701a5e6c20baccd26b4ca52",
+      "79b799f1eb77d5189535dc4e18873a0e", "90e943adf3de4f913864dce4e52b4894",
+      "5e1b9cc800a89ef45f5bdcc9e99e4e96", "3103405df20d254cbf32ac30872ead4b",
+      "648550e369b77687bff3c7d6f249b02f", "f9f73bcd8aadfc059fa260325df957a1",
+      "204cef70d741c25d4fe2b1d10d2649a5", "04c05e18488496eba64100faa25e8baf",
+  };
+  static const char* const kDigests32x16[kNumIntraPredictors] = {
+      "86ad1e1047abaf9959150222e8f19593", "1908cbe04eb4e5c9d35f1af7ffd7ee72",
+      "6ad3bb37ebe8374b0a4c2d18fe3ebb6a", "08d3cfe7a1148bff55eb6166da3378c6",
+      "656a722394764d17b6c42401b9e0ad3b", "4aa00c192102efeb325883737e562f0d",
+      "9881a90ca88bca4297073e60b3bb771a", "8cd74aada398a3d770fc3ace38ecd311",
+      "0a927e3f5ff8e8338984172cc0653b13", "d881d68b4eb3ee844e35e04ad6721f5f",
+  };
+  static const char* const kDigests32x32[kNumIntraPredictors] = {
+      "1303ca680644e3d8c9ffd4185bb2835b", "2a4d9f5cc8da307d4cf7dc021df10ba9",
+      "ced60d3f4e4b011a6a0314dd8a4b1fd8", "ced60d3f4e4b011a6a0314dd8a4b1fd8",
+      "1464b01aa928e9bd82c66bad0f921693", "90deadfb13d7c3b855ba21b326c1e202",
+      "af96a74f8033dff010e53a8521bc6f63", "9f1039f2ef082aaee69fcb7d749037c2",
+      "3f82893e478e204f2d254b34222d14dc", "ddb2b95ffb65b84dd4ff1f7256223305",
+  };
+  static const char* const kDigests32x64[kNumIntraPredictors] = {
+      "e1e8ed803236367821981500a3d9eebe", "0f46d124ba9f48cdd5d5290acf786d6d",
+      "4e2a2cfd8f56f15939bdfc753145b303", "0ce332b343934b34cd4417725faa85cb",
+      "1d2f8e48e3adb7c448be05d9f66f4954", "9fb2e176636a5689b26f73ca73fcc512",
+      "e720ebccae7e25e36f23da53ae5b5d6a", "86fe4364734169aaa4520d799890d530",
+      "b1870290764bb1b100d1974e2bd70f1d", "ce5b238e19d85ef69d85badfab4e63ae",
+  };
+  static const char* const kDigests64x16[kNumIntraPredictors] = {
+      "de1b736e9d99129609d6ef3a491507a0", "516d8f6eb054d74d150e7b444185b6b9",
+      "69e462c3338a9aaf993c3f7cfbc15649", "821b76b1494d4f84d20817840f719a1a",
+      "fd9b4276e7affe1e0e4ce4f428058994", "cd82fd361a4767ac29a9f406b480b8f3",
+      "2792c2f810157a4a6cb13c28529ff779", "1220442d90c4255ba0969d28b91e93a6",
+      "c7253e10b45f7f67dfee3256c9b94825", "879792198071c7e0b50b9b5010d8c18f",
+  };
+  static const char* const kDigests64x32[kNumIntraPredictors] = {
+      "e48e1ac15e97191a8fda08d62fff343e", "80c15b303235f9bc2259027bb92dfdc4",
+      "538424b24bd0830f21788e7238ca762f", "a6c5aeb722615089efbca80b02951ceb",
+      "12604b37875533665078405ef4582e35", "0048afa17bd3e1632d68b96048836530",
+      "07a0cfcb56a5eed50c4bd6c26814336b", "529d8a070de5bc6531fa3ee8f450c233",
+      "33c50a11c7d78f72434064f634305e95", "e0ef7f0559c1a50ec5a8c12011b962f7",
+  };
+  static const char* const kDigests64x64[kNumIntraPredictors] = {
+      "a1650dbcd56e10288c3e269eca37967d", "be91585259bc37bf4dc1651936e90b3e",
+      "afe020786b83b793c2bbd9468097ff6e", "6e1094fa7b50bc813aa2ba29f5df8755",
+      "9e5c34f3797e0cdd3cd9d4c05b0d8950", "bc87be7ac899cc6a28f399d7516c49fe",
+      "9811fd0d2dd515f06122f5d1bd18b784", "3c140e466f2c2c0d9cb7d2157ab8dc27",
+      "9543de76c925a8f6adc884cc7f98dc91", "df1df0376cc944afe7e74e94f53e575a",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize16x64:
+      return kDigests16x64;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    case kTransformSize32x64:
+      return kDigests32x64;
+    case kTransformSize64x16:
+      return kDigests64x16;
+    case kTransformSize64x32:
+      return kDigests64x32;
+    case kTransformSize64x64:
+      return kDigests64x64;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(IntraPredTest8bpp, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
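+  // Illustrative arithmetic: the run count scales inversely with block area,
+  // e.g. a 4x4 block gets 2e9 / 16 = 125,000,000 iterations.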
+  TestSpeed(GetIntraPredDigests8bpp(tx_size_), num_runs);
+}
+
+TEST_P(IntraPredTest8bpp, FixedInput) {
+  TestSpeed(GetIntraPredDigests8bpp(tx_size_), 1);
+}
+
+TEST_P(IntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(IntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using IntraPredTest10bpp = IntraPredTest<10, uint16_t>;
+
+const char* const* GetIntraPredDigests10bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumIntraPredictors] = {
+      "432bf9e762416bec582cb3654cbc4545", "8b9707ff4d506e0cb326f2d9a8d78705",
+      "a076275258cc5af87ed8b075136fb219", "f9587004012a8d2cecaa347331ccdf96",
+      "1c4e6890c5e6eed495fe54a6b6df8d6f", "0ae15fae8969a3c972ee895f325955a3",
+      "97db177738b831da8066df4f3fb7adbd", "4add5685b8a56991c9dce4ff7086ec25",
+      "75c6a655256188e378e70658b8f1631f", "14a27db20f9d5594ef74a7ea10c3e5ef",
+  };
+  static const char* const kDigests4x8[kNumIntraPredictors] = {
+      "9cbd7c18aca2737fa41db27150798819", "13d1e734692e27339c10b07da33c1113",
+      "0617cf74e2dd5d34ea517af1767fa47e", "c6a7b01228ccdf74af8528ef8f5f55c6",
+      "13b05d87b3d566b2f7a4b332cd8a762e", "b26ae0e8da1fe8989dfe2900fa2c3847",
+      "c30f3acdd386bdac91028fe48b751810", "04d2baf5192c5af97ca18d3b9b0d5968",
+      "a0ef82983822fc815bf1e8326cd41e33", "20bf218bae5f6b5c6d56b85f3f9bbadb",
+  };
+  static const char* const kDigests4x16[kNumIntraPredictors] = {
+      "d9b47bdddaa5e22312ff9ece7a3cae08", "cb76c79971b502dd8999a7047b3e2f86",
+      "3b09a3ff431d03b379acfdc444602540", "88608f6fcd687831e871053723cf76c3",
+      "a7bd2a17de1cf19c9a4b2c550f277a5c", "29b389f564f266a67687b8d2bc750418",
+      "4680847c30fe93c06f87e2ee1da544d6", "0e4eda11e1fe6ebe8526c2a2c5390bbb",
+      "bf3e20197282885acabb158f3a77ba59", "fccea71d1a253316b905f4a073c84a36",
+  };
+  static const char* const kDigests8x4[kNumIntraPredictors] = {
+      "05ba0ed96aac48cd94e7597f12184320", "d97d04e791904d3cedc34d5430a4d1d2",
+      "49217081a169c2d30b0a43f816d0b58b", "09e2a6a6bfe35b83e9434ee9c8dcf417",
+      "4b03c8822169ee4fa058513d65f0e32f", "cabdeebc923837ee3f2d3480354d6a81",
+      "957eda610a23a011ed25976aee94eaf0", "4a197e3dfce1f0d3870138a9b66423aa",
+      "18c0d0fbe0e96a0baf2f98fa1908cbb9", "21114e5737328cdbba9940e4f85a0855",
+  };
+  static const char* const kDigests8x8[kNumIntraPredictors] = {
+      "430e99eecda7e6434e1973dbdcc2a29d", "88864d7402c09b57735db49c58707304",
+      "8312f80b936380ceb51375e29a4fd75d", "472a7ed9c68bdbd9ecca197b7a8b3f01",
+      "4f66ee4dc0cb752c3b65d576cd06bb5c", "36383d6f61799143470129e2d5241a6f",
+      "c96279406c8d2d02771903e93a4e8d37", "4fb64f9700ed0bf08fbe7ab958535348",
+      "c008c33453ac9cf8c42ae6ec88f9941c", "39c401a9938b23e318ae7819e458daf1",
+  };
+  static const char* const kDigests8x16[kNumIntraPredictors] = {
+      "bda6b75fedfe0705f9732ff84c918672", "4ff130a47429e0762386557018ec10b2",
+      "8156557bf938d8e3a266318e57048fc5", "bdfa8e01a825ec7ae2d80519e3c94eec",
+      "108fc8e5608fe09f9cc30d7a52cbc0c1", "a2271660af5424b64c6399ca5509dee1",
+      "b09af9729f39516b28ff62363f8c0cb2", "4fe67869dac99048dfcf4d4e621884ec",
+      "311f498369a9c98f77a961bf91e73e65", "d66e78b9f41d5ee6a4b25e37ec9af324",
+  };
+  static const char* const kDigests8x32[kNumIntraPredictors] = {
+      "26c45325f02521e7e5c66c0aa0819329", "79dfb68513d4ccd2530c485f0367858e",
+      "8288e99b4d738b13956882c3ad3f03fe", "7c4993518b1620b8be8872581bb72239",
+      "2b1c3126012d981f787ed0a2601ee377", "051ba9f0c4d4fecb1fcd81fdea94cae4",
+      "320362239ad402087303a4df39512bb1", "210df35b2055c9c01b9e3e5ae24e524b",
+      "f8536db74ce68c0081bbd8799dac25f9", "27f2fe316854282579906d071af6b705",
+  };
+  static const char* const kDigests16x4[kNumIntraPredictors] = {
+      "decff67721ff7e9e65ec641e78f5ccf3", "99e3b2fbdabfa9b76b749cfb6530a9fd",
+      "accdb3d25629916963a069f1e1c0e061", "ad42855e9146748b0e235b8428487b4b",
+      "53025e465f267e7af2896ebd028447a0", "577d26fcd2d655cc77a1f1f875648699",
+      "7a61a3619267221b448b20723840e9f0", "fb4ccc569bdae3614e87bc5be1e84284",
+      "b866095d8a3e6910cc4f92f8d8d6075a", "6ba9013cba1624872bfbac111e8d344a",
+  };
+  static const char* const kDigests16x8[kNumIntraPredictors] = {
+      "2832156bd076c75f8be5622f34cb3efe", "da70e516f5a8842dd4965b80cd8d2a76",
+      "c3e137c6d79c57be2073d1eda22c8d1e", "8c5d28c7b3301b50326582dd7f89a175",
+      "9d8558775155b201cd178ab61458b642", "ecbddb9c6808e0c609c8fe537b7f7408",
+      "29a123c22cb4020170f9a80edf1208da", "653d0cd0688aa682334156f7b4599b34",
+      "1bfa66ae92a22a0346511db1713fe7df", "1802ad1e657e7fc08fc063342f471ca1",
+  };
+  static const char* const kDigests16x16[kNumIntraPredictors] = {
+      "2270c626de9d49769660ae9184a6428f", "9f069625cdcdd856e2e7ec19ff4fcd50",
+      "34167b9c413362a377aa7b1faf92ae6d", "3cec2b23d179765daea8dfb87c9efdd5",
+      "daa8f0863a5df2aef2b20999961cc8f8", "d9e4dd4bc63991e4f09cb97eb25f4db4",
+      "4e1a182fc3fcf5b9f5a73898f81c2004", "c58e4275406c9fd1c2a74b40c27afff0",
+      "b8092796fd4e4dd9d2b92afb770129ba", "75424d1f18ff00c4093743d033c6c9b6",
+  };
+  static const char* const kDigests16x32[kNumIntraPredictors] = {
+      "5aa050947f3d488537f5a68c23bb135b", "9e66143a2c3863b6fe171275a192d378",
+      "86b0c4777625e84d52913073d234f860", "9e2144fcf2107c76cec4241416bbecd5",
+      "c72be592efc72c3c86f2359b6f622aba", "c4e0e735545f78f43e21e9c39eab7b8f",
+      "52122e7c84a4bab67a8a359efb427023", "7b5fd8bb7e0744e81fd6fa4ed4c2e0fb",
+      "a9950d110bffb0411a8fcd1262dceef0", "2a2dd496f01f5d87f257ed202a703cbe",
+  };
+  static const char* const kDigests16x64[kNumIntraPredictors] = {
+      "eeb1b873e81ca428b11f162bd5b28843", "39ce7d22791f82562b0ca1e0afdf1604",
+      "6bd6bdac8982a4b84613f9963d35d5e9", "a9ac2438e87522621c7e6fe6d02c01ab",
+      "a8b9c471fe6c66ed0717e77fea77bba1", "e050b6aa38aee6e951d3be5a94a8abd0",
+      "3c5ecc31aa45e8175d37e90af247bca6", "30c0f9e412ea726970f575f910edfb94",
+      "f3d96395816ce58fb98480a5b4c32ab2", "9c14811957e013fb009dcd4a3716b338",
+  };
+  static const char* const kDigests32x8[kNumIntraPredictors] = {
+      "d6560d7fc9ae9bd7c25e2983b4a825e3", "90a67154bbdc26cd06ab0fa25fff3c53",
+      "c42d37c5a634e68fafc982626842db0b", "ecc8646d258cfa431facbc0dba168f80",
+      "9f3c167b790b52242dc8686c68eac389", "62dc3bc34406636ccec0941579461f65",
+      "5c0f0ebdb3c936d4decc40d5261aec7c", "dbfc0f056ca25e0331042da6d292e10a",
+      "14fa525d74e6774781198418d505c595", "5f95e70db03da9ed70cd79e23f19199c",
+  };
+  static const char* const kDigests32x16[kNumIntraPredictors] = {
+      "dfe3630aa9eeb1adcc8604269a309f26", "ba6180227d09f5a573f69dc6ee1faf80",
+      "03edea9d71ca3d588e1a0a69aecdf555", "2c8805415f44b4fac6692090dc1b1ddd",
+      "18efd17ed72a6e92ef8b0a692cf7a2e3", "63a6e0abfb839b43c68c23b2c43c8918",
+      "be15479205bb60f5a17baaa81a6b47ad", "243d21e1d9f9dd2b981292ac7769315a",
+      "21de1cb5269e0e1d08930c519e676bf7", "73065b3e27e9c4a3a6d043712d3d8b25",
+  };
+  static const char* const kDigests32x32[kNumIntraPredictors] = {
+      "c3136bb829088e33401b1affef91f692", "68bbcf93d17366db38bbc7605e07e322",
+      "2786be5fb7c25eeec4d2596c4154c3eb", "25ac7468e691753b8291be859aac7493",
+      "a6805ce21bfd26760e749efc8f590fa3", "5a38fd324b466e8ac43f5e289d38107e",
+      "dd0628fc5cc920b82aa941378fa907c8", "8debadbdb2dec3dc7eb43927e9d36998",
+      "61e1bc223c9e04c64152cc4531b6c099", "900b00ac1f20c0a8d22f8b026c0ee1cc",
+  };
+  static const char* const kDigests32x64[kNumIntraPredictors] = {
+      "5a591b2b83f0a6cce3c57ce164a5f983", "f42167ec516102b83b2c5176df57316b",
+      "58f3772d3df511c8289b340beb178d96", "c24166e7dc252d34ac6f92712956d751",
+      "7dca3acfe2ea09e6292a9ece2078b827", "5c029235fc0820804e40187d2b22a96e",
+      "375572944368afbc04ca97dab7fb3328", "8867235908736fd99c4022e4ed604e6e",
+      "63ec336034d62846b75558c49082870f", "46f35d85eb8499d61bfeac1c49e52531",
+  };
+  static const char* const kDigests64x16[kNumIntraPredictors] = {
+      "67755882209304659a0e6bfc324e16b9", "cd89b272fecb5f23431b3f606f590722",
+      "9bcff7d971a4af0a2d1cac6d66d83482", "d8d6bb55ebeec4f03926908d391e15ba",
+      "0eb5b5ced3e7177a1dd6a1e72e7a7d21", "92b47fe431d9cf66f9e601854f0f3017",
+      "7dc599557eddb2ea480f86fc89c76b30", "4f40175676c164320fe8005440ad9217",
+      "b00eacb24081a041127f136e9e5983ec", "cb0ab76a5e90f2eb75c38b99b9833ff8",
+  };
+  static const char* const kDigests64x32[kNumIntraPredictors] = {
+      "21d873011d1b4ef1daedd9aa8c6938ea", "4866da21db0261f738903d97081cb785",
+      "a722112233a82595a8d001a4078b834d", "24c7a133c6fcb59129c3782ef908a6c1",
+      "490e40505dd255d3a909d8a72c280cbc", "2afe719fb30bf2a664829bb74c8f9e2a",
+      "623adad2ebb8f23e355cd77ace4616cd", "d6092541e9262ad009bef79a5d350a86",
+      "ae86d8fba088683ced8abfd7e1ddf380", "32aa8aa21f2f24333d31f99e12b95c53",
+  };
+  static const char* const kDigests64x64[kNumIntraPredictors] = {
+      "6d88aeb40dfe3ac43c68808ca3c00806", "6a75d88ac291d6a3aaf0eec0ddf2aa65",
+      "30ef52d7dc451affdd587c209f5cb2dd", "e073f7969f392258eaa907cf0636452a",
+      "de10f07016a2343bcd3a9deb29f4361e", "dc35ff273fea4355d2c8351c2ed14e6e",
+      "01b9a545968ac75c3639ddabb837fa0b", "85c98ed9c0ea1523a15281bc9a909b8c",
+      "4c255f7ef7fd46db83f323806d79dca4", "fe2fe6ffb19cb8330e2f2534271d6522",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize16x64:
+      return kDigests16x64;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    case kTransformSize32x64:
+      return kDigests32x64;
+    case kTransformSize64x16:
+      return kDigests64x16;
+    case kTransformSize64x32:
+      return kDigests64x32;
+    case kTransformSize64x64:
+      return kDigests64x64;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(IntraPredTest10bpp, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetIntraPredDigests10bpp(tx_size_), num_runs);
+}
+
+TEST_P(IntraPredTest10bpp, FixedInput) {
+  TestSpeed(GetIntraPredDigests10bpp(tx_size_), 1);
+}
+
+TEST_P(IntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(IntraPredTest10bpp, Random) { TestRandomValues(); }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using IntraPredTest12bpp = IntraPredTest<12, uint16_t>;
+
+const char* const* GetIntraPredDigests12bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumIntraPredictors] = {
+      "f7008e0f65bdeed97375ae5e98e3309b", "a34cc5d9d1ef875df4ee2ce010d0a80a",
+      "74f615beeb217ad317ced813851be36a", "b3312e86313805b061c66a08e09de653",
+      "2db47240c95530b39084bdacccf4bb8e", "76bb839cac394b5777c64b6d4b570a27",
+      "a74ee60527be86059e822f7463f49ad5", "b157a40aaa14391c237471ba6d148a50",
+      "d4f7bd2e97e2b23f7a6a059837a10b2a", "8a9bcb30e9aff59b6feef5d1bf546d28",
+  };
+  static const char* const kDigests4x8[kNumIntraPredictors] = {
+      "4c2a59e1d4a58c129c709f05d1a83f4a", "5fbedd99a90a20727195dfbe8f9969ad",
+      "d4645e21ccf5f6d3c4ca7a3d9b0156ba", "98aa17ea5423192c81a04afd2d2669ed",
+      "67dad5b5eefdeb2af1e4d3875b282c6c", "881dcafd6323509fb80cd5bbdf2870c4",
+      "03ece373dfd56bd2fd86ad00ad6f5000", "41b28f2578d2ed7f38e708d57b231948",
+      "9f935505190f52ff4da9556e43f607be", "815700d2abb055bce6902d130e77416d",
+  };
+  static const char* const kDigests4x16[kNumIntraPredictors] = {
+      "bfc47cd4eef143a6ebf517730756a718", "ef07a3af3e353f9dfaebc48c8ac92c82",
+      "ceec5d9d24254efd3c6a00cbf11dd24d", "4e07f512a69cf95608c3c0c3013ed808",
+      "cedb7c900bb6839026bf79d054edb4fc", "48d958a18a019809f12eb2ad2eb358bc",
+      "8f296f4b9fb621a910368609cc2cccdf", "073a6f2ca8a23d6131ff97e2a3b736e1",
+      "f4772cc60b68c4f958c08c0fd8eb8d48", "2f8946cf19abecf0fda3addbfb8f9dcf",
+  };
+  static const char* const kDigests8x4[kNumIntraPredictors] = {
+      "4f245b07a91e6d604da9f22cf277d6f1", "a6dc25d1e24ba9e842c312f67eea211d",
+      "0475204441f44ea95bfd69c6e04eaed8", "313bcf1e2fc762d31ff765d3c18a6f67",
+      "7e9223ece684a1885c2108741052c6c8", "79f1e6f070d9b1d0f1de2ff77bccc0dc",
+      "63adca1101ee4799b1cfa26d88aa0657", "e8b940a5e39ea5313930c903464de843",
+      "42a8e470d3b000f4f57c44c632f0051b", "e8a57663f73da3d4320f8e82a3fecfc2",
+  };
+  static const char* const kDigests8x8[kNumIntraPredictors] = {
+      "7fa3c8bdd9ce04dc4df27863499cf4d4", "83f1312edd9af928a1cef60613730bc3",
+      "ceb35042adc6095a545b490f20e5d81b", "73aa503f329a055ff59a24093e682c41",
+      "14a9a427525ec38d2eb13e698728e911", "9143ddf66234e74acc156565d684fcac",
+      "05182bbe4fd90f3b496033ee5b7c54f9", "d9c6184c23af1f5a903a4a00539b883a",
+      "c4c2d4000ca2defc7a8169215121d9fc", "0b938bc7782b32796bffece28d17bb69",
+  };
+  static const char* const kDigests8x16[kNumIntraPredictors] = {
+      "50197f063138616c37ef09f8bf8a3016", "ef2008f6d9f2176feb17b7d4312022e2",
+      "0d243ffbba0a2e65738d7ee768620c36", "51b52564a2733c2c56ba319db5d8e3b8",
+      "0e2b41482ac1347c3bb6d0e394fe7bec", "edb43c19850452e6b20dfb2c001adb0b",
+      "6cd29f537b5e4180f5aaefd9140b65ef", "6808f618bdff33e0f3d6db60ea487bc1",
+      "0303c17746192b0c52b4d75ea97ca24d", "225d1debd7828fa01bc9a610a443cda9",
+  };
+  static const char* const kDigests8x32[kNumIntraPredictors] = {
+      "dc047c402c6ac4014d621fbd41b460d5", "49eb33c3a112f059e02d6d4b99da8b41",
+      "c906c9105a406ae6c63e69f57ed2fc7c", "2ead452591ddd2455660f96ce79314ab",
+      "437a2a78562752ee8291227f88e0323a", "51834dbdcf1e89667ffbb931bec9006c",
+      "959c1778e11a7c61a5a97176c79ecb6a", "2e51e44dd1953fc6fccc3b1c1ca602ed",
+      "7f94114cddb0ba780cc0c8d00db3f8d2", "b5b3770e6061249a3206915a3f9464e7",
+  };
+  static const char* const kDigests16x4[kNumIntraPredictors] = {
+      "9deb173fa962d9adde8a9ae256708c32", "264624b41e43cfe9378ee9b4fb5028a6",
+      "404919a41bdc7f1a1f9d089223373bb8", "5294ed9fcc16eaf5f9a1f66a2a36ae7c",
+      "a2ed1fa4262bca265dcc62eb1586f0ac", "58494af62f86464dbe471130b2bc4ab0",
+      "fe1f25f7096fc3426cc7964326cc46ad", "cf7f6c8f7257436b9934cecf3b7523e1",
+      "6325036f243abfcd7777754e6a7bdacc", "9dce11a98e18422b04dd9d7be7d420da",
+  };
+  static const char* const kDigests16x8[kNumIntraPredictors] = {
+      "92d5b7d4033dcd8cb729bf8e166e339a", "6cbd9f198828fd3422c9bfaf8c2f1c1d",
+      "2b204014b6dc477f67b36818bcdab1ca", "2ce0b9cf224d4654168c559d7c1424c2",
+      "ec70341b9dd57b379f5283820c9461c7", "3fe1e2a20e44171c90ebca5a45b83460",
+      "0305852b25351ff472a45f45ec1638fa", "565c78271fbe3b25b0eee542095be005",
+      "8bc15e98659cef6236bcb072541bb2ca", "875c87bf4daba7cb436ea2fdb5a427dd",
+  };
+  static const char* const kDigests16x16[kNumIntraPredictors] = {
+      "c9d12bce78d8846f081345906e1315f4", "0b57c8fde6dec15458b1c289245100cb",
+      "1c11978c4e6bbc77767395c63d2f70a8", "e749f26b26b46d8cb7cb13c1c777db94",
+      "40459af05e865e94ff7adcdec1685c15", "f3ae419e99a60dbde3afa24ba6588a36",
+      "fe3912418bca24cee3132de2c193d1fc", "cdc8e3ce27a12f1cbfe01d1adf2eb6bd",
+      "ce354b30ce15a6918172dea55a292b93", "e762d01726d641194982a5fb8c148eb7",
+  };
+  static const char* const kDigests16x32[kNumIntraPredictors] = {
+      "ad8f118b07e053df3887215449633a07", "e8979aa743aef82937d93d87fc9fdb85",
+      "a8afb62cbf602cfcd4b570832afe1d55", "404183cf003764a4f032f0f4810cd42c",
+      "4afcf1bc5589a13b11679571aa953b86", "202df8f5a2d7eb3816de172608115f2b",
+      "ce42bca92d6d7f9df85dbaac72e35064", "61c463c8070b78ca2bdf578044fec440",
+      "3abf6e4d779208e15e3f9a0dfc0254f9", "13df5504084105af7c66a1b013fe44e1",
+  };
+  static const char* const kDigests16x64[kNumIntraPredictors] = {
+      "3ac1f642019493dec1b737d7a3a1b4e5", "cbf69d5d157c9f3355a4757b1d6e3414",
+      "96d00ddc7537bf7f196006591b733b4e", "8cba1b70a0bde29e8ef235cedc5faa7d",
+      "35f9ee300d7fa3c97338e81a6f21dcd4", "aae335442e77c8ebc280f16ea50ba9c7",
+      "a6140fdac2278644328be094d88731db", "2df93621b6ff100f7008432d509f4161",
+      "c77bf5aee39e7ed4a3dd715f816f452a", "02109bd63557d90225c32a8f1338258e",
+  };
+  static const char* const kDigests32x8[kNumIntraPredictors] = {
+      "155688dec409ff50f2333c14a6367247", "cf935e78abafa6ff7258c5af229f55b6",
+      "b4bf83a28ba319c597151a041ff838c3", "fe97f3e6cd5fe6c5979670c11d940dda",
+      "b898c9a989e1e72461a6f47e913d5383", "bb73baa6476ce90118e83e2fd08f2299",
+      "c93be6d8ec318bd805899466821bb779", "ab366991ef842e9d417d52241f6966e6",
+      "9e7e4c96a271e9e40771eac39c21f661", "9459f2e6d1291b8b8a2fe0635ce1a33d",
+  };
+  static const char* const kDigests32x16[kNumIntraPredictors] = {
+      "48374c1241409e26d81e5106c73da420", "97c918bdba2ece52156dbc776b9b70d4",
+      "a44ce9c03f6622a3e93bfe3b928eb6f1", "2384ad95e3e7302f20857121e187aa48",
+      "47e72c6dc0087b6fd99e91cff854c269", "142dc3cbb05b82a496780f7fc3d66ccc",
+      "4a39fb768efcd4f30d6eae816e6a68c4", "d0c31f9d52d984a0335557eafe2b47fa",
+      "81b3af5c7893729b837e4d304917f7cd", "941cbcd411887dc7fa3a5c7395690d1a",
+  };
+  static const char* const kDigests32x32[kNumIntraPredictors] = {
+      "00892ee43a1bbb11347c1f44fb94b1a2", "d66397ba868e62cec99daf5ea73bebd0",
+      "65fe746e79ac1e779caae8abcc15eb6b", "8e308fe96b9845112d79c54f9d7981a0",
+      "47bc8847a7c9aed3417cd5250ba57875", "1a4008b7f0f61a3c73a2ee1d1452b414",
+      "24d25ef488bb457a5a4c4892e47a363d", "6d9d964f5317ab32a8edf57c23775238",
+      "544fc36c1a35c588359ae492cb5bc143", "ac170d94dbd944e9723de9c18bace1a3",
+  };
+  static const char* const kDigests32x64[kNumIntraPredictors] = {
+      "7d0bd7dea26226741dbca9a97f27fa74", "a8bdc852ef704dd4975c61893e8fbc3f",
+      "f29d6d03c143ddf96fef04c19f2c8333", "ad9cfc395a5c5644a21d958c7274ac14",
+      "45c27c5cca9a91b6ae8379feb0881c9f", "8a0b78df1e001b85c874d686eac4aa1b",
+      "ce9fa75fac54a3f6c0cc3f2083b938f1", "c0dca10d88762c954af18dc9e3791a39",
+      "61df229eddfccab913b8fda4bb02f9ac", "4f4df6bc8d50a5600b573f0e44d70e66",
+  };
+  static const char* const kDigests64x16[kNumIntraPredictors] = {
+      "e99d072de858094c98b01bd4a6772634", "525da4b187acd81b1ff1116b60461141",
+      "1348f249690d9eefe09d9ad7ead2c801", "a5e2f9fb685d5f4a048e9a96affd25a4",
+      "873bfa9dc24693f19721f7c8d527f7d3", "0acfc6507bd3468e9679efc127d6e4b9",
+      "57d03f8d079c7264854e22ac1157cfae", "6c2c4036f70c7d957a9399b5436c0774",
+      "42b8e4a97b7f8416c72a5148c031c0b1", "a38a2c5f79993dfae8530e9e25800893",
+  };
+  static const char* const kDigests64x32[kNumIntraPredictors] = {
+      "68bd283cfd1a125f6b2ee47cee874d36", "b4581311a0a73d95dfac7f8f44591032",
+      "5ecc7fdc52d2f575ad4f2d0e9e6b1e11", "db9d82921fd88b24fdff6f849f2f9c87",
+      "804179f05c032908a5e36077bb87c994", "fc5fd041a8ee779015394d0c066ee43c",
+      "68f5579ccadfe9a1baafb158334a3db2", "fe237e45e215ab06d79046da9ad71e84",
+      "9a8a938a6824551bf7d21b8fd1d70ea1", "eb7332f2017cd96882c76e7136aeaf53",
+  };
+  static const char* const kDigests64x64[kNumIntraPredictors] = {
+      "d9a906c0e692b22e1b4414e71a704b7e", "12ac11889ae5f55b7781454efd706a6a",
+      "3f1ef5f473a49eba743f17a3324adf9d", "a6baa0d4bfb2269a94c7a38f86a4bccf",
+      "47d4cadd56f70c11ff8f3e5d8df81161", "de997744cf24c16c5ac2a36b02b351cc",
+      "23781211ae178ddeb6c4bb97a6bd7d83", "a79d2e28340ca34b9e37daabbf030f63",
+      "0372bd3ddfc258750a6ac106b70587f4", "228ef625d9460cbf6fa253a16a730976",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize16x64:
+      return kDigests16x64;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    case kTransformSize32x64:
+      return kDigests32x64;
+    case kTransformSize64x16:
+      return kDigests64x16;
+    case kTransformSize64x32:
+      return kDigests64x32;
+    case kTransformSize64x64:
+      return kDigests64x64;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(IntraPredTest12bpp, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetIntraPredDigests12bpp(tx_size_), num_runs);
+}
+
+TEST_P(IntraPredTest12bpp, FixedInput) {
+  TestSpeed(GetIntraPredDigests12bpp(tx_size_), 1);
+}
+
+TEST_P(IntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(IntraPredTest12bpp, Random) { TestRandomValues(); }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+constexpr TransformSize kTransformSizes[] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+    kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(C, IntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, IntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, IntraPredTest12bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+  return os << ToString(tx_size);
+}
+
+}  // namespace libgav1
diff --git a/src/dsp/inverse_transform.cc b/src/dsp/inverse_transform.cc
new file mode 100644
index 0000000..0bbdffa
--- /dev/null
+++ b/src/dsp/inverse_transform.cc
@@ -0,0 +1,1833 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+#endif
+
+#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \
+    LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+#include <cinttypes>
+#endif
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+constexpr uint8_t kTransformColumnShift = 4;
+
+template <typename T>
+int32_t RangeCheckValue(T value, int8_t range) {
+#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \
+    LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+  static_assert(
+      std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value,
+      "");
+  assert(range <= 32);
+  const auto min = static_cast<int32_t>(-(uint32_t{1} << (range - 1)));
+  const auto max = static_cast<int32_t>((uint32_t{1} << (range - 1)) - 1);
+  if (min > value || value > max) {
+    LIBGAV1_DLOG(ERROR,
+                 "coeff out of bit range, value: %" PRId64 " bit range %d",
+                 static_cast<int64_t>(value), range);
+    assert(min <= value && value <= max);
+  }
+#endif  // LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+  static_cast<void>(range);
+  return static_cast<int32_t>(value);
+}
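+// As a rough illustration of the check above: with range == 8 the
+// representable interval is [-(1 << 7), (1 << 7) - 1] = [-128, 127], so a
+// value of 127 passes through unchanged while 128 would log and assert when
+// LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK is nonzero.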
+
+template <typename Residual>
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_C(Residual* const dst, int a,
+                                               int b, int angle, bool flip,
+                                               int8_t range) {
+  // Note that we multiply in 32 bits and then add/subtract the products in 64
+  // bits. The 32-bit multiplications do not overflow. Please see the comment
+  // and assert() in Cos128().
+  const int64_t x = static_cast<int64_t>(dst[a] * Cos128(angle)) -
+                    static_cast<int64_t>(dst[b] * Sin128(angle));
+  const int64_t y = static_cast<int64_t>(dst[a] * Sin128(angle)) +
+                    static_cast<int64_t>(dst[b] * Cos128(angle));
+  // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+  // values saved into the array T by this function are representable by a
+  // signed integer using |range| bits of precision.
+  dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+  dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
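+// In exact arithmetic the butterfly above computes the plane rotation
+//   x = dst[a] * cos(angle * pi / 128) - dst[b] * sin(angle * pi / 128)
+//   y = dst[a] * sin(angle * pi / 128) + dst[b] * cos(angle * pi / 128)
+// with Cos128()/Sin128() supplying the trig values scaled by 2^12, which is
+// why the results are narrowed with RightShiftWithRounding(..., 12).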
+
+template <typename Residual>
+void ButterflyRotationFirstIsZero_C(Residual* const dst, int a, int b,
+                                    int angle, bool flip, int8_t range) {
+  // Note that we multiply in 32 bits and then add/subtract the products in 64
+  // bits. The 32-bit multiplications do not overflow. Please see the comment
+  // and assert() in Cos128().
+  const auto x = static_cast<int64_t>(dst[b] * -Sin128(angle));
+  const auto y = static_cast<int64_t>(dst[b] * Cos128(angle));
+  // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+  // values saved into the array T by this function are representable by a
+  // signed integer using |range| bits of precision.
+  dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+  dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
+
+template <typename Residual>
+void ButterflyRotationSecondIsZero_C(Residual* const dst, int a, int b,
+                                     int angle, bool flip, int8_t range) {
+  // Note that we multiply in 32 bits and then add/subtract the products in 64
+  // bits. The 32-bit multiplications do not overflow. Please see the comment
+  // and assert() in Cos128().
+  const auto x = static_cast<int64_t>(dst[a] * Cos128(angle));
+  const auto y = static_cast<int64_t>(dst[a] * Sin128(angle));
+
+  // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+  // values saved into the array T by this function are representable by a
+  // signed integer using |range| bits of precision.
+  dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+  dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
+
+template <typename Residual>
+void HadamardRotation_C(Residual* const dst, int a, int b, bool flip,
+                        int8_t range) {
+  if (flip) std::swap(a, b);
+  --range;
+  // For Adst and Dct, the maximum possible value for range is 20. So min and
+  // max should always fit into int32_t.
+  const int32_t min = -(1 << range);
+  const int32_t max = (1 << range) - 1;
+  const int32_t x = dst[a] + dst[b];
+  const int32_t y = dst[a] - dst[b];
+  dst[a] = Clip3(x, min, max);
+  dst[b] = Clip3(y, min, max);
+}
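+// E.g. (illustrative): entering with range == 8, the sum and difference are
+// clamped to the 8-bit signed interval [-128, 127]; the decrement above
+// converts the bit count into the matching power-of-two bound.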
+
+template <int bitdepth, typename Residual>
+void ClampIntermediate(Residual* const dst, int size) {
+  // If Residual is int16_t (which implies bitdepth is 8), we don't need to
+  // clip residual[i][j] to 16 bits.
+  if (sizeof(Residual) > 2) {
+    const Residual intermediate_clamp_max =
+        (1 << (std::max(bitdepth + 6, 16) - 1)) - 1;
+    const Residual intermediate_clamp_min = -intermediate_clamp_max - 1;
+    for (int j = 0; j < size; ++j) {
+      dst[j] = Clip3(dst[j], intermediate_clamp_min, intermediate_clamp_max);
+    }
+  }
+}
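+// For example: with bitdepth 10 and a 32-bit Residual the clamp interval is
+// [-(1 << 15), (1 << 15) - 1] = [-32768, 32767]; with bitdepth 12 it widens
+// to [-131072, 131071].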
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+// The value at index (i, j) is computed by bit-reversing j and interpreting
+// the result as an integer with bit-length i + 2.
+// For example, index (2, 3) is computed as follows:
+//   * bitreverse(3) = bitreverse(..000011) = 110000...
+//   * interpreting that as an integer with bit-length 2+2 = 4 gives 1100 = 12
+constexpr uint8_t kBitReverseLookup[kNumTransform1dSizes][64] = {
+    {0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2,
+     1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3,
+     0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3},
+    {0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5,
+     3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6,
+     1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7},
+    {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+     0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+     0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+     0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15},
+    {0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+     1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+     0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+     1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31},
+    {0, 32, 16, 48, 8,  40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+     2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+     1, 33, 17, 49, 9,  41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+     3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63}};
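+
+// A helper sketch showing how the table above can be derived (illustrative
+// only; nothing in this file calls it): kBitReverseLookup[i][j] equals
+// BitReverse(j % (1 << (i + 2)), i + 2), e.g. BitReverse(3, 4) turns 0011
+// into 1100 = 12 == kBitReverseLookup[2][3].
+inline int BitReverse(int value, int bit_length) {
+  int result = 0;
+  for (int i = 0; i < bit_length; ++i) {
+    result = (result << 1) | ((value >> i) & 1);
+  }
+  return result;
+}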
+
+template <typename Residual, int size_log2>
+void Dct_C(void* dest, int8_t range) {
+  static_assert(size_log2 >= 2 && size_log2 <= 6, "");
+  auto* const dst = static_cast<Residual*>(dest);
+  // stage 1.
+  const int size = 1 << size_log2;
+  Residual temp[size];
+  memcpy(temp, dst, sizeof(temp));
+  for (int i = 0; i < size; ++i) {
+    dst[i] = temp[kBitReverseLookup[size_log2 - 2][i]];
+  }
+  // stages 2-31 depend on the value of size_log2.
+  // stage 2.
+  if (size_log2 == 6) {
+    for (int i = 0; i < 16; ++i) {
+      ButterflyRotation_C(dst, i + 32, 63 - i,
+                          63 - MultiplyBy4(kBitReverseLookup[2][i]), false,
+                          range);
+    }
+  }
+  // stage 3.
+  if (size_log2 >= 5) {
+    for (int i = 0; i < 8; ++i) {
+      ButterflyRotation_C(dst, i + 16, 31 - i,
+                          6 + MultiplyBy8(kBitReverseLookup[1][7 - i]), false,
+                          range);
+    }
+  }
+  // stage 4.
+  if (size_log2 == 6) {
+    for (int i = 0; i < 16; ++i) {
+      HadamardRotation_C(dst, MultiplyBy2(i) + 32, MultiplyBy2(i) + 33,
+                         static_cast<bool>(i & 1), range);
+    }
+  }
+  // stage 5.
+  if (size_log2 >= 4) {
+    for (int i = 0; i < 4; ++i) {
+      ButterflyRotation_C(dst, i + 8, 15 - i,
+                          12 + MultiplyBy16(kBitReverseLookup[0][3 - i]), false,
+                          range);
+    }
+  }
+  // stage 6.
+  if (size_log2 >= 5) {
+    for (int i = 0; i < 8; ++i) {
+      HadamardRotation_C(dst, MultiplyBy2(i) + 16, MultiplyBy2(i) + 17,
+                         static_cast<bool>(i & 1), range);
+    }
+  }
+  // stage 7.
+  if (size_log2 == 6) {
+    for (int i = 0; i < 4; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        ButterflyRotation_C(
+            dst, 62 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 33,
+            60 - MultiplyBy16(kBitReverseLookup[0][i]) + MultiplyBy64(j), true,
+            range);
+      }
+    }
+  }
+  // stage 8.
+  if (size_log2 >= 3) {
+    for (int i = 0; i < 2; ++i) {
+      ButterflyRotation_C(dst, i + 4, 7 - i, 56 - 32 * i, false, range);
+    }
+  }
+  // stage 9.
+  if (size_log2 >= 4) {
+    for (int i = 0; i < 4; ++i) {
+      HadamardRotation_C(dst, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9,
+                         static_cast<bool>(i & 1), range);
+    }
+  }
+  // stage 10.
+  if (size_log2 >= 5) {
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        ButterflyRotation_C(
+            dst, 30 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 17,
+            24 + MultiplyBy64(j) + MultiplyBy32(1 - i), true, range);
+      }
+    }
+  }
+  // stage 11.
+  if (size_log2 == 6) {
+    for (int i = 0; i < 8; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        HadamardRotation_C(dst, MultiplyBy4(i) + j + 32,
+                           MultiplyBy4(i) - j + 35, static_cast<bool>(i & 1),
+                           range);
+      }
+    }
+  }
+  // stage 12.
+  for (int i = 0; i < 2; ++i) {
+    ButterflyRotation_C(dst, MultiplyBy2(i), MultiplyBy2(i) + 1, 32 + 16 * i,
+                        i == 0, range);
+  }
+  // stage 13.
+  if (size_log2 >= 3) {
+    for (int i = 0; i < 2; ++i) {
+      HadamardRotation_C(dst, MultiplyBy2(i) + 4, MultiplyBy2(i) + 5,
+                         /*flip=*/i != 0, range);
+    }
+  }
+  // stage 14.
+  if (size_log2 >= 4) {
+    for (int i = 0; i < 2; ++i) {
+      ButterflyRotation_C(dst, 14 - i, i + 9, 48 + 64 * i, true, range);
+    }
+  }
+  // stage 15.
+  if (size_log2 >= 5) {
+    for (int i = 0; i < 4; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        HadamardRotation_C(dst, MultiplyBy4(i) + j + 16,
+                           MultiplyBy4(i) - j + 19, static_cast<bool>(i & 1),
+                           range);
+      }
+    }
+  }
+  // stage 16.
+  if (size_log2 == 6) {
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 4; ++j) {
+        ButterflyRotation_C(
+            dst, 61 - MultiplyBy8(i) - j, MultiplyBy8(i) + j + 34,
+            56 - MultiplyBy32(i) + MultiplyBy64(DivideBy2(j)), true, range);
+      }
+    }
+  }
+  // stage 17.
+  for (int i = 0; i < 2; ++i) {
+    HadamardRotation_C(dst, i, 3 - i, false, range);
+  }
+  // stage 18.
+  if (size_log2 >= 3) {
+    ButterflyRotation_C(dst, 6, 5, 32, true, range);
+  }
+  // stage 19.
+  if (size_log2 >= 4) {
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        HadamardRotation_C(dst, MultiplyBy4(i) + j + 8, MultiplyBy4(i) - j + 11,
+                           /*flip=*/i != 0, range);
+      }
+    }
+  }
+  // stage 20.
+  if (size_log2 >= 5) {
+    for (int i = 0; i < 4; ++i) {
+      ButterflyRotation_C(dst, 29 - i, i + 18, 48 + 64 * DivideBy2(i), true,
+                          range);
+    }
+  }
+  // stage 21.
+  if (size_log2 == 6) {
+    for (int i = 0; i < 4; ++i) {
+      for (int j = 0; j < 4; ++j) {
+        HadamardRotation_C(dst, MultiplyBy8(i) + j + 32,
+                           MultiplyBy8(i) - j + 39, static_cast<bool>(i & 1),
+                           range);
+      }
+    }
+  }
+  // stage 22.
+  if (size_log2 >= 3) {
+    for (int i = 0; i < 4; ++i) {
+      HadamardRotation_C(dst, i, 7 - i, false, range);
+    }
+  }
+  // stage 23.
+  if (size_log2 >= 4) {
+    for (int i = 0; i < 2; ++i) {
+      ButterflyRotation_C(dst, 13 - i, i + 10, 32, true, range);
+    }
+  }
+  // stage 24.
+  if (size_log2 >= 5) {
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 4; ++j) {
+        HadamardRotation_C(dst, MultiplyBy8(i) + j + 16,
+                           MultiplyBy8(i) - j + 23, i == 1, range);
+      }
+    }
+  }
+  // stage 25.
+  if (size_log2 == 6) {
+    for (int i = 0; i < 8; ++i) {
+      ButterflyRotation_C(dst, 59 - i, i + 36, (i < 4) ? 48 : 112, true, range);
+    }
+  }
+  // stage 26.
+  if (size_log2 >= 4) {
+    for (int i = 0; i < 8; ++i) {
+      HadamardRotation_C(dst, i, 15 - i, false, range);
+    }
+  }
+  // stage 27.
+  if (size_log2 >= 5) {
+    for (int i = 0; i < 4; ++i) {
+      ButterflyRotation_C(dst, 27 - i, i + 20, 32, true, range);
+    }
+  }
+  // stage 28.
+  if (size_log2 == 6) {
+    for (int i = 0; i < 8; ++i) {
+      HadamardRotation_C(dst, i + 32, 47 - i, false, range);
+      HadamardRotation_C(dst, i + 48, 63 - i, true, range);
+    }
+  }
+  // stage 29.
+  if (size_log2 >= 5) {
+    for (int i = 0; i < 16; ++i) {
+      HadamardRotation_C(dst, i, 31 - i, false, range);
+    }
+  }
+  // stage 30.
+  if (size_log2 == 6) {
+    for (int i = 0; i < 8; ++i) {
+      ButterflyRotation_C(dst, 55 - i, i + 40, 32, true, range);
+    }
+  }
+  // stage 31.
+  if (size_log2 == 6) {
+    for (int i = 0; i < 32; ++i) {
+      HadamardRotation_C(dst, i, 63 - i, false, range);
+    }
+  }
+}
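+// Usage sketch (an assumption from the template signature above, not a call
+// that appears in this file): an in-place 8-point inverse DCT over a row of
+// 16-bit residuals would look like
+//   int16_t row[8] = {64, 0, 0, 0, 0, 0, 0, 0};
+//   Dct_C<int16_t, 3>(row, /*range=*/16);
+// leaving every output near 64 * cos(pi/4) since only the DC term is set.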
+
+template <int bitdepth, typename Residual, int size_log2>
+void DctDcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                 bool is_row) {
+  auto* const dst = static_cast<Residual*>(dest);
+
+  if (is_row && should_round) {
+    dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+  }
+
+  ButterflyRotationSecondIsZero_C(dst, 0, 1, 32, true, range);
+
+  if (is_row && row_shift > 0) {
+    dst[0] = RightShiftWithRounding(dst[0], row_shift);
+  }
+
+  ClampIntermediate<bitdepth, Residual>(dst, 1);
+
+  const int size = 1 << size_log2;
+  for (int i = 1; i < size; ++i) {
+    dst[i] = dst[0];
+  }
+}
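+// With angle 32, Cos128() and Sin128() both return cos(pi/4) in Q12 (2896),
+// so the single rotation above scales the DC coefficient by roughly
+// 1/sqrt(2) before broadcasting it, a collapsed form of what the full Dct_C
+// produces when only dst[0] is nonzero.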
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
+/*
+ * Row transform max range in bits for bitdepths 8/10/12: 28/30/32.
+ * Column transform max range in bits for bitdepths 8/10/12: 28/28/30.
+ */
+template <typename Residual>
+void Adst4_C(void* dest, int8_t range) {
+  auto* const dst = static_cast<Residual*>(dest);
+  if ((dst[0] | dst[1] | dst[2] | dst[3]) == 0) {
+    return;
+  }
+
+  // stage 1.
+  // Section 7.13.2.6: It is a requirement of bitstream conformance that all
+  // values stored in the s and x arrays by this process are representable by
+  // a signed integer using range + 12 bits of precision.
+  // Note the intermediate value can only exceed INT32_MAX with invalid 12-bit
+  // content. For simplicity in unoptimized code, int64_t is used for both 10-
+  // and 12-bit. SIMD implementations can allow these to roll over on
+  // platforms where this has defined behavior.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+  Intermediate s[7];
+  s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+  s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+  s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12);
+  s[3] = RangeCheckValue(kAdst4Multiplier[3] * dst[2], range + 12);
+  s[4] = RangeCheckValue(kAdst4Multiplier[0] * dst[2], range + 12);
+  s[5] = RangeCheckValue(kAdst4Multiplier[1] * dst[3], range + 12);
+  s[6] = RangeCheckValue(kAdst4Multiplier[3] * dst[3], range + 12);
+  // stage 2.
+  // Section 7.13.2.6: It is a requirement of bitstream conformance that
+  // values stored in the variable a7 by this process are representable by a
+  // signed integer using range + 1 bits of precision.
+  const int32_t a7 = RangeCheckValue(dst[0] - dst[2], range + 1);
+  // Section 7.13.2.6: It is a requirement of bitstream conformance that
+  // values stored in the variable b7 by this process are representable by a
+  // signed integer using |range| bits of precision.
+  const int32_t b7 = RangeCheckValue(a7 + dst[3], range);
+  // stage 3.
+  s[0] = RangeCheckValue(s[0] + s[3], range + 12);
+  s[1] = RangeCheckValue(s[1] - s[4], range + 12);
+  s[3] = s[2];
+  // With range checking enabled, b7 would have been trapped above; this
+  // prevents an integer sanitizer warning. In SIMD implementations the
+  // multiply can be allowed to roll over on platforms where this has defined
+  // behavior.
+  const auto adst2_b7 = static_cast<Intermediate>(kAdst4Multiplier[2]) * b7;
+  s[2] = RangeCheckValue(adst2_b7, range + 12);
+  // stage 4.
+  s[0] = RangeCheckValue(s[0] + s[5], range + 12);
+  s[1] = RangeCheckValue(s[1] - s[6], range + 12);
+  // stages 5 and 6.
+  const Intermediate x0 = RangeCheckValue(s[0] + s[3], range + 12);
+  const Intermediate x1 = RangeCheckValue(s[1] + s[3], range + 12);
+  Intermediate x3 = RangeCheckValue(s[0] + s[1], range + 12);
+  x3 = RangeCheckValue(x3 - s[3], range + 12);
+  auto dst_0 = static_cast<int32_t>(RightShiftWithRounding(x0, 12));
+  auto dst_1 = static_cast<int32_t>(RightShiftWithRounding(x1, 12));
+  auto dst_2 = static_cast<int32_t>(RightShiftWithRounding(s[2], 12));
+  auto dst_3 = static_cast<int32_t>(RightShiftWithRounding(x3, 12));
+  if (sizeof(Residual) == 2) {
+    // If the first argument to RightShiftWithRounding(..., 12) is only
+    // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
+    // in RightShiftWithRounding(..., 12) will cause the function to return
+    // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff.
+    dst_0 -= (dst_0 == 0x8000);
+    dst_1 -= (dst_1 == 0x8000);
+    dst_3 -= (dst_3 == 0x8000);
+  }
+  dst[0] = dst_0;
+  dst[1] = dst_1;
+  dst[2] = dst_2;
+  dst[3] = dst_3;
+}
+
+template <int bitdepth, typename Residual>
+void Adst4DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                   bool is_row) {
+  auto* const dst = static_cast<Residual*>(dest);
+
+  if (is_row && should_round) {
+    dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+  }
+
+  // stage 1.
+  // Section 7.13.2.6: It is a requirement of bitstream conformance that all
+  // values stored in the s and x arrays by this process are representable by
+  // a signed integer using range + 12 bits of precision.
+  int32_t s[3];
+  s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+  s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+  s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[0], range + 12);
+  // stage 3.
+  // stage 4.
+  // stages 5 and 6.
+  int32_t dst_0 = RightShiftWithRounding(s[0], 12);
+  int32_t dst_1 = RightShiftWithRounding(s[1], 12);
+  int32_t dst_2 = RightShiftWithRounding(s[2], 12);
+  int32_t dst_3 =
+      RightShiftWithRounding(RangeCheckValue(s[0] + s[1], range + 12), 12);
+  if (sizeof(Residual) == 2) {
+    // If the first argument to RightShiftWithRounding(..., 12) is only
+    // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
+    // in RightShiftWithRounding(..., 12) will cause the function to return
+    // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff.
+    dst_0 -= (dst_0 == 0x8000);
+    dst_1 -= (dst_1 == 0x8000);
+    dst_3 -= (dst_3 == 0x8000);
+  }
+  dst[0] = dst_0;
+  dst[1] = dst_1;
+  dst[2] = dst_2;
+  dst[3] = dst_3;
+
+  const int size = 4;
+  if (is_row && row_shift > 0) {
+    for (int j = 0; j < size; ++j) {
+      dst[j] = RightShiftWithRounding(dst[j], row_shift);
+    }
+  }
+
+  ClampIntermediate<bitdepth, Residual>(dst, 4);
+}
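+// In the DC-only path above, dst[0..2] reduce to
+// Round2(dc * kAdst4Multiplier[0/1/2], 12) and dst[3] to the rounded sum of
+// the first two products, collapsing the full Adst4_C data flow.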
+
+template <typename Residual>
+void AdstInputPermutation(int32_t* LIBGAV1_RESTRICT const dst,
+                          const Residual* LIBGAV1_RESTRICT const src, int n) {
+  assert(n == 8 || n == 16);
+  for (int i = 0; i < n; ++i) {
+    dst[i] = src[((i & 1) == 0) ? n - i - 1 : i - 1];
+  }
+}
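+// For n == 8 the permutation above expands to (illustrative):
+//   dst = { src[7], src[0], src[5], src[2], src[3], src[4], src[1], src[6] }
+// i.e. even outputs read src[n - i - 1] and odd outputs read src[i - 1].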
+
+constexpr int8_t kAdstOutputPermutationLookup[16] = {
+    0, 8, 12, 4, 6, 14, 10, 2, 3, 11, 15, 7, 5, 13, 9, 1};
+
+template <typename Residual>
+void AdstOutputPermutation(Residual* LIBGAV1_RESTRICT const dst,
+                           const int32_t* LIBGAV1_RESTRICT const src, int n) {
+  assert(n == 8 || n == 16);
+  const auto shift = static_cast<int8_t>(n == 8);
+  for (int i = 0; i < n; ++i) {
+    const int8_t index = kAdstOutputPermutationLookup[i] >> shift;
+    int32_t dst_i = ((i & 1) == 0) ? src[index] : -src[index];
+    if (sizeof(Residual) == 2) {
+      // If i is odd and src[index] is -32768, dst_i will be 32768, which
+      // cannot be represented as an int16_t.
+      dst_i -= (dst_i == 0x8000);
+    }
+    dst[i] = dst_i;
+  }
+}
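+// For n == 8 each of the first eight lookup entries is halved by |shift|,
+// giving the output source order {0, 4, 6, 2, 3, 7, 5, 1}, with the sign
+// negated at odd positions (illustrative expansion of the loop above).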
+
+template <typename Residual>
+void Adst8_C(void* dest, int8_t range) {
+  auto* const dst = static_cast<Residual*>(dest);
+  // stage 1.
+  int32_t temp[8];
+  AdstInputPermutation(temp, dst, 8);
+  // stage 2.
+  for (int i = 0; i < 4; ++i) {
+    ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 60 - 16 * i,
+                        true, range);
+  }
+  // stage 3.
+  for (int i = 0; i < 4; ++i) {
+    HadamardRotation_C(temp, i, i + 4, false, range);
+  }
+  // stage 4.
+  for (int i = 0; i < 2; ++i) {
+    ButterflyRotation_C(temp, i * 3 + 4, i + 5, 48 - 32 * i, true, range);
+  }
+  // stage 5.
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2,
+                         false, range);
+    }
+  }
+  // stage 6.
+  for (int i = 0; i < 2; ++i) {
+    ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+                        range);
+  }
+  // stage 7.
+  AdstOutputPermutation(dst, temp, 8);
+}
+
+template <int bitdepth, typename Residual>
+void Adst8DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                   bool is_row) {
+  auto* const dst = static_cast<Residual*>(dest);
+
+  // stage 1.
+  int32_t temp[8];
+  // After the permutation, the dc value is in temp[1]; the remaining entries
+  // are zero.
+  AdstInputPermutation(temp, dst, 8);
+
+  if (is_row && should_round) {
+    temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
+  }
+
+  // stage 2.
+  ButterflyRotationFirstIsZero_C(temp, 0, 1, 60, true, range);
+
+  // stage 3.
+  temp[4] = temp[0];
+  temp[5] = temp[1];
+
+  // stage 4.
+  ButterflyRotation_C(temp, 4, 5, 48, true, range);
+
+  // stage 5.
+  temp[2] = temp[0];
+  temp[3] = temp[1];
+  temp[6] = temp[4];
+  temp[7] = temp[5];
+
+  // stage 6.
+  ButterflyRotation_C(temp, 2, 3, 32, true, range);
+  ButterflyRotation_C(temp, 6, 7, 32, true, range);
+
+  // stage 7.
+  AdstOutputPermutation(dst, temp, 8);
+
+  const int size = 8;
+  if (is_row && row_shift > 0) {
+    for (int j = 0; j < size; ++j) {
+      dst[j] = RightShiftWithRounding(dst[j], row_shift);
+    }
+  }
+
+  ClampIntermediate<bitdepth, Residual>(dst, 8);
+}
+
+template <typename Residual>
+void Adst16_C(void* dest, int8_t range) {
+  auto* const dst = static_cast<Residual*>(dest);
+  // stage 1.
+  int32_t temp[16];
+  AdstInputPermutation(temp, dst, 16);
+  // stage 2.
+  for (int i = 0; i < 8; ++i) {
+    ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 62 - 8 * i,
+                        true, range);
+  }
+  // stage 3.
+  for (int i = 0; i < 8; ++i) {
+    HadamardRotation_C(temp, i, i + 8, false, range);
+  }
+  // stage 4.
+  for (int i = 0; i < 2; ++i) {
+    ButterflyRotation_C(temp, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9,
+                        56 - 32 * i, true, range);
+    ButterflyRotation_C(temp, MultiplyBy2(i) + 13, MultiplyBy2(i) + 12,
+                        8 + 32 * i, true, range);
+  }
+  // stage 5.
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      HadamardRotation_C(temp, i + MultiplyBy8(j), i + MultiplyBy8(j) + 4,
+                         false, range);
+    }
+  }
+  // stage 6.
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      ButterflyRotation_C(temp, i * 3 + MultiplyBy8(j) + 4,
+                          i + MultiplyBy8(j) + 5, 48 - 32 * i, true, range);
+    }
+  }
+  // stage 7.
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2,
+                         false, range);
+    }
+  }
+  // stage 8.
+  for (int i = 0; i < 4; ++i) {
+    ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+                        range);
+  }
+  // stage 9.
+  AdstOutputPermutation(dst, temp, 16);
+}
+
+template <int bitdepth, typename Residual>
+void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                    bool is_row) {
+  auto* const dst = static_cast<Residual*>(dest);
+
+  // stage 1.
+  int32_t temp[16];
+  // After the permutation, the dc value is in temp[1]; the remaining entries
+  // are zero.
+  AdstInputPermutation(temp, dst, 16);
+
+  if (is_row && should_round) {
+    temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
+  }
+
+  // stage 2.
+  ButterflyRotationFirstIsZero_C(temp, 0, 1, 62, true, range);
+
+  // stage 3.
+  temp[8] = temp[0];
+  temp[9] = temp[1];
+
+  // stage 4.
+  ButterflyRotation_C(temp, 8, 9, 56, true, range);
+
+  // stage 5.
+  temp[4] = temp[0];
+  temp[5] = temp[1];
+  temp[12] = temp[8];
+  temp[13] = temp[9];
+
+  // stage 6.
+  ButterflyRotation_C(temp, 4, 5, 48, true, range);
+  ButterflyRotation_C(temp, 12, 13, 48, true, range);
+
+  // stage 7.
+  temp[2] = temp[0];
+  temp[3] = temp[1];
+  temp[10] = temp[8];
+  temp[11] = temp[9];
+
+  temp[6] = temp[4];
+  temp[7] = temp[5];
+  temp[14] = temp[12];
+  temp[15] = temp[13];
+
+  // stage 8.
+  for (int i = 0; i < 4; ++i) {
+    ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+                        range);
+  }
+
+  // stage 9.
+  AdstOutputPermutation(dst, temp, 16);
+
+  const int size = 16;
+  if (is_row && row_shift > 0) {
+    for (int j = 0; j < size; ++j) {
+      dst[j] = RightShiftWithRounding(dst[j], row_shift);
+    }
+  }
+
+  ClampIntermediate<bitdepth, Residual>(dst, 16);
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+//
+// In the spec, the inverse identity transform is followed by a Round2() call:
+//   The row transforms with i = 0..(h-1) are applied as follows:
+//     ...
+//     * Otherwise, invoke the inverse identity transform process specified in
+//       section 7.13.2.15 with the input variable n equal to log2W.
+//     * Set Residual[ i ][ j ] equal to Round2( T[ j ], rowShift )
+//       for j = 0..(w-1).
+//   ...
+//   The column transforms with j = 0..(w-1) are applied as follows:
+//     ...
+//     * Otherwise, invoke the inverse identity transform process specified in
+//       section 7.13.2.15 with the input variable n equal to log2H.
+//     * Residual[ i ][ j ] is set equal to Round2( T[ i ], colShift )
+//       for i = 0..(h-1).
+//
+// Therefore, we define the identity transform functions to perform both the
+// inverse identity transform and the Round2() call. This has two advantages:
+// 1. The outputs of the inverse identity transform do not need to be stored
+//    in the Residual array. They can be stored in int32_t local variables,
+//    which have a larger range if Residual is an int16_t array.
+// 2. The inverse identity transform and the Round2() call can be jointly
+//    optimized.
+//
+// The identity transform functions have the following prototype:
+//   void Identity_C(void* dest, int8_t shift);
+//
+// The |shift| parameter is the amount of shift for the Round2() call. For row
+// transforms, |shift| is 0, 1, or 2. For column transforms, |shift| is always
+// 4. Therefore, an identity transform function can detect whether it is being
+// invoked as a row transform or a column transform by checking whether |shift|
+// is equal to 4.
+//
+// Input Range
+//
+// The inputs of row transforms, stored in the 2D array Dequant, are
+// representable by a signed integer using 8 + BitDepth bits of precision:
+//   f. Dequant[ i ][ j ] is set equal to
+//   Clip3( - ( 1 << ( 7 + BitDepth ) ), ( 1 << ( 7 + BitDepth ) ) - 1, dq2 ).
+//
+// The inputs of column transforms are representable by a signed integer using
+// Max( BitDepth + 6, 16 ) bits of precision:
+//   Set the variable colClampRange equal to Max( BitDepth + 6, 16 ).
+//   ...
+//   Between the row and column transforms, Residual[ i ][ j ] is set equal to
+//   Clip3( - ( 1 << ( colClampRange - 1 ) ),
+//          ( 1 << (colClampRange - 1 ) ) - 1,
+//          Residual[ i ][ j ] )
+//   for i = 0..(h-1), for j = 0..(w-1).
+//
+// Output Range
+//
+// The outputs of row transforms are representable by a signed integer using
+// 8 + BitDepth + 1 = 9 + BitDepth bits of precision, because the net effect
+// of the multiplicative factor of inverse identity transforms minus the
+// smallest row shift is an increase of at most one bit.
+//
+// Transform | Multiplicative factor | Smallest row | Net increase
+// width     | (in bits)             | shift        | in bits
+// ---------------------------------------------------------------
+//     4     |  sqrt(2)  (0.5 bits)  |      0       |    +0.5
+//     8     |     2     (1 bit)     |      0       |    +1
+//    16     | 2*sqrt(2) (1.5 bits)  |      1       |    +0.5
+//    32     |     4     (2 bits)    |      1       |    +1
+//
+// If BitDepth is 8 and Residual is an int16_t array, to avoid truncation we
+// clip the outputs (which have 17 bits of precision) to the range of int16_t
+// before storing them in the Residual array. This clipping happens to be the
+// same as the required clipping after the row transform (see the spec quoted
+// above), so we remain compliant with the spec. (In this case,
+// TransformLoop_C() skips clipping the outputs of row transforms to avoid
+// duplication of effort.)
+//
+// The outputs of column transforms are representable by a signed integer using
+// Max( BitDepth + 6, 16 ) + 2 - 4 = Max( BitDepth + 4, 14 ) bits of precision,
+// because the multiplicative factor of inverse identity transforms is at most
+// 4 (2 bits) and |shift| is always 4.
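+//
+// As a worked example (with an arbitrary illustrative input of 100): for a
+// 4-point row identity transform with |shift| = 1, the spec computes
+// Round2(100 * 5793, 12) = 141 and then Round2(141, 1) = 71. The joint form
+// used below computes
+// (100 * 5793 + ((1 + (1 << 1)) << 11)) >> (12 + 1) = 585444 >> 13 = 71,
+// the same result with a single shift.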
+
+template <typename Residual>
+void Identity4Row_C(void* dest, int8_t shift) {
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+  assert(shift == 0 || shift == 1);
+  auto* const dst = static_cast<Residual*>(dest);
+  // If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding|
+  // should be (1 + (1 << 1)) << 11. The following expression works for both
+  // values of |shift|; it folds the spec's two Round2() calls (by 12, then by
+  // |shift|) into the single shift by 12 + |shift| below.
+  const int32_t rounding = (1 + (shift << 1)) << 11;
+  for (int i = 0; i < 4; ++i) {
+    const auto intermediate =
+        static_cast<Intermediate>(dst[i]) * kIdentity4Multiplier;
+    int32_t dst_i =
+        static_cast<int32_t>((intermediate + rounding) >> (12 + shift));
+    if (sizeof(Residual) == 2) {
+      dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+    }
+    dst[i] = static_cast<Residual>(dst_i);
+  }
+}
+
+template <typename Residual>
+void Identity4Column_C(void* dest, int8_t /*shift*/) {
+  auto* const dst = static_cast<Residual*>(dest);
+  const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+  for (int i = 0; i < 4; ++i) {
+    // The intermediate value here will have to fit into an int32_t for it to be
+    // bitstream conformant. The multiplication is promoted to int32_t by
+    // defining kIdentity4Multiplier as int32_t.
+    dst[i] = static_cast<Residual>((dst[i] * kIdentity4Multiplier + rounding) >>
+                                   (12 + kTransformColumnShift));
+  }
+}
+
+template <int bitdepth, typename Residual>
+void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+                       int row_shift, bool is_row) {
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+  auto* const dst = static_cast<Residual*>(dest);
+
+  if (is_row) {
+    if (should_round) {
+      const auto intermediate =
+          static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+      dst[0] = RightShiftWithRounding(intermediate, 12);
+    }
+
+    const int32_t rounding = (1 + (row_shift << 1)) << 11;
+    const auto intermediate =
+        static_cast<Intermediate>(dst[0]) * kIdentity4Multiplier;
+    int32_t dst_i =
+        static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift));
+    if (sizeof(Residual) == 2) {
+      dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+    }
+    dst[0] = static_cast<Residual>(dst_i);
+
+    ClampIntermediate<bitdepth, Residual>(dst, 1);
+    return;
+  }
+
+  const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+  dst[0] = static_cast<Residual>((dst[0] * kIdentity4Multiplier + rounding) >>
+                                 (12 + kTransformColumnShift));
+}
+
+template <typename Residual>
+void Identity8Row_C(void* dest, int8_t shift) {
+  assert(shift == 0 || shift == 1 || shift == 2);
+  auto* const dst = static_cast<Residual*>(dest);
+  for (int i = 0; i < 8; ++i) {
+    int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[i]), shift);
+    if (sizeof(Residual) == 2) {
+      dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+    }
+    dst[i] = static_cast<Residual>(dst_i);
+  }
+}
+
+template <typename Residual>
+void Identity8Column_C(void* dest, int8_t /*shift*/) {
+  auto* const dst = static_cast<Residual*>(dest);
+  for (int i = 0; i < 8; ++i) {
+    dst[i] = static_cast<Residual>(
+        RightShiftWithRounding(dst[i], kTransformColumnShift - 1));
+  }
+}
+
+template <int bitdepth, typename Residual>
+void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+                       int row_shift, bool is_row) {
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+  auto* const dst = static_cast<Residual*>(dest);
+
+  if (is_row) {
+    if (should_round) {
+      const auto intermediate =
+          static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+      dst[0] = RightShiftWithRounding(intermediate, 12);
+    }
+
+    int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[0]), row_shift);
+    if (sizeof(Residual) == 2) {
+      dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+    }
+    dst[0] = static_cast<Residual>(dst_i);
+
+    // If Residual is int16_t (which implies bitdepth is 8), we don't need to
+    // clip residual[i][j] to 16 bits.
+    if (sizeof(Residual) > 2) {
+      const Residual intermediate_clamp_max =
+          (1 << (std::max(bitdepth + 6, 16) - 1)) - 1;
+      const Residual intermediate_clamp_min = -intermediate_clamp_max - 1;
+      dst[0] = Clip3(dst[0], intermediate_clamp_min, intermediate_clamp_max);
+    }
+    return;
+  }
+
+  dst[0] = static_cast<Residual>(
+      RightShiftWithRounding(dst[0], kTransformColumnShift - 1));
+}
+
+template <typename Residual>
+void Identity16Row_C(void* dest, int8_t shift) {
+  assert(shift == 1 || shift == 2);
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+  auto* const dst = static_cast<Residual*>(dest);
+  const int32_t rounding = (1 + (1 << shift)) << 11;
+  for (int i = 0; i < 16; ++i) {
+    const auto intermediate =
+        static_cast<Intermediate>(dst[i]) * kIdentity16Multiplier;
+    int32_t dst_i =
+        static_cast<int32_t>((intermediate + rounding) >> (12 + shift));
+    if (sizeof(Residual) == 2) {
+      dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+    }
+    dst[i] = static_cast<Residual>(dst_i);
+  }
+}
+
+template <typename Residual>
+void Identity16Column_C(void* dest, int8_t /*shift*/) {
+  auto* const dst = static_cast<Residual*>(dest);
+  const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+  for (int i = 0; i < 16; ++i) {
+    // The intermediate value here will have to fit into an int32_t for it to be
+    // bitstream conformant. The multiplication is promoted to int32_t by
+    // defining kIdentity16Multiplier as int32_t.
+    dst[i] =
+        static_cast<Residual>((dst[i] * kIdentity16Multiplier + rounding) >>
+                              (12 + kTransformColumnShift));
+  }
+}
+
+template <int bitdepth, typename Residual>
+void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+                        int row_shift, bool is_row) {
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+  auto* const dst = static_cast<Residual*>(dest);
+
+  if (is_row) {
+    if (should_round) {
+      const auto intermediate =
+          static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+      dst[0] = RightShiftWithRounding(intermediate, 12);
+    }
+
+    const int32_t rounding = (1 + (1 << row_shift)) << 11;
+    const auto intermediate =
+        static_cast<Intermediate>(dst[0]) * kIdentity16Multiplier;
+    int32_t dst_i =
+        static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift));
+    if (sizeof(Residual) == 2) {
+      dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+    }
+    dst[0] = static_cast<Residual>(dst_i);
+
+    ClampIntermediate<bitdepth, Residual>(dst, 1);
+    return;
+  }
+
+  const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+  dst[0] = static_cast<Residual>((dst[0] * kIdentity16Multiplier + rounding) >>
+                                 (12 + kTransformColumnShift));
+}
+
+template <typename Residual>
+void Identity32Row_C(void* dest, int8_t shift) {
+  assert(shift == 1 || shift == 2);
+  auto* const dst = static_cast<Residual*>(dest);
+  for (int i = 0; i < 32; ++i) {
+    int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[i]), shift);
+    if (sizeof(Residual) == 2) {
+      dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+    }
+    dst[i] = static_cast<Residual>(dst_i);
+  }
+}
+
+template <typename Residual>
+void Identity32Column_C(void* dest, int8_t /*shift*/) {
+  auto* const dst = static_cast<Residual*>(dest);
+  for (int i = 0; i < 32; ++i) {
+    dst[i] = static_cast<Residual>(
+        RightShiftWithRounding(dst[i], kTransformColumnShift - 2));
+  }
+}
+
+template <int bitdepth, typename Residual>
+void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+                        int row_shift, bool is_row) {
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+  auto* const dst = static_cast<Residual*>(dest);
+
+  if (is_row) {
+    if (should_round) {
+      const auto intermediate =
+          static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+      dst[0] = RightShiftWithRounding(intermediate, 12);
+    }
+
+    int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[0]), row_shift);
+    if (sizeof(Residual) == 2) {
+      dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+    }
+    dst[0] = static_cast<Residual>(dst_i);
+
+    ClampIntermediate<bitdepth, Residual>(dst, 1);
+    return;
+  }
+
+  dst[0] = static_cast<Residual>(
+      RightShiftWithRounding(dst[0], kTransformColumnShift - 2));
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
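+//
+// The WHT is the 4x4 transform used for lossless blocks. TransformLoop_C()
+// passes |shift| = 2 for the row pass and |shift| = 0 for the column pass
+// (via row_clamp_range and column_clamp_range), so the inputs are only
+// downshifted in the row pass.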
+
+template <typename Residual>
+void Wht4_C(void* dest, int8_t shift) {
+  auto* const dst = static_cast<Residual*>(dest);
+  Residual temp[4];
+  temp[0] = dst[0] >> shift;
+  temp[2] = dst[1] >> shift;
+  temp[3] = dst[2] >> shift;
+  temp[1] = dst[3] >> shift;
+  temp[0] += temp[2];
+  temp[3] -= temp[1];
+  // This signed right shift must be an arithmetic shift.
+  Residual e = (temp[0] - temp[3]) >> 1;
+  dst[1] = e - temp[1];
+  dst[2] = e - temp[2];
+  dst[0] = temp[0] - dst[1];
+  dst[3] = temp[3] + dst[2];
+}
+
+template <int bitdepth, typename Residual>
+void Wht4DcOnly_C(void* dest, int8_t range, bool /*should_round*/,
+                  int /*row_shift*/, bool /*is_row*/) {
+  auto* const dst = static_cast<Residual*>(dest);
+  const int shift = range;
+
+  Residual temp = dst[0] >> shift;
+  // This signed right shift must be an arithmetic shift.
+  Residual e = temp >> 1;
+  dst[0] = temp - e;
+  dst[1] = e;
+  dst[2] = e;
+  dst[3] = e;
+
+  ClampIntermediate<bitdepth, Residual>(dst, 4);
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loop
+
+using InverseTransform1dFunc = void (*)(void* dst, int8_t range);
+using InverseTransformDcOnlyFunc = void (*)(void* dest, int8_t range,
+                                            bool should_round, int row_shift,
+                                            bool is_row);
+
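+// A rough usage sketch (illustrative only; the real call sites live in the
+// decoder, not in this file). Each 2D inverse transform applies the row pass
+// and then the column pass through the table populated below:
+//   dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow](
+//       tx_type, tx_size, adjusted_tx_height, src_buffer, start_x, start_y,
+//       dst_frame);
+//   dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn](
+//       tx_type, tx_size, adjusted_tx_height, src_buffer, start_x, start_y,
+//       dst_frame);
+// The row pass rewrites |src_buffer| in place; the column pass adds the
+// final residual into |dst_frame|.
+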
+template <int bitdepth, typename Residual, typename Pixel,
+          Transform1d transform1d_type,
+          InverseTransformDcOnlyFunc dconly_transform1d,
+          InverseTransform1dFunc transform1d_func, bool is_row>
+void TransformLoop_C(TransformType tx_type, TransformSize tx_size,
+                     int adjusted_tx_height, void* LIBGAV1_RESTRICT src_buffer,
+                     int start_x, int start_y,
+                     void* LIBGAV1_RESTRICT dst_frame) {
+  constexpr bool lossless = transform1d_type == kTransform1dWht;
+  constexpr bool is_identity = transform1d_type == kTransform1dIdentity;
+  // The transform size of the WHT is always 4x4. Setting tx_width and
+  // tx_height to the constant 4 for the WHT speeds the code up.
+  assert(!lossless || tx_size == kTransformSize4x4);
+  const int tx_width = lossless ? 4 : kTransformWidth[tx_size];
+  const int tx_height = lossless ? 4 : kTransformHeight[tx_size];
+  const int tx_width_log2 = kTransformWidthLog2[tx_size];
+  const int tx_height_log2 = kTransformHeightLog2[tx_size];
+  auto* frame = static_cast<Array2DView<Pixel>*>(dst_frame);
+
+  // Initially this points to the dequantized values. After the transforms are
+  // applied, this buffer contains the residual.
+  Array2DView<Residual> residual(tx_height, tx_width,
+                                 static_cast<Residual*>(src_buffer));
+
+  if (is_row) {
+    // Row transform.
+    const uint8_t row_shift = lossless ? 0 : kTransformRowShift[tx_size];
+    // This is the |range| parameter of the InverseTransform1dFunc.  For lossy
+    // transforms, this will be equal to the clamping range.
+    const int8_t row_clamp_range = lossless ? 2 : (bitdepth + 8);
+    // If the width:height ratio of the transform size is 2:1 or 1:2, multiply
+    // the input to the row transform by 1 / sqrt(2), which is approximated by
+    // the fraction 2896 / 2^12.
+    const bool should_round = std::abs(tx_width_log2 - tx_height_log2) == 1;
+
+    if (adjusted_tx_height == 1) {
+      dconly_transform1d(residual[0], row_clamp_range, should_round, row_shift,
+                         true);
+      return;
+    }
+
+    // Row transforms need to be done only up to 32 because the rest of the
+    // rows are always all zero if |tx_height| is 64.  Otherwise, only process
+    // the rows that have nonzero coefficients.
+    for (int i = 0; i < adjusted_tx_height; ++i) {
+      // If lossless, the transform size is 4x4, so should_round is false.
+      if (!lossless && should_round) {
+        // The last 32 values of every row are always zero if the |tx_width| is
+        // 64.
+        for (int j = 0; j < std::min(tx_width, 32); ++j) {
+          residual[i][j] = RightShiftWithRounding(
+              residual[i][j] * kTransformRowMultiplier, 12);
+        }
+      }
+      // For identity transform, |transform1d_func| also performs the
+      // Round2(T[j], rowShift) call in the spec.
+      transform1d_func(residual[i], is_identity ? row_shift : row_clamp_range);
+      if (!lossless && !is_identity && row_shift > 0) {
+        for (int j = 0; j < tx_width; ++j) {
+          residual[i][j] = RightShiftWithRounding(residual[i][j], row_shift);
+        }
+      }
+
+      ClampIntermediate<bitdepth, Residual>(residual[i], tx_width);
+    }
+    return;
+  }
+
+  assert(!is_row);
+  constexpr uint8_t column_shift = lossless ? 0 : kTransformColumnShift;
+  // This is the |range| parameter of the InverseTransform1dFunc.  For lossy
+  // transforms, this will be equal to the clamping range.
+  const int8_t column_clamp_range = lossless ? 0 : std::max(bitdepth + 6, 16);
+  const bool flip_rows = transform1d_type == kTransform1dAdst &&
+                         kTransformFlipRowsMask.Contains(tx_type);
+  const bool flip_columns =
+      !lossless && kTransformFlipColumnsMask.Contains(tx_type);
+  const int min_value = 0;
+  const int max_value = (1 << bitdepth) - 1;
+  // Note: 64 is the maximum size of a 1D transform buffer (the largest
+  // transform size is kTransformSize64x64).
+  Residual tx_buffer[64];
+  for (int j = 0; j < tx_width; ++j) {
+    const int flipped_j = flip_columns ? tx_width - j - 1 : j;
+    int i = 0;
+    do {
+      tx_buffer[i] = residual[i][flipped_j];
+    } while (++i != tx_height);
+    if (adjusted_tx_height == 1) {
+      dconly_transform1d(tx_buffer, column_clamp_range, false, 0, false);
+    } else {
+      // For identity transform, |transform1d_func| also performs the
+      // Round2(T[i], colShift) call in the spec.
+      transform1d_func(tx_buffer,
+                       is_identity ? column_shift : column_clamp_range);
+    }
+    const int x = start_x + j;
+    for (int i = 0; i < tx_height; ++i) {
+      const int y = start_y + i;
+      const int index = flip_rows ? tx_height - i - 1 : i;
+      Residual residual_value = tx_buffer[index];
+      if (!lossless && !is_identity) {
+        residual_value = RightShiftWithRounding(residual_value, column_shift);
+      }
+      (*frame)[y][x] =
+          Clip3((*frame)[y][x] + residual_value, min_value, max_value);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+template <int bitdepth, typename Residual, typename Pixel>
+void InitAll(Dsp* const dsp) {
+  // Maximum transform size for Dct is 64.
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+                      DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+                      DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+                      DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+                      DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+                      DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+                      DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+                      DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+                      DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+                      DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+                      DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+                      /*is_row=*/false>;
+
+  // Maximum transform size for Adst is 16.
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+                      Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+                      Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+                      Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+                      Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+                      Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+                      Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+                      /*is_row=*/false>;
+
+  // Maximum transform size for Identity transform is 32.
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+                      Identity4DcOnly_C<bitdepth, Residual>,
+                      Identity4Row_C<Residual>, /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+                      Identity4DcOnly_C<bitdepth, Residual>,
+                      Identity4Column_C<Residual>, /*is_row=*/false>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+                      Identity8DcOnly_C<bitdepth, Residual>,
+                      Identity8Row_C<Residual>, /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+                      Identity8DcOnly_C<bitdepth, Residual>,
+                      Identity8Column_C<Residual>, /*is_row=*/false>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+                      Identity16DcOnly_C<bitdepth, Residual>,
+                      Identity16Row_C<Residual>, /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+                      Identity16DcOnly_C<bitdepth, Residual>,
+                      Identity16Column_C<Residual>, /*is_row=*/false>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+                      Identity32DcOnly_C<bitdepth, Residual>,
+                      Identity32Row_C<Residual>, /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+                      Identity32DcOnly_C<bitdepth, Residual>,
+                      Identity32Column_C<Residual>, /*is_row=*/false>;
+
+  // Maximum transform size for Wht is 4.
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dWht,
+                      Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dWht,
+                      Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+                      /*is_row=*/false>;
+}
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
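+// When LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is 0, each #ifndef block below
+// installs the C implementation only when the corresponding
+// LIBGAV1_Dsp{8,10,12}bpp_* macro was not defined by one of the optimized
+// headers pulled in through inverse_transform.h, i.e. only when no
+// specialized version exists for that transform type and size.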
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  InitAll<8, int16_t, uint8_t>(dsp);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+                      DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+                      DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+                      DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+                      DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+                      DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+                      DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+                      DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+                      DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+                      DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+                      DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+                      Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+                      Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+                      Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+                      Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+                      Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+                      Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+                      Identity4DcOnly_C<8, int16_t>, Identity4Row_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+                      Identity4DcOnly_C<8, int16_t>, Identity4Column_C<int16_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+                      Identity8DcOnly_C<8, int16_t>, Identity8Row_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+                      Identity8DcOnly_C<8, int16_t>, Identity8Column_C<int16_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+                      Identity16DcOnly_C<8, int16_t>, Identity16Row_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+                      Identity16DcOnly_C<8, int16_t>,
+                      Identity16Column_C<int16_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+                      Identity32DcOnly_C<8, int16_t>, Identity32Row_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+                      Identity32DcOnly_C<8, int16_t>,
+                      Identity32Column_C<int16_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dWht,
+                      Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, kTransform1dWht,
+                      Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+                      /*is_row=*/false>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  InitAll<10, int32_t, uint16_t>(dsp);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize64_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+                      Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+                      Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+                      Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+                      Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+                      Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+                      Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity4DcOnly_C<10, int32_t>, Identity4Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity4DcOnly_C<10, int32_t>,
+                      Identity4Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity8DcOnly_C<10, int32_t>, Identity8Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity8DcOnly_C<10, int32_t>,
+                      Identity8Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity16DcOnly_C<10, int32_t>, Identity16Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity16DcOnly_C<10, int32_t>,
+                      Identity16Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity32DcOnly_C<10, int32_t>, Identity32Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity32DcOnly_C<10, int32_t>,
+                      Identity32Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dWht
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dWht,
+                      Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, kTransform1dWht,
+                      Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  InitAll<12, int32_t, uint16_t>(dsp);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 4>, Dct_C<int32_t, 4>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 4>, Dct_C<int32_t, 4>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize64_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity4DcOnly_C<12, int32_t>, Identity4Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity4DcOnly_C<12, int32_t>,
+                      Identity4Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity8DcOnly_C<12, int32_t>, Identity8Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity8DcOnly_C<12, int32_t>,
+                      Identity8Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity16DcOnly_C<12, int32_t>, Identity16Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity16DcOnly_C<12, int32_t>,
+                      Identity16Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity32DcOnly_C<12, int32_t>, Identity32Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity32DcOnly_C<12, int32_t>,
+                      Identity32Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dWht
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht,
+                      Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht,
+                      Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void InverseTransformInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+
+  // Local functions that may be unused depending on the optimizations
+  // available.
+  static_cast<void>(kBitReverseLookup);
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/inverse_transform.h b/src/dsp/inverse_transform.h
new file mode 100644 (file)
index 0000000..0916665
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
+#define LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/inverse_transform_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc. The
+// order of includes is important, as each tests for a superior version before
+// setting the base.
+// clang-format off
+#include "src/dsp/x86/inverse_transform_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms. This function is not thread-safe.
+void InverseTransformInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
diff --git a/src/dsp/inverse_transform.inc b/src/dsp/inverse_transform.inc
new file mode 100644 (file)
index 0000000..55e68b6
--- /dev/null
@@ -0,0 +1,64 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for inverse transform implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+// The value at index i is derived as: round(cos(pi * i / 128) * (1 << 12)).
+constexpr int16_t kCos128[65] = {
+    4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
+    3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
+    3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
+    2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
+    1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
+    897,  799,  700,  601,  501,  401,  301,  201,  101,  0};
+
+inline int16_t Cos128(int angle) {
+  angle &= 0xff;
+
+  // If |angle| is 128, this function returns -4096 (= -2^12), which will
+  // cause the 32-bit multiplications in ButterflyRotation() to overflow if
+  // dst[a] or dst[b] is -2^19 (a possible corner case when |range| is 20):
+  //
+  //   (-2^12) * (-2^19) = 2^31, which cannot be represented as an int32_t.
+  //
+  // Note: |range| is 20 when bitdepth is 12 and a row transform is performed.
+  //
+  // Assert that this angle is never used by DCT or ADST.
+  assert(angle != 128);
+  if (angle <= 64) return kCos128[angle];
+  if (angle <= 128) return -kCos128[128 - angle];
+  if (angle <= 192) return -kCos128[angle - 128];
+  return kCos128[256 - angle];
+}
+
+inline int16_t Sin128(int angle) { return Cos128(angle - 64); }
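+
+// Illustrative sanity check (not used by the code): Cos128(32) returns
+// kCos128[32] = 2896 = round(cos(pi / 4) * 2^12), and Sin128(96) =
+// Cos128(96 - 64) = 2896 = round(sin(3 * pi / 4) * 2^12).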
+
+// The value for index i is derived as:
+// round(sqrt(2) * sin(i * pi / 9) * 2 / 3 * (1 << 12)).
+constexpr int16_t kAdst4Multiplier[4] = {1321, 2482, 3344, 3803};
+
+constexpr uint8_t kTransformRowShift[kNumTransformSizes] = {
+    0, 0, 1, 0, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2};
+
+constexpr bool kShouldRound[kNumTransformSizes] = {
+    false, true,  false, true, false, true, false, false, true, false,
+    true,  false, false, true, false, true, false, true,  false};
+
+constexpr int16_t kIdentity4Multiplier /* round(2^12 * sqrt(2)) */ = 0x16A1;
+constexpr int16_t kIdentity4MultiplierFraction /* round(2^12 * (sqrt(2) - 1)) */
+    = 0x6A1;
+constexpr int16_t kIdentity16Multiplier /* 2 * round(2^12 * sqrt(2)) */ = 11586;
+constexpr int16_t kTransformRowMultiplier /* round(2^12 / sqrt(2)) */ = 2896;
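+
+// Illustrative sanity checks for the constants above: 0x16A1 = 5793 =
+// round(2^12 * sqrt(2)); kIdentity16Multiplier = 2 * 5793 = 11586; and
+// Round2(x * kTransformRowMultiplier, 12) approximates x / sqrt(2).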
diff --git a/src/dsp/inverse_transform_test.cc b/src/dsp/inverse_transform_test.cc
new file mode 100644 (file)
index 0000000..d74a33a
--- /dev/null
@@ -0,0 +1,557 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+const char* const kTransform1dSizeNames[kNumTransform1dSizes] = {
+    "kTransform1dSize4", "kTransform1dSize8", "kTransform1dSize16",
+    "kTransform1dSize32", "kTransform1dSize64"};
+
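+// These tables map each TransformSize to the 1d size of its row and column
+// transforms: the row transform length is the block width and the column
+// transform length is the block height (e.g. a 4x8 block uses a size-4 row
+// transform and a size-8 column transform).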
+constexpr Transform1dSize kRowTransform1dSizes[] = {
+    kTransform1dSize4,  kTransform1dSize4,  kTransform1dSize4,
+    kTransform1dSize8,  kTransform1dSize8,  kTransform1dSize8,
+    kTransform1dSize8,  kTransform1dSize16, kTransform1dSize16,
+    kTransform1dSize16, kTransform1dSize16, kTransform1dSize16,
+    kTransform1dSize32, kTransform1dSize32, kTransform1dSize32,
+    kTransform1dSize32, kTransform1dSize64, kTransform1dSize64,
+    kTransform1dSize64};
+
+constexpr Transform1dSize kColTransform1dSizes[] = {
+    kTransform1dSize4,  kTransform1dSize8,  kTransform1dSize16,
+    kTransform1dSize4,  kTransform1dSize8,  kTransform1dSize16,
+    kTransform1dSize32, kTransform1dSize4,  kTransform1dSize8,
+    kTransform1dSize16, kTransform1dSize32, kTransform1dSize64,
+    kTransform1dSize8,  kTransform1dSize16, kTransform1dSize32,
+    kTransform1dSize64, kTransform1dSize16, kTransform1dSize32,
+    kTransform1dSize64};
+
+template <int bitdepth, typename SrcPixel, typename DstPixel>
+class InverseTransformTestBase : public testing::TestWithParam<TransformSize>,
+                                 public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  InverseTransformTestBase() {
+    switch (tx_size_) {
+      case kNumTransformSizes:
+        EXPECT_NE(tx_size_, kNumTransformSizes);
+        break;
+      default:
+        block_width_ = kTransformWidth[tx_size_];
+        block_height_ = kTransformHeight[tx_size_];
+        break;
+    }
+  }
+
+  InverseTransformTestBase(const InverseTransformTestBase&) = delete;
+  InverseTransformTestBase& operator=(const InverseTransformTestBase&) = delete;
+  ~InverseTransformTestBase() override = default;
+
+ protected:
+  struct InverseTransformMem {
+    void Reset(libvpx_test::ACMRandom* rnd, int width, int height) {
+      ASSERT_NE(rnd, nullptr);
+      // Limit the size of the residual values to bitdepth + sign in order
+      // to prevent overflow in the transforms.
+      const int num_bits = bitdepth + 1;
+      const int sign_shift = (bitdepth == 8 ? 16 : 32) - num_bits;
+      const int mask = (1 << num_bits) - 1;
+      // Fill the residual with random data. Only the upper-left
+      // min(width, 32) x min(height, 32) region is filled; for 64-point
+      // transforms the remaining coefficients are implicitly zero.
+      memset(ref_src, 0, sizeof(ref_src));
+      SrcPixel* r = ref_src;
+      const int stride = width;
+      for (int y = 0; y < std::min(height, 32); ++y) {
+        for (int x = 0; x < std::min(width, 32); ++x) {
+          r[x] = rnd->Rand16() & mask;
+          // The msb of num_bits is the sign bit, so shift it to the top of
+          // the word and back down to force each value to the correct sign.
+          r[x] = (r[x] << sign_shift) >> sign_shift;
+        }
+        r += stride;
+      }
+
+      // Set frame data to random values.
+      for (int y = 0; y < kMaxBlockSize; ++y) {
+        for (int x = 0; x < kMaxBlockSize; ++x) {
+          const int mask = (1 << bitdepth) - 1;
+          cur_frame[y * kMaxBlockSize + x] = base_frame[y * kMaxBlockSize + x] =
+              rnd->Rand16() & mask;
+        }
+      }
+    }
+
+    // Set ref_src to |pixel|.
+    void Set(const SrcPixel pixel) {
+      for (auto& r : ref_src) r = pixel;
+    }
+
+    alignas(kMaxAlignment) DstPixel base_frame[kTotalPixels];
+    alignas(kMaxAlignment) DstPixel cur_frame[kTotalPixels];
+
+    alignas(kMaxAlignment) SrcPixel base_residual[kTotalPixels];
+    alignas(kMaxAlignment) SrcPixel cur_residual[kTotalPixels];
+
+    alignas(kMaxAlignment) SrcPixel ref_src[kTotalPixels];
+  };
+
+  void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+  const TransformSize tx_size_ = GetParam();
+  int block_width_;
+  int block_height_;
+  InverseTransformMem inverse_transform_mem_;
+};
+
+//------------------------------------------------------------------------------
+// InverseTransformTest
+
+template <int bitdepth, typename Pixel, typename DstPixel>
+class InverseTransformTest
+    : public InverseTransformTestBase<bitdepth, Pixel, DstPixel> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  InverseTransformTest() = default;
+  InverseTransformTest(const InverseTransformTest&) = delete;
+  InverseTransformTest& operator=(const InverseTransformTest&) = delete;
+  ~InverseTransformTest() override = default;
+
+ protected:
+  using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::tx_size_;
+  using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::block_width_;
+  using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::block_height_;
+  using InverseTransformTestBase<bitdepth, Pixel,
+                                 DstPixel>::inverse_transform_mem_;
+
+  void SetUp() override {
+    InverseTransformTestBase<bitdepth, Pixel, DstPixel>::SetUp();
+    InverseTransformInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+
+    tx_size_1d_row_ = kRowTransform1dSizes[tx_size_];
+    tx_size_1d_column_ = kColTransform1dSizes[tx_size_];
+
+    memcpy(base_inverse_transforms_, dsp->inverse_transforms,
+           sizeof(base_inverse_transforms_));
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      memset(base_inverse_transforms_, 0, sizeof(base_inverse_transforms_));
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      InverseTransformInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      InverseTransformInit_NEON();
+      InverseTransformInit10bpp_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    memcpy(cur_inverse_transforms_, dsp->inverse_transforms,
+           sizeof(cur_inverse_transforms_));
+
+    for (int i = 0; i < kNumTransform1ds; ++i) {
+      // skip functions that haven't been specialized for this particular
+      // architecture.
+      if (cur_inverse_transforms_[i][tx_size_1d_row_][kRow] ==
+          base_inverse_transforms_[i][tx_size_1d_row_][kRow]) {
+        cur_inverse_transforms_[i][tx_size_1d_row_][kRow] = nullptr;
+      }
+      if (cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] ==
+          base_inverse_transforms_[i][tx_size_1d_column_][kColumn]) {
+        cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] = nullptr;
+      }
+    }
+
+    base_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize,
+                             inverse_transform_mem_.base_frame);
+
+    cur_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize,
+                            inverse_transform_mem_.cur_frame);
+  }
+
+  // These tests modify inverse_transform_mem_.
+  void TestRandomValues(int num_tests);
+  void TestDcOnlyRandomValue(int num_tests);
+
+  Array2DView<DstPixel> base_frame_buffer_;
+  Array2DView<DstPixel> cur_frame_buffer_;
+
+  Transform1dSize tx_size_1d_row_ = kTransform1dSize4;
+  Transform1dSize tx_size_1d_column_ = kTransform1dSize4;
+
+  InverseTransformAddFuncs base_inverse_transforms_;
+  InverseTransformAddFuncs cur_inverse_transforms_;
+};
+
+constexpr TransformType kLibgav1TxType[kNumTransformTypes] = {
+    kTransformTypeDctDct,           kTransformTypeAdstDct,
+    kTransformTypeDctAdst,          kTransformTypeAdstAdst,
+    kTransformTypeFlipadstDct,      kTransformTypeDctFlipadst,
+    kTransformTypeFlipadstFlipadst, kTransformTypeAdstFlipadst,
+    kTransformTypeFlipadstAdst,     kTransformTypeIdentityIdentity,
+    kTransformTypeIdentityDct,      kTransformTypeDctIdentity,
+    kTransformTypeIdentityAdst,     kTransformTypeAdstIdentity,
+    kTransformTypeIdentityFlipadst, kTransformTypeFlipadstIdentity};
+
+// Maps TransformType to dsp::Transform1d for the row transforms.
+constexpr Transform1d kRowTransform[kNumTransformTypes] = {
+    kTransform1dDct,      kTransform1dAdst,     kTransform1dDct,
+    kTransform1dAdst,     kTransform1dAdst,     kTransform1dDct,
+    kTransform1dAdst,     kTransform1dAdst,     kTransform1dAdst,
+    kTransform1dIdentity, kTransform1dIdentity, kTransform1dDct,
+    kTransform1dIdentity, kTransform1dAdst,     kTransform1dIdentity,
+    kTransform1dAdst};
+
+// Maps TransformType to dsp::Transform1d for the column transforms.
+constexpr Transform1d kColumnTransform[kNumTransformTypes] = {
+    kTransform1dDct,      kTransform1dDct,      kTransform1dAdst,
+    kTransform1dAdst,     kTransform1dDct,      kTransform1dAdst,
+    kTransform1dAdst,     kTransform1dAdst,     kTransform1dAdst,
+    kTransform1dIdentity, kTransform1dDct,      kTransform1dIdentity,
+    kTransform1dAdst,     kTransform1dIdentity, kTransform1dAdst,
+    kTransform1dIdentity};
+
+// Mask indicating whether the transform sets contain a particular transform
+// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
+    BitMaskSet(0x1),    BitMaskSet(0xE0F), BitMaskSet(0x20F),
+    BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};
+
+bool IsTxSizeTypeValid(TransformSize tx_size, TransformType tx_type) {
+  const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+  TransformSet tx_set;
+  if (tx_size_square_max > kTransformSize32x32) {
+    tx_set = kTransformSetDctOnly;
+  } else if (tx_size_square_max == kTransformSize32x32) {
+    tx_set = kTransformSetInter3;
+  } else if (tx_size_square_max == kTransformSize16x16) {
+    tx_set = kTransformSetInter2;
+  } else {
+    tx_set = kTransformSetInter1;
+  }
+  return kTransformTypeInSetMask[tx_set].Contains(tx_type);
+}
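+
+// For example (assuming the TransformType enumerators follow the order of
+// kLibgav1TxType above, i.e. kTransformTypeDctDct == 0): a 16x64 block has a
+// maximum square size of 64x64 and therefore selects kTransformSetDctOnly
+// (BitMaskSet(0x1)), so only kTransformTypeDctDct is valid, while a 4x4 block
+// selects kTransformSetInter1 (0xFFFF), for which all 16 types are valid.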
+
+template <int bitdepth, typename Pixel, typename DstPixel>
+void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestRandomValues(
+    int num_tests) {
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+
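+  // The extra iteration at tx_type_idx == -1 covers the Walsh-Hadamard
+  // transform (kTransform1dWht), which has no entry in TransformType;
+  // kTransformTypeDctDct is passed as a stand-in type argument.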
+  for (int tx_type_idx = -1; tx_type_idx < kNumTransformTypes; ++tx_type_idx) {
+    const TransformType tx_type = (tx_type_idx == -1)
+                                      ? kTransformTypeDctDct
+                                      : kLibgav1TxType[tx_type_idx];
+    const Transform1d row_transform =
+        (tx_type_idx == -1) ? kTransform1dWht : kRowTransform[tx_type];
+    const Transform1d column_transform =
+        (tx_type_idx == -1) ? kTransform1dWht : kColumnTransform[tx_type];
+
+    // Skip the 'C' test case, as it is the reference, along with any
+    // function that has no specialized version (nulled in SetUp()).
+    if (base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] ==
+            nullptr ||
+        cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] ==
+            nullptr ||
+        base_inverse_transforms_[column_transform][tx_size_1d_column_]
+                                [kColumn] == nullptr ||
+        cur_inverse_transforms_[column_transform][tx_size_1d_column_]
+                               [kColumn] == nullptr) {
+      continue;
+    }
+
+    // Only test valid tx_size for given tx_type.  See 5.11.40.
+    if (!IsTxSizeTypeValid(tx_size_, tx_type)) continue;
+
+    absl::Duration base_elapsed_time[2];
+    absl::Duration cur_elapsed_time[2];
+
+    for (int n = 0; n < num_tests; ++n) {
+      const int tx_height = std::min(block_height_, 32);
+      const int start_x = 0;
+      const int start_y = 0;
+
+      inverse_transform_mem_.Reset(&rnd, block_width_, block_height_);
+      memcpy(inverse_transform_mem_.base_residual,
+             inverse_transform_mem_.ref_src,
+             sizeof(inverse_transform_mem_.ref_src));
+      memcpy(inverse_transform_mem_.cur_residual,
+             inverse_transform_mem_.ref_src,
+             sizeof(inverse_transform_mem_.ref_src));
+
+      const absl::Time base_row_start = absl::Now();
+      base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+          tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+          start_x, start_y, &base_frame_buffer_);
+      base_elapsed_time[kRow] += absl::Now() - base_row_start;
+
+      const absl::Time cur_row_start = absl::Now();
+      cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+          tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual,
+          start_x, start_y, &cur_frame_buffer_);
+      cur_elapsed_time[kRow] += absl::Now() - cur_row_start;
+
+      const absl::Time base_column_start = absl::Now();
+      base_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+          tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+          start_x, start_y, &base_frame_buffer_);
+      base_elapsed_time[kColumn] += absl::Now() - base_column_start;
+
+      const absl::Time cur_column_start = absl::Now();
+      cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+          tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual,
+          start_x, start_y, &cur_frame_buffer_);
+      cur_elapsed_time[kColumn] += absl::Now() - cur_column_start;
+
+      if (!test_utils::CompareBlocks(inverse_transform_mem_.base_frame,
+                                     inverse_transform_mem_.cur_frame,
+                                     block_width_, block_height_, kMaxBlockSize,
+                                     kMaxBlockSize, false)) {
+        ADD_FAILURE() << "Result from optimized version of "
+                      << ToString(
+                             static_cast<Transform1dSize>(tx_size_1d_column_))
+                      << " differs from reference in iteration #" << n
+                      << " tx_type_idx:" << tx_type_idx;
+        break;
+      }
+    }
+
+    if (num_tests > 1) {
+      const auto base_row_elapsed_time_us =
+          static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time[kRow]));
+      const auto cur_row_elapsed_time_us =
+          static_cast<int>(absl::ToInt64Microseconds(cur_elapsed_time[kRow]));
+      printf("TxType %30s[%19s]:: base_row: %5d us  cur_row: %5d us  %2.2fx \n",
+             (tx_type_idx == -1) ? ToString(row_transform) : ToString(tx_type),
+             kTransform1dSizeNames[tx_size_1d_row_], base_row_elapsed_time_us,
+             cur_row_elapsed_time_us,
+             static_cast<float>(base_row_elapsed_time_us) /
+                 static_cast<float>(cur_row_elapsed_time_us));
+      const auto base_column_elapsed_time_us = static_cast<int>(
+          absl::ToInt64Microseconds(base_elapsed_time[kColumn]));
+      const auto cur_column_elapsed_time_us = static_cast<int>(
+          absl::ToInt64Microseconds(cur_elapsed_time[kColumn]));
+      printf(
+          "TxType %30s[%19s]:: base_col: %5d us  cur_col: %5d us  %2.2fx \n",
+          (tx_type_idx == -1) ? ToString(column_transform) : ToString(tx_type),
+          kTransform1dSizeNames[tx_size_1d_column_],
+          base_column_elapsed_time_us, cur_column_elapsed_time_us,
+          static_cast<float>(base_column_elapsed_time_us) /
+              static_cast<float>(cur_column_elapsed_time_us));
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel, typename DstPixel>
+void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestDcOnlyRandomValue(
+    int num_tests) {
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+
+  for (int tx_type_idx = 0; tx_type_idx < kNumTransformTypes; ++tx_type_idx) {
+    const TransformType tx_type = kLibgav1TxType[tx_type_idx];
+    const Transform1d row_transform = kRowTransform[tx_type];
+    const Transform1d column_transform = kColumnTransform[tx_type];
+
+    if (cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] ==
+            nullptr ||
+        cur_inverse_transforms_[column_transform][tx_size_1d_column_]
+                               [kColumn] == nullptr) {
+      continue;
+    }
+
+    // Only test valid tx_size for given tx_type.  See 5.11.40.
+    if (!IsTxSizeTypeValid(tx_size_, tx_type)) continue;
+
+    absl::Duration base_elapsed_time[2];
+    absl::Duration cur_elapsed_time[2];
+
+    for (int n = 0; n < num_tests; ++n) {
+      const int tx_height = std::min(block_height_, 32);
+      const int start_x = 0;
+      const int start_y = 0;
+
+      // Using width == 1 and height == 1 will reset only the dc value.
+      inverse_transform_mem_.Reset(&rnd, 1, 1);
+      memcpy(inverse_transform_mem_.base_residual,
+             inverse_transform_mem_.ref_src,
+             sizeof(inverse_transform_mem_.ref_src));
+      memcpy(inverse_transform_mem_.cur_residual,
+             inverse_transform_mem_.ref_src,
+             sizeof(inverse_transform_mem_.ref_src));
+
+      // For this test, the "base" output is produced with tx_height set to
+      // the maximum for the given block size, while the "cur" output is
+      // produced with an adjusted tx_height of 1. Since only the dc value is
+      // nonzero, the two outputs must match.
+      const absl::Time base_row_start = absl::Now();
+      cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+          tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+          start_x, start_y, &base_frame_buffer_);
+      base_elapsed_time[kRow] += absl::Now() - base_row_start;
+
+      const absl::Time cur_row_start = absl::Now();
+      cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+          tx_type, tx_size_, /*adjusted_tx_height=*/1,
+          inverse_transform_mem_.cur_residual, start_x, start_y,
+          &cur_frame_buffer_);
+      cur_elapsed_time[kRow] += absl::Now() - cur_row_start;
+
+      const absl::Time base_column_start = absl::Now();
+      cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+          tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+          start_x, start_y, &base_frame_buffer_);
+      base_elapsed_time[kColumn] += absl::Now() - base_column_start;
+
+      const absl::Time cur_column_start = absl::Now();
+      cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+          tx_type, tx_size_, /*adjusted_tx_height=*/1,
+          inverse_transform_mem_.cur_residual, start_x, start_y,
+          &cur_frame_buffer_);
+      cur_elapsed_time[kColumn] += absl::Now() - cur_column_start;
+
+      if (!test_utils::CompareBlocks(inverse_transform_mem_.base_frame,
+                                     inverse_transform_mem_.cur_frame,
+                                     block_width_, block_height_, kMaxBlockSize,
+                                     kMaxBlockSize, false)) {
+        ADD_FAILURE() << "Result from dc only version of "
+                      << ToString(
+                             static_cast<Transform1dSize>(tx_size_1d_column_))
+                      << " differs from reference in iteration #" << n
+                      << "tx_type_idx:" << tx_type_idx;
+        break;
+      }
+    }
+
+    if (num_tests > 1) {
+      const auto base_row_elapsed_time_us =
+          static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time[kRow]));
+      const auto cur_row_elapsed_time_us =
+          static_cast<int>(absl::ToInt64Microseconds(cur_elapsed_time[kRow]));
+      printf("TxType %30s[%19s]:: base_row: %5d us  cur_row: %5d us  %2.2fx \n",
+             ToString(tx_type), kTransform1dSizeNames[tx_size_1d_row_],
+             base_row_elapsed_time_us, cur_row_elapsed_time_us,
+             static_cast<float>(base_row_elapsed_time_us) /
+                 static_cast<float>(cur_row_elapsed_time_us));
+      const auto base_column_elapsed_time_us = static_cast<int>(
+          absl::ToInt64Microseconds(base_elapsed_time[kColumn]));
+      const auto cur_column_elapsed_time_us = static_cast<int>(
+          absl::ToInt64Microseconds(cur_elapsed_time[kColumn]));
+      printf("TxType %30s[%19s]:: base_col: %5d us  cur_col: %5d us  %2.2fx \n",
+             ToString(tx_type), kTransform1dSizeNames[tx_size_1d_column_],
+             base_column_elapsed_time_us, cur_column_elapsed_time_us,
+             static_cast<float>(base_column_elapsed_time_us) /
+                 static_cast<float>(cur_column_elapsed_time_us));
+    }
+  }
+}
+
+using InverseTransformTest8bpp = InverseTransformTest<8, int16_t, uint8_t>;
+
+TEST_P(InverseTransformTest8bpp, Random) { TestRandomValues(1); }
+
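+// gtest skips DISABLED_ tests by default; pass
+// --gtest_also_run_disabled_tests (optionally with a --gtest_filter) to run
+// the speed measurement below.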
+TEST_P(InverseTransformTest8bpp, DISABLED_Speed) { TestRandomValues(10000); }
+
+TEST_P(InverseTransformTest8bpp, DcRandom) { TestDcOnlyRandomValue(1); }
+
+constexpr TransformSize kTransformSizesAll[] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+    kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest8bpp,
+                         testing::ValuesIn(kTransformSizesAll));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest8bpp,
+                         testing::ValuesIn(kTransformSizesAll));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, InverseTransformTest8bpp,
+                         testing::ValuesIn(kTransformSizesAll));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using InverseTransformTest10bpp = InverseTransformTest<10, int32_t, uint16_t>;
+
+TEST_P(InverseTransformTest10bpp, Random) { TestRandomValues(1); }
+
+TEST_P(InverseTransformTest10bpp, DISABLED_Speed) { TestRandomValues(10000); }
+
+TEST_P(InverseTransformTest10bpp, DcRandom) { TestDcOnlyRandomValue(1); }
+
+INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest10bpp,
+                         testing::ValuesIn(kTransformSizesAll));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest10bpp,
+                         testing::ValuesIn(kTransformSizesAll));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using InverseTransformTest12bpp = InverseTransformTest<12, int32_t, uint16_t>;
+
+TEST_P(InverseTransformTest12bpp, Random) { TestRandomValues(1); }
+
+TEST_P(InverseTransformTest12bpp, DISABLED_Speed) { TestRandomValues(12000); }
+
+TEST_P(InverseTransformTest12bpp, DcRandom) { TestDcOnlyRandomValue(1); }
+
+INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest12bpp,
+                         testing::ValuesIn(kTransformSizesAll));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize param) {
+  return os << ToString(param);
+}
+
+}  // namespace libgav1
diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake
new file mode 100644 (file)
index 0000000..fedb35b
--- /dev/null
@@ -0,0 +1,204 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_)
+  return()
+endif() # LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_
+set(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_ 1)
+
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+
+list(APPEND libgav1_dsp_sources
+            "${libgav1_source}/dsp/average_blend.cc"
+            "${libgav1_source}/dsp/average_blend.h"
+            "${libgav1_source}/dsp/cdef.cc"
+            "${libgav1_source}/dsp/cdef.h"
+            "${libgav1_source}/dsp/cdef.inc"
+            "${libgav1_source}/dsp/common.h"
+            "${libgav1_source}/dsp/constants.cc"
+            "${libgav1_source}/dsp/constants.h"
+            "${libgav1_source}/dsp/convolve.cc"
+            "${libgav1_source}/dsp/convolve.h"
+            "${libgav1_source}/dsp/convolve.inc"
+            "${libgav1_source}/dsp/distance_weighted_blend.cc"
+            "${libgav1_source}/dsp/distance_weighted_blend.h"
+            "${libgav1_source}/dsp/dsp.cc"
+            "${libgav1_source}/dsp/dsp.h"
+            "${libgav1_source}/dsp/film_grain.cc"
+            "${libgav1_source}/dsp/film_grain.h"
+            "${libgav1_source}/dsp/film_grain_common.h"
+            "${libgav1_source}/dsp/intra_edge.cc"
+            "${libgav1_source}/dsp/intra_edge.h"
+            "${libgav1_source}/dsp/intrapred_cfl.cc"
+            "${libgav1_source}/dsp/intrapred_cfl.h"
+            "${libgav1_source}/dsp/intrapred_directional.cc"
+            "${libgav1_source}/dsp/intrapred_directional.h"
+            "${libgav1_source}/dsp/intrapred_filter.cc"
+            "${libgav1_source}/dsp/intrapred_filter.h"
+            "${libgav1_source}/dsp/intrapred.cc"
+            "${libgav1_source}/dsp/intrapred.h"
+            "${libgav1_source}/dsp/intrapred_smooth.cc"
+            "${libgav1_source}/dsp/intrapred_smooth.h"
+            "${libgav1_source}/dsp/inverse_transform.cc"
+            "${libgav1_source}/dsp/inverse_transform.h"
+            "${libgav1_source}/dsp/inverse_transform.inc"
+            "${libgav1_source}/dsp/loop_filter.cc"
+            "${libgav1_source}/dsp/loop_filter.h"
+            "${libgav1_source}/dsp/loop_restoration.cc"
+            "${libgav1_source}/dsp/loop_restoration.h"
+            "${libgav1_source}/dsp/mask_blend.cc"
+            "${libgav1_source}/dsp/mask_blend.h"
+            "${libgav1_source}/dsp/motion_field_projection.cc"
+            "${libgav1_source}/dsp/motion_field_projection.h"
+            "${libgav1_source}/dsp/motion_vector_search.cc"
+            "${libgav1_source}/dsp/motion_vector_search.h"
+            "${libgav1_source}/dsp/obmc.cc"
+            "${libgav1_source}/dsp/obmc.h"
+            "${libgav1_source}/dsp/obmc.inc"
+            "${libgav1_source}/dsp/smooth_weights.inc"
+            "${libgav1_source}/dsp/super_res.cc"
+            "${libgav1_source}/dsp/super_res.h"
+            "${libgav1_source}/dsp/warp.cc"
+            "${libgav1_source}/dsp/warp.h"
+            "${libgav1_source}/dsp/weight_mask.cc"
+            "${libgav1_source}/dsp/weight_mask.h")
+
+list(APPEND libgav1_dsp_sources_avx2
+            ${libgav1_dsp_sources_avx2}
+            "${libgav1_source}/dsp/x86/cdef_avx2.cc"
+            "${libgav1_source}/dsp/x86/cdef_avx2.h"
+            "${libgav1_source}/dsp/x86/convolve_avx2.cc"
+            "${libgav1_source}/dsp/x86/convolve_avx2.h"
+            "${libgav1_source}/dsp/x86/loop_restoration_10bit_avx2.cc"
+            "${libgav1_source}/dsp/x86/loop_restoration_avx2.cc"
+            "${libgav1_source}/dsp/x86/loop_restoration_avx2.h")
+
+list(APPEND libgav1_dsp_sources_neon
+            ${libgav1_dsp_sources_neon}
+            "${libgav1_source}/dsp/arm/average_blend_neon.cc"
+            "${libgav1_source}/dsp/arm/average_blend_neon.h"
+            "${libgav1_source}/dsp/arm/cdef_neon.cc"
+            "${libgav1_source}/dsp/arm/cdef_neon.h"
+            "${libgav1_source}/dsp/arm/common_neon.h"
+            "${libgav1_source}/dsp/arm/convolve_10bit_neon.cc"
+            "${libgav1_source}/dsp/arm/convolve_neon.cc"
+            "${libgav1_source}/dsp/arm/convolve_neon.h"
+            "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.cc"
+            "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.h"
+            "${libgav1_source}/dsp/arm/film_grain_neon.cc"
+            "${libgav1_source}/dsp/arm/film_grain_neon.h"
+            "${libgav1_source}/dsp/arm/intra_edge_neon.cc"
+            "${libgav1_source}/dsp/arm/intra_edge_neon.h"
+            "${libgav1_source}/dsp/arm/intrapred_cfl_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_cfl_neon.h"
+            "${libgav1_source}/dsp/arm/intrapred_directional_neon.h"
+            "${libgav1_source}/dsp/arm/intrapred_directional_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_filter_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_filter_neon.h"
+            "${libgav1_source}/dsp/arm/intrapred_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_neon.h"
+            "${libgav1_source}/dsp/arm/intrapred_smooth_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_smooth_neon.h"
+            "${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc"
+            "${libgav1_source}/dsp/arm/inverse_transform_neon.cc"
+            "${libgav1_source}/dsp/arm/inverse_transform_neon.h"
+            "${libgav1_source}/dsp/arm/loop_filter_10bit_neon.cc"
+            "${libgav1_source}/dsp/arm/loop_filter_neon.cc"
+            "${libgav1_source}/dsp/arm/loop_filter_neon.h"
+            "${libgav1_source}/dsp/arm/loop_restoration_10bit_neon.cc"
+            "${libgav1_source}/dsp/arm/loop_restoration_neon.cc"
+            "${libgav1_source}/dsp/arm/loop_restoration_neon.h"
+            "${libgav1_source}/dsp/arm/mask_blend_neon.cc"
+            "${libgav1_source}/dsp/arm/mask_blend_neon.h"
+            "${libgav1_source}/dsp/arm/motion_field_projection_neon.cc"
+            "${libgav1_source}/dsp/arm/motion_field_projection_neon.h"
+            "${libgav1_source}/dsp/arm/motion_vector_search_neon.cc"
+            "${libgav1_source}/dsp/arm/motion_vector_search_neon.h"
+            "${libgav1_source}/dsp/arm/obmc_neon.cc"
+            "${libgav1_source}/dsp/arm/obmc_neon.h"
+            "${libgav1_source}/dsp/arm/super_res_neon.cc"
+            "${libgav1_source}/dsp/arm/super_res_neon.h"
+            "${libgav1_source}/dsp/arm/warp_neon.cc"
+            "${libgav1_source}/dsp/arm/warp_neon.h"
+            "${libgav1_source}/dsp/arm/weight_mask_neon.cc"
+            "${libgav1_source}/dsp/arm/weight_mask_neon.h")
+
+list(APPEND libgav1_dsp_sources_sse4
+            ${libgav1_dsp_sources_sse4}
+            "${libgav1_source}/dsp/x86/average_blend_sse4.cc"
+            "${libgav1_source}/dsp/x86/average_blend_sse4.h"
+            "${libgav1_source}/dsp/x86/common_sse4.h"
+            "${libgav1_source}/dsp/x86/cdef_sse4.cc"
+            "${libgav1_source}/dsp/x86/cdef_sse4.h"
+            "${libgav1_source}/dsp/x86/convolve_sse4.cc"
+            "${libgav1_source}/dsp/x86/convolve_sse4.h"
+            "${libgav1_source}/dsp/x86/convolve_sse4.inc"
+            "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.cc"
+            "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.h"
+            "${libgav1_source}/dsp/x86/film_grain_sse4.cc"
+            "${libgav1_source}/dsp/x86/film_grain_sse4.h"
+            "${libgav1_source}/dsp/x86/intra_edge_sse4.cc"
+            "${libgav1_source}/dsp/x86/intra_edge_sse4.h"
+            "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc"
+            "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.h"
+            "${libgav1_source}/dsp/x86/intrapred_directional_sse4.cc"
+            "${libgav1_source}/dsp/x86/intrapred_directional_sse4.h"
+            "${libgav1_source}/dsp/x86/intrapred_filter_sse4.cc"
+            "${libgav1_source}/dsp/x86/intrapred_filter_sse4.h"
+            "${libgav1_source}/dsp/x86/intrapred_sse4.cc"
+            "${libgav1_source}/dsp/x86/intrapred_sse4.h"
+            "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.cc"
+            "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.h"
+            "${libgav1_source}/dsp/x86/inverse_transform_sse4.cc"
+            "${libgav1_source}/dsp/x86/inverse_transform_sse4.h"
+            "${libgav1_source}/dsp/x86/loop_filter_sse4.cc"
+            "${libgav1_source}/dsp/x86/loop_filter_sse4.h"
+            "${libgav1_source}/dsp/x86/loop_restoration_10bit_sse4.cc"
+            "${libgav1_source}/dsp/x86/loop_restoration_sse4.cc"
+            "${libgav1_source}/dsp/x86/loop_restoration_sse4.h"
+            "${libgav1_source}/dsp/x86/mask_blend_sse4.cc"
+            "${libgav1_source}/dsp/x86/mask_blend_sse4.h"
+            "${libgav1_source}/dsp/x86/motion_field_projection_sse4.cc"
+            "${libgav1_source}/dsp/x86/motion_field_projection_sse4.h"
+            "${libgav1_source}/dsp/x86/motion_vector_search_sse4.cc"
+            "${libgav1_source}/dsp/x86/motion_vector_search_sse4.h"
+            "${libgav1_source}/dsp/x86/obmc_sse4.cc"
+            "${libgav1_source}/dsp/x86/obmc_sse4.h"
+            "${libgav1_source}/dsp/x86/super_res_sse4.cc"
+            "${libgav1_source}/dsp/x86/super_res_sse4.h"
+            "${libgav1_source}/dsp/x86/transpose_sse4.h"
+            "${libgav1_source}/dsp/x86/warp_sse4.cc"
+            "${libgav1_source}/dsp/x86/warp_sse4.h"
+            "${libgav1_source}/dsp/x86/weight_mask_sse4.cc"
+            "${libgav1_source}/dsp/x86/weight_mask_sse4.h")
+
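+# Defines the dsp object-library target from the source lists above.
+# libgav1_add_library() is provided by cmake/libgav1_targets.cmake (included
+# at the top of this file); libgav1_defines and libgav1_include_paths are
+# expected to be populated by the top-level build before this macro is called.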
+macro(libgav1_add_dsp_targets)
+  unset(dsp_sources)
+  list(APPEND dsp_sources ${libgav1_dsp_sources}
+              ${libgav1_dsp_sources_neon}
+              ${libgav1_dsp_sources_avx2}
+              ${libgav1_dsp_sources_sse4})
+
+  libgav1_add_library(NAME
+                      libgav1_dsp
+                      TYPE
+                      OBJECT
+                      SOURCES
+                      ${dsp_sources}
+                      DEFINES
+                      ${libgav1_defines}
+                      $<$<CONFIG:Debug>:LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS>
+                      INCLUDES
+                      ${libgav1_include_paths})
+endmacro()
diff --git a/src/dsp/loop_filter.cc b/src/dsp/loop_filter.cc
new file mode 100644 (file)
index 0000000..bb0583f
--- /dev/null
@@ -0,0 +1,689 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// 7.14.6.1.
+template <int bitdepth, typename Pixel>
+struct LoopFilterFuncs_C {
+  LoopFilterFuncs_C() = delete;
+
+  static constexpr int kMaxPixel = (1 << bitdepth) - 1;
+  static constexpr int kMinSignedPixel = -(1 << (bitdepth - 1));
+  static constexpr int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1;
+  static constexpr int kFlatThresh = 1 << (bitdepth - 8);
+
+  static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+                         int inner_thresh, int hev_thresh);
+  static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+                           int inner_thresh, int hev_thresh);
+};
+
+inline void AdjustThresholds(const int bitdepth, int* const outer_thresh,
+                             int* const inner_thresh, int* const hev_thresh) {
+  assert(*outer_thresh >= 7 && *outer_thresh <= 3 * kMaxLoopFilterValue + 4);
+  assert(*inner_thresh >= 1 && *inner_thresh <= kMaxLoopFilterValue);
+  assert(*hev_thresh >= 0 && *hev_thresh <= 3);
+  *outer_thresh <<= bitdepth - 8;
+  *inner_thresh <<= bitdepth - 8;
+  *hev_thresh <<= bitdepth - 8;
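+  // e.g. at 10bpp, an 8-bit-domain hev_thresh of 3 is scaled to 3 << 2 = 12.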
+}
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter4(const Pixel* p, ptrdiff_t step, int outer_thresh,
+                         int inner_thresh) {
+  const int p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step];
+  return std::abs(p1 - p0) <= inner_thresh &&
+         std::abs(q1 - q0) <= inner_thresh &&
+         std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool Hev(const Pixel* p, ptrdiff_t step, int thresh) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return (std::abs(p1 - p0) > thresh) || (std::abs(q1 - q0) > thresh);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 2 pixels out.
+template <int bitdepth, typename Pixel>
+inline void Filter2_C(Pixel* p, ptrdiff_t step) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int min_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+  const int max_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+  // 8bpp: [-893,892], 10bpp: [-3581,3580], 12bpp: [-14333,14332]
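+  // (derivation at 8bpp: 3 * 255 + 127 = 892 and 3 * (-255) - 128 = -893,
+  // with the p1 - q1 term clamped to the signed pixel range)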
+  const int a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+  // 8bpp: [-16,15], 10bpp: [-64,63], 12bpp: [-256,255]
+  const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+  p[-step] = Clip3(p0 + a2, 0, max_unsigned_val);
+  p[0] = Clip3(q0 - a1, 0, max_unsigned_val);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 4 pixels out.
+template <int bitdepth, typename Pixel>
+inline void Filter4_C(Pixel* p, ptrdiff_t step) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0);
+  const int min_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+  const int max_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+  const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int a3 = (a1 + 1) >> 1;
+  const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+  p[-2 * step] = Clip3(p1 + a3, 0, max_unsigned_val);
+  p[-1 * step] = Clip3(p0 + a2, 0, max_unsigned_val);
+  p[0 * step] = Clip3(q0 - a1, 0, max_unsigned_val);
+  p[1 * step] = Clip3(q1 - a3, 0, max_unsigned_val);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical4(void* dest, ptrdiff_t stride,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter4(dst, 1, outer_thresh, inner_thresh)) {
+      if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal4(void* dest,
+                                                     ptrdiff_t stride,
+                                                     int outer_thresh,
+                                                     int inner_thresh,
+                                                     int hev_thresh) {
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter4(dst, stride, outer_thresh, inner_thresh)) {
+      if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter6(const Pixel* p, ptrdiff_t step, int outer_thresh,
+                         int inner_thresh) {
+  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+  return std::abs(p2 - p1) <= inner_thresh &&
+         std::abs(p1 - p0) <= inner_thresh &&
+         std::abs(q1 - q0) <= inner_thresh &&
+         std::abs(q2 - q1) <= inner_thresh &&
+         std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat3(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+  return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+         std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter6(int filter_value) {
+  return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 6 pixels in, 4 pixels out.
+template <typename Pixel>
+inline void Filter6_C(Pixel* p, ptrdiff_t step) {
+  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+  const int a1 = 2 * p1;
+  const int a0 = 2 * p0;
+  const int b0 = 2 * q0;
+  const int b1 = 2 * q1;
+  // The max is 8 * max_pixel + 4 for the rounder.
+  // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+  p[-2 * step] = ApplyFilter6<Pixel>(3 * p2 + a1 + a0 + q0);
+  p[-1 * step] = ApplyFilter6<Pixel>(p2 + a1 + a0 + b0 + q1);
+  p[0 * step] = ApplyFilter6<Pixel>(p1 + a0 + b0 + b1 + q2);
+  p[1 * step] = ApplyFilter6<Pixel>(p0 + b0 + b1 + 3 * q2);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical6(void* dest, ptrdiff_t stride,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter6(dst, 1, outer_thresh, inner_thresh)) {
+      if (IsFlat3(dst, 1, flat_thresh)) {
+        Filter6_C(dst, 1);
+      } else if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal6(void* dest,
+                                                     ptrdiff_t stride,
+                                                     int outer_thresh,
+                                                     int inner_thresh,
+                                                     int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter6(dst, stride, outer_thresh, inner_thresh)) {
+      if (IsFlat3(dst, stride, flat_thresh)) {
+        Filter6_C(dst, stride);
+      } else if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter8(const Pixel* p, ptrdiff_t step, int outer_thresh,
+                         int inner_thresh) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  return std::abs(p3 - p2) <= inner_thresh &&
+         std::abs(p2 - p1) <= inner_thresh &&
+         std::abs(p1 - p0) <= inner_thresh &&
+         std::abs(q1 - q0) <= inner_thresh &&
+         std::abs(q2 - q1) <= inner_thresh &&
+         std::abs(q3 - q2) <= inner_thresh &&
+         std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+         std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh &&
+         std::abs(p3 - p0) <= flat_thresh && std::abs(q3 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter8(int filter_value) {
+  return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 8 pixels in, 6 pixels out.
+template <typename Pixel>
+inline void Filter8_C(Pixel* p, ptrdiff_t step) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  // The max is 8 * max_pixel + 4 for the rounder.
+  // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+  p[-3 * step] = ApplyFilter8<Pixel>(3 * p3 + 2 * p2 + p1 + p0 + q0);
+  p[-2 * step] = ApplyFilter8<Pixel>(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1);
+  p[-1 * step] = ApplyFilter8<Pixel>(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2);
+  p[0 * step] = ApplyFilter8<Pixel>(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3);
+  p[1 * step] = ApplyFilter8<Pixel>(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3);
+  p[2 * step] = ApplyFilter8<Pixel>(p0 + q0 + q1 + 2 * q2 + 3 * q3);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical8(void* dest, ptrdiff_t stride,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, 1, flat_thresh)) {
+        Filter8_C(dst, 1);
+      } else if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal8(void* dest,
+                                                     ptrdiff_t stride,
+                                                     int outer_thresh,
+                                                     int inner_thresh,
+                                                     int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, stride, flat_thresh)) {
+        Filter8_C(dst, stride);
+      } else if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlatOuter4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+  const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+  return std::abs(p4 - p0) <= flat_thresh && std::abs(q4 - q0) <= flat_thresh &&
+         std::abs(p5 - p0) <= flat_thresh && std::abs(q5 - q0) <= flat_thresh &&
+         std::abs(p6 - p0) <= flat_thresh && std::abs(q6 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter14(int filter_value) {
+  return static_cast<Pixel>(RightShiftWithRounding(filter_value, 4));
+}
+
+// 7.14.6.4.
+// 14 pixels in, 12 pixels out.
+template <typename Pixel>
+inline void Filter14_C(Pixel* p, ptrdiff_t step) {
+  const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+            p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step],
+            q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+  // The max is 16 * max_pixel + 8 for the rounder.
+  // 8bpp: 4088 (12 bits), 10bpp: 16376 (14 bits), 12bpp: 65528 (16 bits)
+  p[-6 * step] =
+      ApplyFilter14<Pixel>(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0);
+  p[-5 * step] = ApplyFilter14<Pixel>(p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 +
+                                      p1 + p0 + q0 + q1);
+  p[-4 * step] = ApplyFilter14<Pixel>(p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 +
+                                      p1 + p0 + q0 + q1 + q2);
+  p[-3 * step] = ApplyFilter14<Pixel>(p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 +
+                                      p1 * 2 + p0 + q0 + q1 + q2 + q3);
+  p[-2 * step] = ApplyFilter14<Pixel>(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+                                      p0 * 2 + q0 + q1 + q2 + q3 + q4);
+  p[-1 * step] = ApplyFilter14<Pixel>(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+                                      q0 * 2 + q1 + q2 + q3 + q4 + q5);
+  p[0 * step] = ApplyFilter14<Pixel>(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+                                     q1 * 2 + q2 + q3 + q4 + q5 + q6);
+  p[1 * step] = ApplyFilter14<Pixel>(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+                                     q2 * 2 + q3 + q4 + q5 + q6 * 2);
+  p[2 * step] = ApplyFilter14<Pixel>(p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+                                     q3 * 2 + q4 + q5 + q6 * 3);
+  p[3 * step] = ApplyFilter14<Pixel>(p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+                                     q4 * 2 + q5 + q6 * 4);
+  p[4 * step] = ApplyFilter14<Pixel>(p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+                                     q5 * 2 + q6 * 5);
+  p[5 * step] =
+      ApplyFilter14<Pixel>(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical14(void* dest,
+                                                    ptrdiff_t stride,
+                                                    int outer_thresh,
+                                                    int inner_thresh,
+                                                    int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, 1, flat_thresh)) {
+        if (IsFlatOuter4(dst, 1, flat_thresh)) {
+          Filter14_C(dst, 1);
+        } else {
+          Filter8_C(dst, 1);
+        }
+      } else if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal14(void* dest,
+                                                      ptrdiff_t stride,
+                                                      int outer_thresh,
+                                                      int inner_thresh,
+                                                      int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, stride, flat_thresh)) {
+        if (IsFlatOuter4(dst, stride, flat_thresh)) {
+          Filter14_C(dst, stride);
+        } else {
+          Filter8_C(dst, stride);
+        }
+      } else if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+using Defs8bpp = LoopFilterFuncs_C<8, uint8_t>;
+
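+// When LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is not defined, each C function below
+// is installed only if the matching LIBGAV1_Dsp8bpp_* macro is undefined;
+// those macros are defined by the corresponding SIMD headers when a
+// specialized version exists, keeping unused C fallbacks out of the table.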
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal4;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical4;
+
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal6;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical6;
+
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal8;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical8;
+
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal14;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical14;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical14;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using Defs10bpp = LoopFilterFuncs_C<10, uint16_t>;
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal4;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical4;
+
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal6;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical6;
+
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal8;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical8;
+
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal14;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical14;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical14;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using Defs12bpp = LoopFilterFuncs_C<12, uint16_t>;
+
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal4;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical4;
+
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal6;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical6;
+
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal8;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical8;
+
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal14;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical14;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical14;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void LoopFilterInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+  // Local functions that may be unused depending on the optimizations
+  // available.
+  static_cast<void>(AdjustThresholds);
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/loop_filter.h b/src/dsp/loop_filter.h
new file mode 100644 (file)
index 0000000..1ddad71
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+#define LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/loop_filter_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/loop_filter_sse4.h"
+// clang-format on
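+
+// As an illustration (the real definitions live in the headers above), an
+// architecture header reports that it implements an entry with a define such
+// as:
+//   #define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+//       LIBGAV1_CPU_NEON
+// and LoopFilterInit_C() then skips the C fallback for that entry.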
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters. This function is not thread-safe.
+void LoopFilterInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_LOOP_FILTER_H_
diff --git a/src/dsp/loop_filter_test.cc b/src/dsp/loop_filter_test.cc
new file mode 100644 (file)
index 0000000..93a273a
--- /dev/null
@@ -0,0 +1,409 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Horizontal and Vertical need 32x32: 8  pixels preceding filtered section
+//                                     16 pixels within filtered section
+//                                     8  pixels following filtered section
+constexpr int kNumPixels = 1024;
+constexpr int kBlockStride = 32;
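+
+// The functions under test are pointed at the center of this block, e.g.
+// |dst + 8 + kBlockStride * 8| below, which skips the 8 guard pixels that
+// precede the filtered section in each dimension.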
+
+constexpr int kNumTests = 50000;
+constexpr int kNumSpeedTests = 500000;
+
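+// Fills the test block with short runs whose steps stay within
+// |inner_thresh| of the previous value (first along rows, then along
+// columns), so that the flatness and "needs filter" decisions are both
+// exercised; purely random data would rarely take the flat paths.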
+template <typename Pixel>
+void InitInput(Pixel* dst, const int stride, const int bitdepth,
+               libvpx_test::ACMRandom& rnd, const uint8_t inner_thresh,
+               const bool transpose) {
+  const int max_pixel = (1 << bitdepth) - 1;
+  const int pixel_range = max_pixel + 1;
+  Pixel tmp[kNumPixels];
+  auto clip_pixel = [max_pixel](int val) {
+    return static_cast<Pixel>(std::max(std::min(val, max_pixel), 0));
+  };
+
+  for (int i = 0; i < kNumPixels;) {
+    const uint8_t val = rnd.Rand8();
+    if (val & 0x80) {  // 50% chance to choose a new value.
+      tmp[i++] = rnd(pixel_range);
+    } else {  // 50% chance to repeat previous value in row X times.
+      int j = 0;
+      while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) {
+        if (i < 1) {
+          tmp[i] = rnd(pixel_range);
+        } else if (val & 0x20) {  // Increment by a value within the limit.
+          tmp[i] = clip_pixel(tmp[i - 1] + (inner_thresh - 1));
+        } else {  // Decrement by a value within the limit.
+          tmp[i] = clip_pixel(tmp[i - 1] - (inner_thresh - 1));
+        }
+        ++i;
+      }
+    }
+  }
+
+  for (int i = 0; i < kNumPixels;) {
+    const uint8_t val = rnd.Rand8();
+    if (val & 0x80) {
+      ++i;
+    } else {  // 50% chance to repeat previous value in column X times.
+      int j = 0;
+      while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) {
+        if (i < 1) {
+          tmp[i] = rnd(pixel_range);
+        } else if (val & 0x20) {  // Increment by a value within the limit.
+          tmp[(i % 32) * 32 + i / 32] = clip_pixel(
+              tmp[((i - 1) % 32) * 32 + (i - 1) / 32] + (inner_thresh - 1));
+        } else {  // Decrement by a value within the limit.
+          tmp[(i % 32) * 32 + i / 32] = clip_pixel(
+              tmp[((i - 1) % 32) * 32 + (i - 1) / 32] - (inner_thresh - 1));
+        }
+        ++i;
+      }
+    }
+  }
+
+  for (int i = 0; i < kNumPixels; ++i) {
+    const int offset = transpose ? stride * (i % stride) + i / stride : i;
+    dst[i] = tmp[offset];
+  }
+}
+
+template <int bitdepth, typename Pixel>
+class LoopFilterTest : public testing::TestWithParam<LoopFilterSize> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  LoopFilterTest() = default;
+  LoopFilterTest(const LoopFilterTest&) = delete;
+  LoopFilterTest& operator=(const LoopFilterTest&) = delete;
+  ~LoopFilterTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    LoopFilterInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    memcpy(base_loop_filters_, dsp->loop_filters[size_],
+           sizeof(base_loop_filters_));
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      memset(base_loop_filters_, 0, sizeof(base_loop_filters_));
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      LoopFilterInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      LoopFilterInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopFilterInit10bpp_NEON();
+#endif
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    memcpy(cur_loop_filters_, dsp->loop_filters[size_],
+           sizeof(cur_loop_filters_));
+
+    for (int i = 0; i < kNumLoopFilterTypes; ++i) {
+      // skip functions that haven't been specialized for this particular
+      // architecture.
+      if (cur_loop_filters_[i] == base_loop_filters_[i]) {
+        cur_loop_filters_[i] = nullptr;
+      }
+    }
+  }
+
+  // Check |digests| if non-NULL; otherwise print the filter timing.
+  void TestRandomValues(const char* const digests[kNumLoopFilterTypes],
+                        int num_runs) const;
+  void TestSaturatedValues() const;
+
+  const LoopFilterSize size_ = GetParam();
+  LoopFilterFunc base_loop_filters_[kNumLoopFilterTypes];
+  LoopFilterFunc cur_loop_filters_[kNumLoopFilterTypes];
+};
+
+template <int bitdepth, typename Pixel>
+void LoopFilterTest<bitdepth, Pixel>::TestRandomValues(
+    const char* const digests[kNumLoopFilterTypes], const int num_runs) const {
+  for (int i = 0; i < kNumLoopFilterTypes; ++i) {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    if (cur_loop_filters_[i] == nullptr) continue;
+
+    libvpx_test::MD5 md5_digest;
+    absl::Duration elapsed_time;
+    for (int n = 0; n < num_runs; ++n) {
+      Pixel dst[kNumPixels];
+      const auto outer_thresh = static_cast<uint8_t>(
+          rnd(3 * kMaxLoopFilterValue - 2) + 7);  // [7, 193].
+      const auto inner_thresh =
+          static_cast<uint8_t>(rnd(kMaxLoopFilterValue) + 1);  // [1, 63].
+      const auto hev_thresh =
+          static_cast<uint8_t>(rnd(kMaxLoopFilterValue + 1) >> 4);  // [0, 3].
+      InitInput(dst, kBlockStride, bitdepth, rnd, inner_thresh, (n & 1) == 0);
+
+      const absl::Time start = absl::Now();
+      cur_loop_filters_[i](dst + 8 + kBlockStride * 8, kBlockStride,
+                           outer_thresh, inner_thresh, hev_thresh);
+      elapsed_time += absl::Now() - start;
+
+      md5_digest.Add(reinterpret_cast<const uint8_t*>(dst), sizeof(dst));
+    }
+    if (digests == nullptr) {
+      const auto elapsed_time_us =
+          static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
+      printf("Mode %s[%25s]: %5d us\n",
+             ToString(static_cast<LoopFilterSize>(size_)),
+             ToString(static_cast<LoopFilterType>(i)), elapsed_time_us);
+    } else {
+      const std::string digest = md5_digest.Get();
+      printf("Mode %s[%25s]: MD5: %s\n",
+             ToString(static_cast<LoopFilterSize>(size_)),
+             ToString(static_cast<LoopFilterType>(i)), digest.c_str());
+      EXPECT_STREQ(digests[i], digest.c_str());
+    }
+  }
+}
+
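+// With every pixel saturated to the maximum value the block is perfectly
+// flat, so a correct filter must leave it bit-identical; overflow in the
+// wide-filter arithmetic would show up here first.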
+template <int bitdepth, typename Pixel>
+void LoopFilterTest<bitdepth, Pixel>::TestSaturatedValues() const {
+  Pixel dst[kNumPixels], ref[kNumPixels];
+  const auto value = static_cast<Pixel>((1 << bitdepth) - 1);
+  for (auto& r : dst) r = value;
+  memcpy(ref, dst, sizeof(dst));
+
+  for (int i = 0; i < kNumLoopFilterTypes; ++i) {
+    if (cur_loop_filters_[i] == nullptr) continue;
+    const int outer_thresh = 24;
+    const int inner_thresh = 8;
+    const int hev_thresh = 0;
+    cur_loop_filters_[i](dst + 8 + kBlockStride * 8, kBlockStride, outer_thresh,
+                         inner_thresh, hev_thresh);
+    ASSERT_TRUE(test_utils::CompareBlocks(ref, dst, kBlockStride, kBlockStride,
+                                          kBlockStride, kBlockStride, true))
+        << ToString(static_cast<LoopFilterType>(i))
+        << " output doesn't match reference";
+  }
+}
+
+//------------------------------------------------------------------------------
+
+using LoopFilterTest8bpp = LoopFilterTest<8, uint8_t>;
+
+const char* const* GetDigests8bpp(LoopFilterSize size) {
+  static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
+      "6ba725d697d6209cb36dd199b8ffb47a",
+      "7dbb20e456ed0501fb4e7954f49f5e18",
+  };
+  static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
+      "89bb757faa44298b7f6e9c1a67f455a5",
+      "be75d5a2fcd83709ff0845f7d83f7006",
+  };
+  static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
+      "b09137d68c7b4f8a8a15e33b4b69828f",
+      "ef8a7f1aa073805516d3518a82a5cfa4",
+  };
+  static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
+      "6a7bc061ace0888275af88093f82ca08",
+      "a957ddae005839aa41ba7691788b01e4",
+  };
+
+  switch (size) {
+    case kLoopFilterSize4:
+      return kDigestsSize4;
+    case kLoopFilterSize6:
+      return kDigestsSize6;
+    case kLoopFilterSize8:
+      return kDigestsSize8;
+    case kLoopFilterSize14:
+      return kDigestsSize14;
+    default:
+      ADD_FAILURE() << "Unknown loop filter size: " << size;
+      return nullptr;
+  }
+}
+
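+// Speed tests are disabled by default; they can be run manually, e.g. with
+// --gtest_also_run_disabled_tests --gtest_filter='*LoopFilterTest8bpp*'.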
+TEST_P(LoopFilterTest8bpp, DISABLED_Speed) {
+  TestRandomValues(nullptr, kNumSpeedTests);
+}
+
+TEST_P(LoopFilterTest8bpp, FixedInput) {
+  TestRandomValues(GetDigests8bpp(size_), kNumTests);
+}
+
+TEST_P(LoopFilterTest8bpp, SaturatedValues) { TestSaturatedValues(); }
+
+constexpr LoopFilterSize kLoopFilterSizes[] = {
+    kLoopFilterSize4, kLoopFilterSize6, kLoopFilterSize8, kLoopFilterSize14};
+
+INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest8bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest8bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest8bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+#endif
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using LoopFilterTest10bpp = LoopFilterTest<10, uint16_t>;
+
+const char* const* GetDigests10bpp(LoopFilterSize size) {
+  static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
+      "72e75c478bb130ff1ebfa75f3a70b1a2",
+      "f32d67b611080e0bf1a9d162ff47c133",
+  };
+  static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
+      "8aec73c60c87ac7cc6bc9cc5157a2795",
+      "0e4385d3a0cbb2b1551e05ad2b0f07fb",
+  };
+  static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
+      "85cb2928fae43e1a27b2fe1b78ba7534",
+      "d044fad9d7c64b93ecb60c88ac48e55f",
+  };
+  static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
+      "ebca95ec0db6efbac7ff7cbeabc0e6d0",
+      "754ffaf0ac26a5953a029653bb5dd275",
+  };
+
+  switch (size) {
+    case kLoopFilterSize4:
+      return kDigestsSize4;
+    case kLoopFilterSize6:
+      return kDigestsSize6;
+    case kLoopFilterSize8:
+      return kDigestsSize8;
+    case kLoopFilterSize14:
+      return kDigestsSize14;
+    default:
+      ADD_FAILURE() << "Unknown loop filter size: " << size;
+      return nullptr;
+  }
+}
+
+TEST_P(LoopFilterTest10bpp, DISABLED_Speed) {
+  TestRandomValues(nullptr, kNumSpeedTests);
+}
+
+TEST_P(LoopFilterTest10bpp, FixedInput) {
+  TestRandomValues(GetDigests10bpp(size_), kNumTests);
+}
+
+TEST_P(LoopFilterTest10bpp, SaturatedValues) { TestSaturatedValues(); }
+
+INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest10bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest10bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest10bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using LoopFilterTest12bpp = LoopFilterTest<12, uint16_t>;
+
+const char* const* GetDigests12bpp(LoopFilterSize size) {
+  static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
+      "a14599cbfe2daee633d556a15c47b1f6",
+      "1f0a0794832de1012e2fed6b1cb02e69",
+  };
+  static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
+      "c76b24a73139239db10f16f36e01a625",
+      "3f75d904e9dcb1886e84a0f03f60f31e",
+  };
+  static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
+      "57c6f0efe2ab3957f5500ca2a9670f37",
+      "caa1f90c2eb2b65b280d678f8fcf6be8",
+  };
+  static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
+      "0c58f7466c36c3f4a2c1b4aa1b80f0b3",
+      "63077978326e6dddb5b2c3bfe6d684f5",
+  };
+
+  switch (size) {
+    case kLoopFilterSize4:
+      return kDigestsSize4;
+    case kLoopFilterSize6:
+      return kDigestsSize6;
+    case kLoopFilterSize8:
+      return kDigestsSize8;
+    case kLoopFilterSize14:
+      return kDigestsSize14;
+    default:
+      ADD_FAILURE() << "Unknown loop filter size: " << size;
+      return nullptr;
+  }
+}
+
+TEST_P(LoopFilterTest12bpp, DISABLED_Speed) {
+  TestRandomValues(nullptr, kNumSpeedTests);
+}
+
+TEST_P(LoopFilterTest12bpp, FixedInput) {
+  TestRandomValues(GetDigests12bpp(size_), kNumTests);
+}
+
+TEST_P(LoopFilterTest12bpp, SaturatedValues) { TestSaturatedValues(); }
+
+INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest12bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+static std::ostream& operator<<(std::ostream& os, const LoopFilterSize size) {
+  return os << ToString(size);
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/loop_restoration.cc b/src/dsp/loop_restoration.cc
new file mode 100644 (file)
index 0000000..eb8052c
--- /dev/null
@@ -0,0 +1,975 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Section 7.17.3.
+// a2: range [1, 256].
+// if (z >= 255)
+//   a2 = 256;
+// else if (z == 0)
+//   a2 = 1;
+// else
+//   a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
+// ma = 256 - a2;
+alignas(16) const uint8_t kSgrMaLookup[256] = {
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
+    13,  13,  12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,  8,  8,  7,  7,
+    7,   7,   7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,
+    5,   5,   4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+    4,   3,   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,   3,   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,
+    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,   2,   2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,   1,   1,  0};
+
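+// A minimal sketch (not used by the library) of how the table above can be
+// regenerated from the formula in the comment; like the table, it assumes
+// kSgrProjSgrBits == 8, so that ma = 256 - a2.
+inline uint8_t SgrMaLookupValue(const uint32_t z) {
+  uint32_t a2;
+  if (z >= 255) {
+    a2 = 256;
+  } else if (z == 0) {
+    a2 = 1;
+  } else {
+    a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
+  }
+  return static_cast<uint8_t>((1 << kSgrProjSgrBits) - a2);
+}
+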
+namespace {
+
+template <int bitdepth, typename Pixel>
+inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride,
+                             const int width, const int height,
+                             const int16_t* const filter,
+                             const int number_zero_coefficients,
+                             int16_t** wiener_buffer) {
+  constexpr int kCenterTap = kWienerFilterTaps / 2;
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int offset =
+      1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
+  for (int y = 0; y < height; ++y) {
+    int x = 0;
+    do {
+      // |sum| fits into 16 bits only when bitdepth = 8.
+      int sum = 0;
+      for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+        sum +=
+            filter[k] * (source[x + k] + source[x + kWienerFilterTaps - 1 - k]);
+      }
+      sum += filter[kCenterTap] * source[x + kCenterTap];
+      const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
+      (*wiener_buffer)[x] = Clip3(rounded_sum, -offset, limit - offset);
+    } while (++x != width);
+    source += source_stride;
+    *wiener_buffer += width;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+inline void WienerVertical(const int16_t* wiener_buffer, const int width,
+                           const int height, const int16_t* const filter,
+                           const int number_zero_coefficients, void* const dest,
+                           const ptrdiff_t dest_stride) {
+  constexpr int kCenterTap = kWienerFilterTaps / 2;
+  constexpr int kRoundBitsVertical =
+      (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+  auto* dst = static_cast<Pixel*>(dest);
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      // |sum| needs 32 bits.
+      int sum = 0;
+      for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+        sum += filter[k] *
+               (wiener_buffer[k * width + x] +
+                wiener_buffer[(kWienerFilterTaps - 1 - k) * width + x]);
+      }
+      sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x];
+      const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+      dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
+    } while (++x != width);
+    wiener_buffer += width;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+// Note: bit range for the Wiener filter.
+// The Wiener filter process first applies horizontal filtering to the input
+// pixels, followed by rounding with predefined bits (dependent on bitdepth).
+// Then vertical filtering is applied, followed by rounding (also dependent
+// on bitdepth).
+// The process is the same as convolution:
+// <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter>
+// --> <rounding 1>
+// By design:
+// (a). Horizontal/vertical filtering adds 7 bits to the input.
+// (b). The output of the first rounding fits into 16 bits.
+// (c). The output of the second rounding fits into 16 bits.
+// If the input bitdepth > 8, the accumulator of the horizontal filter is
+// larger than 16 bits and smaller than 32 bits.
+// The accumulator of the vertical filter is larger than 16 bits and smaller
+// than 32 bits.
+// Note: range of wiener filter coefficients.
+// Wiener filter coefficients are symmetric, and their sum is 1 (128).
+// The range of each coefficient:
+// filter[0] = filter[6], 4 bits, min = -5, max = 10.
+// filter[1] = filter[5], 5 bits, min = -23, max = 8.
+// filter[2] = filter[4], 6 bits, min = -17, max = 46.
+// filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]).
+// The difference from libaom is that in libaom:
+// filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
+// Thus in libaom's computation, an offset of 128 is needed for filter[3].
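+// As a worked example of the ranges above: with bitdepth 8 the horizontal
+// pass (see WienerHorizontal) rounds by kInterRoundBitsHorizontal (3), so
+// offset = 1 << (8 + 7 - 3 - 1) = 2048, limit = (offset << 2) - 1 = 8191,
+// and the buffered intermediates are clipped to [-2048, 6143], which fits
+// comfortably in int16_t.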
+template <int bitdepth, typename Pixel>
+void WienerFilter_C(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  constexpr int kCenterTap = kWienerFilterTaps / 2;
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  int16_t* const wiener_buffer_org = restoration_buffer->wiener_buffer;
+
+  // horizontal filtering.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const int16_t* const filter_horizontal =
+      restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+  const auto* src = static_cast<const Pixel*>(source) - kCenterTap;
+  const auto* top = static_cast<const Pixel*>(top_border) - kCenterTap;
+  const auto* bottom = static_cast<const Pixel*>(bottom_border) - kCenterTap;
+  auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
+
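+  // The four branches below are identical except for the
+  // number_zero_coefficients argument; they are kept unrolled, presumably so
+  // that the inlined WienerHorizontal loop sees a compile-time constant
+  // bound.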
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 0, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 0, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 0,
+                                      &wiener_buffer);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 1, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 1, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 1,
+                                      &wiener_buffer);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 2, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 2, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 2,
+                                      &wiener_buffer);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 3, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 3, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 3,
+                                      &wiener_buffer);
+  }
+
+  // vertical filtering.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer, wiener_buffer - width,
+           sizeof(*wiener_buffer) * width);
+    memcpy(wiener_buffer_org, wiener_buffer_org + width,
+           sizeof(*wiener_buffer) * width);
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 0, dest, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 1, dest, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 2, dest, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 3, dest, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
+template <typename Pixel, int size>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+                                  const int height, const int width,
+                                  uint16_t* const* sums,
+                                  uint32_t* const* square_sums) {
+  int y = height;
+  do {
+    uint32_t sum = 0;
+    uint32_t square_sum = 0;
+    for (int dx = 0; dx < size; ++dx) {
+      const Pixel source = src[dx];
+      sum += source;
+      square_sum += source * source;
+    }
+    (*sums)[0] = sum;
+    (*square_sums)[0] = square_sum;
+    int x = 1;
+    do {
+      const Pixel source0 = src[x - 1];
+      const Pixel source1 = src[x - 1 + size];
+      sum -= source0;
+      sum += source1;
+      square_sum -= source0 * source0;
+      square_sum += source1 * source1;
+      (*sums)[x] = sum;
+      (*square_sums)[x] = square_sum;
+    } while (++x != width);
+    src += src_stride;
+    ++sums;
+    ++square_sums;
+  } while (--y != 0);
+}
+
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
+template <typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+                                  const int height, const int width,
+                                  uint16_t* const* sum3, uint16_t* const* sum5,
+                                  uint32_t* const* square_sum3,
+                                  uint32_t* const* square_sum5) {
+  int y = height;
+  do {
+    uint32_t sum = 0;
+    uint32_t square_sum = 0;
+    for (int dx = 0; dx < 4; ++dx) {
+      const Pixel source = src[dx];
+      sum += source;
+      square_sum += source * source;
+    }
+    int x = 0;
+    do {
+      const Pixel source0 = src[x];
+      const Pixel source1 = src[x + 4];
+      sum -= source0;
+      square_sum -= source0 * source0;
+      (*sum3)[x] = sum;
+      (*square_sum3)[x] = square_sum;
+      sum += source1;
+      square_sum += source1 * source1;
+      (*sum5)[x] = sum + source0;
+      (*square_sum5)[x] = square_sum + source0 * source0;
+    } while (++x != width);
+    src += src_stride;
+    ++sum3;
+    ++sum5;
+    ++square_sum3;
+    ++square_sum5;
+  } while (--y != 0);
+}
+
+template <int bitdepth, int n>
+inline void CalculateIntermediate(const uint32_t s, uint32_t a,
+                                  const uint32_t b, uint8_t* const ma_ptr,
+                                  uint32_t* const b_ptr) {
+  // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
+  // Since max bitdepth = 12, max < 2^31.
+  // After shift, a < 2^16 * n < 2^22 regardless of bitdepth.
+  a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
+  // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
+  // d < 2^8 * n < 2^14 regardless of bitdepth.
+  const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
+  // p: Each term in calculating p = a * n - d * d is < 2^16 * n^2 < 2^28,
+  // and p itself satisfies p < 2^14 * n^2 < 2^26.
+  // This bound on p is due to:
+  // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+  // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b.
+  // This is an artifact of rounding, and can only happen if all pixels
+  // are (almost) identical, so in this case we saturate to p=0.
+  const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
+  // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
+  // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
+  // (this holds even after accounting for the rounding in s)
+  const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
+  // ma: range [0, 255].
+  const uint32_t ma = kSgrMaLookup[std::min(z, 255u)];
+  const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+  // ma < 2^8, b < 2^(bitdepth) * n,
+  // one_over_n = round(2^12 / n)
+  // => the product here is < 2^(20 + bitdepth) <= 2^32,
+  // and b is set to a value < 2^(8 + bitdepth).
+  // This holds even with the rounding in one_over_n and in the overall result,
+  // as long as ma is strictly less than 2^8.
+  const uint32_t b2 = ma * b * one_over_n;
+  *ma_ptr = ma;
+  *b_ptr = RightShiftWithRounding(b2, kSgrProjReciprocalBits);
+}
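+
+// Worked example: a perfectly flat 5x5 window of value v (n = 25) at
+// bitdepth 8 gives a = 25 * v * v and b = d = 25 * v, so p = 0 and z = 0;
+// kSgrMaLookup[0] = 255 then selects the strongest smoothing toward the box
+// mean, as expected where there is no local variance.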
+
+template <typename T>
+inline uint32_t Sum343(const T* const src) {
+  return 3 * (src[0] + src[2]) + 4 * src[1];
+}
+
+template <typename T>
+inline uint32_t Sum444(const T* const src) {
+  return 4 * (src[0] + src[1] + src[2]);
+}
+
+template <typename T>
+inline uint32_t Sum565(const T* const src) {
+  return 5 * (src[0] + src[2]) + 6 * src[1];
+}
+
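+// The digit names encode the 3-tap weight patterns used when combining
+// neighboring sums in the self-guided filter: 5-6-5 for the radius-2 pass
+// and alternating 3-4-3 / 4-4-4 for the radius-1 pass.
+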
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+    const int width, const uint32_t s, SgrBuffer* const sgr_buffer,
+    uint16_t* const ma565, uint32_t* const b565) {
+  int x = 0;
+  do {
+    uint32_t a = 0;
+    uint32_t b = 0;
+    for (int dy = 0; dy < 5; ++dy) {
+      a += square_sum5[dy][x];
+      b += sum5[dy][x];
+    }
+    CalculateIntermediate<bitdepth, 25>(s, a, b, sgr_buffer->ma + x,
+                                        sgr_buffer->b + x);
+  } while (++x != width + 2);
+  x = 0;
+  do {
+    ma565[x] = Sum565(sgr_buffer->ma + x);
+    b565[x] = Sum565(sgr_buffer->b + x);
+  } while (++x != width);
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint16_t* const sum3[3], const uint32_t* const square_sum3[3],
+    const int width, const uint32_t s, const bool calculate444,
+    SgrBuffer* const sgr_buffer, uint16_t* const ma343, uint32_t* const b343,
+    uint16_t* const ma444, uint32_t* const b444) {
+  int x = 0;
+  do {
+    uint32_t a = 0;
+    uint32_t b = 0;
+    for (int dy = 0; dy < 3; ++dy) {
+      a += square_sum3[dy][x];
+      b += sum3[dy][x];
+    }
+    CalculateIntermediate<bitdepth, 9>(s, a, b, sgr_buffer->ma + x,
+                                       sgr_buffer->b + x);
+  } while (++x != width + 2);
+  x = 0;
+  do {
+    ma343[x] = Sum343(sgr_buffer->ma + x);
+    b343[x] = Sum343(sgr_buffer->b + x);
+  } while (++x != width);
+  if (calculate444) {
+    x = 0;
+    do {
+      ma444[x] = Sum444(sgr_buffer->ma + x);
+      b444[x] = Sum444(sgr_buffer->b + x);
+    } while (++x != width);
+  }
+}
+
+template <typename Pixel>
+inline int CalculateFilteredOutput(const Pixel src, const uint32_t ma,
+                                   const uint32_t b, const int shift) {
+  const int32_t v = b - ma * src;
+  return RightShiftWithRounding(v,
+                                kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <typename Pixel>
+inline void BoxFilterPass1Kernel(const Pixel src0, const Pixel src1,
+                                 const uint16_t* const ma565[2],
+                                 const uint32_t* const b565[2],
+                                 const ptrdiff_t x, int p[2]) {
+  p[0] = CalculateFilteredOutput<Pixel>(src0, ma565[0][x] + ma565[1][x],
+                                        b565[0][x] + b565[1][x], 5);
+  p[1] = CalculateFilteredOutput<Pixel>(src1, ma565[1][x], b565[1][x], 4);
+}
+
+template <typename Pixel>
+inline int BoxFilterPass2Kernel(const Pixel src, const uint16_t* const ma343[3],
+                                const uint16_t* const ma444,
+                                const uint32_t* const b343[3],
+                                const uint32_t* const b444, const ptrdiff_t x) {
+  const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x];
+  const uint32_t b = b343[0][x] + b444[x] + b343[2][x];
+  return CalculateFilteredOutput<Pixel>(src, ma, b, 5);
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedFinal(const int src, const int v) {
+  // If radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is:
+  // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
+  // Then, the range of s is bitdepth + 2 bits. This is a rough estimate,
+  // taking the maximum value of each element.
+  const int s = src + RightShiftWithRounding(
+                          v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedDoubleMultiplier(const int src, const int filter0,
+                                        const int filter1, const int16_t w0,
+                                        const int16_t w2) {
+  const int v = w0 * filter0 + w2 * filter1;
+  return SelfGuidedFinal<bitdepth, Pixel>(src, v);
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedSingleMultiplier(const int src, const int filter,
+                                        const int16_t w0) {
+  const int v = w0 * filter;
+  return SelfGuidedFinal<bitdepth, Pixel>(src, v);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass1(const Pixel* const src, const ptrdiff_t stride,
+                           uint16_t* const sum5[5],
+                           uint32_t* const square_sum5[5], const int width,
+                           const uint32_t scale, const int16_t w0,
+                           SgrBuffer* const sgr_buffer,
+                           uint16_t* const ma565[2], uint32_t* const b565[2],
+                           Pixel* dst) {
+  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+                                 ma565[1], b565[1]);
+  int x = 0;
+  do {
+    int p[2];
+    BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p);
+    dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
+    dst[stride + x] =
+        SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[stride + x], p[1], w0);
+  } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass2(const Pixel* const src, const Pixel* const src0,
+                           const int width, const uint16_t scale,
+                           const int16_t w0, uint16_t* const sum3[4],
+                           uint32_t* const square_sum3[4],
+                           SgrBuffer* const sgr_buffer,
+                           uint16_t* const ma343[4], uint16_t* const ma444[3],
+                           uint32_t* const b343[4], uint32_t* const b444[3],
+                           Pixel* dst) {
+  BoxSum<Pixel, 3>(src0, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+                                 sgr_buffer, ma343[2], b343[2], ma444[1],
+                                 b444[1]);
+  int x = 0;
+  do {
+    const int p =
+        BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+    dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+  } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride,
+                      uint16_t* const sum3[4], uint16_t* const sum5[5],
+                      uint32_t* const square_sum3[4],
+                      uint32_t* const square_sum5[5], const int width,
+                      const uint16_t scales[2], const int16_t w0,
+                      const int16_t w2, SgrBuffer* const sgr_buffer,
+                      uint16_t* const ma343[4], uint16_t* const ma444[3],
+                      uint16_t* const ma565[2], uint32_t* const b343[4],
+                      uint32_t* const b444[3], uint32_t* const b565[2],
+                      Pixel* dst) {
+  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+                                 sgr_buffer, ma565[1], b565[1]);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
+                                 sgr_buffer, ma343[2], b343[2], ma444[1],
+                                 b444[1]);
+  BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+                                 true, sgr_buffer, ma343[3], b343[3], ma444[2],
+                                 b444[2]);
+  int x = 0;
+  do {
+    int p[2][2];
+    BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p[0]);
+    p[1][0] =
+        BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+    p[1][1] = BoxFilterPass2Kernel<Pixel>(src[stride + x], ma343 + 1, ma444[1],
+                                          b343 + 1, b444[1], x);
+    dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
+                                                         p[1][0], w0, w2);
+    dst[stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
+        src[stride + x], p[0][1], p[1][1], w0, w2);
+  } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
+                             const Pixel* src, const ptrdiff_t stride,
+                             const Pixel* const top_border,
+                             const ptrdiff_t top_border_stride,
+                             const Pixel* bottom_border,
+                             const ptrdiff_t bottom_border_stride,
+                             const int width, const int height,
+                             SgrBuffer* const sgr_buffer, Pixel* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 8);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
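+  // These arrays hold rotating row pointers into |sgr_buffer|; the
+  // Circulate*PointersBy2 and std::swap calls below slide the window down
+  // the image so each iteration reuses the sums computed for earlier rows.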
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum<Pixel>(top_border, top_border_stride, 2, width + 2, sum3, sum5 + 1,
+                square_sum3, square_sum5 + 1);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+                square_sum5 + 3);
+  const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSum<Pixel>(s, 0, 1, width + 2, sum3 + 3, sum5 + 4, square_sum3 + 3,
+                square_sum5 + 4);
+  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+                                 sgr_buffer, ma565[0], b565[0]);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
+                                 sgr_buffer, ma343[0], b343[0], nullptr,
+                                 nullptr);
+  BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+                                 true, sgr_buffer, ma343[1], b343[1], ma444[0],
+                                 b444[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxSum<Pixel>(src + 2 * stride, stride, 2, width + 2, sum3 + 2, sum5 + 3,
+                  square_sum3 + 2, square_sum5 + 3);
+    BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+                               square_sum5, width, scales, w0, w2, sgr_buffer,
+                               ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const Pixel* sr;
+    ptrdiff_t s_stride;
+    if ((height & 1) == 0) {
+      sr = bottom_border;
+      s_stride = bottom_border_stride;
+    } else {
+      sr = src + 2 * stride;
+      s_stride = bottom_border - (src + 2 * stride);
+    }
+    BoxSum<Pixel>(sr, s_stride, 2, width + 2, sum3 + 2, sum5 + 3,
+                  square_sum3 + 2, square_sum5 + 3);
+    BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+                               square_sum5, width, scales, w0, w2, sgr_buffer,
+                               ma343, ma444, ma565, b343, b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxSum<Pixel>(bottom_border + bottom_border_stride, bottom_border_stride, 1,
+                  width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+                  square_sum5 + 3);
+    sum5[4] = sum5[3];
+    square_sum5[4] = square_sum5[3];
+    BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+                                   sgr_buffer, ma565[1], b565[1]);
+    BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
+                                   sgr_buffer, ma343[2], b343[2], nullptr,
+                                   nullptr);
+    int x = 0;
+    do {
+      const int p0 = CalculateFilteredOutput<Pixel>(
+          src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
+      const int p1 = BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343,
+                                                 b444[0], x);
+      dst[x] =
+          SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p0, p1, w0, w2);
+    } while (++x != width);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const Pixel* src, const ptrdiff_t stride,
+                                  const Pixel* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const Pixel* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, Pixel* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 8);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<Pixel, 5>(top_border, top_border_stride, 2, width + 2, sum5 + 1,
+                   square_sum5 + 1);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3);
+  const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSum<Pixel, 5>(s, 0, 1, width + 2, sum5 + 4, square_sum5 + 4);
+  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+                                 ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxSum<Pixel, 5>(src + 2 * stride, stride, 2, width + 2, sum5 + 3,
+                     square_sum5 + 3);
+    BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+                                    scale, w0, sgr_buffer, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const Pixel* sr;
+    ptrdiff_t s_stride;
+    if ((height & 1) == 0) {
+      sr = bottom_border;
+      s_stride = bottom_border_stride;
+    } else {
+      sr = src + 2 * stride;
+      s_stride = bottom_border - (src + 2 * stride);
+    }
+    BoxSum<Pixel, 5>(sr, s_stride, 2, width + 2, sum5 + 3, square_sum5 + 3);
+    BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+                                    scale, w0, sgr_buffer, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxSum<Pixel, 5>(bottom_border + bottom_border_stride, bottom_border_stride,
+                     1, width + 2, sum5 + 3, square_sum5 + 3);
+    sum5[4] = sum5[3];
+    square_sum5[4] = square_sum5[3];
+    BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+                                   ma565[1], b565[1]);
+    int x = 0;
+    do {
+      const int p = CalculateFilteredOutput<Pixel>(
+          src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
+      dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+    } while (++x != width);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const Pixel* src, const ptrdiff_t stride,
+                                  const Pixel* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const Pixel* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, Pixel* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 8);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<Pixel, 3>(top_border, top_border_stride, 2, width + 2, sum3,
+                   square_sum3);
+  BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false,
+                                 sgr_buffer, ma343[0], b343[0], nullptr,
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const Pixel* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+                                 sgr_buffer, ma343[1], b343[1], ma444[0],
+                                 b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2<bitdepth, Pixel>(src + 2, src + 2 * stride, width, scale, w0,
+                                    sum3, square_sum3, sgr_buffer, ma343, ma444,
+                                    b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  src += 2;
+  int y = std::min(height, 2);
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2<bitdepth, Pixel>(src, bottom_border, width, scale, w0, sum3,
+                                    square_sum3, sgr_buffer, ma343, ma444, b343,
+                                    b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilter_C(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* src = static_cast<const Pixel*>(source);
+  const auto* top = static_cast<const Pixel*>(top_border);
+  const auto* bottom = static_cast<const Pixel*>(bottom_border);
+  auto* dst = static_cast<Pixel*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1<bitdepth, Pixel>(
+        restoration_info, src - 3, stride, top - 3, top_border_stride,
+        bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2<bitdepth, Pixel>(
+        restoration_info, src - 2, stride, top - 2, top_border_stride,
+        bottom - 2, bottom_border_stride, width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess<bitdepth, Pixel>(
+        restoration_info, src - 3, stride, top - 3, top_border_stride,
+        bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+  dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+  dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_WienerFilter
+  dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_SelfGuidedFilter
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void LoopRestorationInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/loop_restoration.h b/src/dsp/loop_restoration.h
new file mode 100644
index 0000000..8fefc40
--- /dev/null
+++ b/src/dsp/loop_restoration.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
+#define LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
+
+#include <cstdint>  // uint8_t in the kSgrMaLookup declaration.
+#include <utility>  // std::swap in the Circulate*Pointers helpers below.
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/loop_restoration_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/loop_restoration_avx2.h"
+#include "src/dsp/x86/loop_restoration_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+extern const uint8_t kSgrMaLookup[256];
+
+// Initializes Dsp::loop_restorations. This function is not thread-safe.
+void LoopRestorationInit_C();
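+// A typical call sequence from a caller's perspective (a sketch, not
+// normative; see src/dsp/dsp.h for the authoritative API):
+//   dsp::LoopRestorationInit_C();          // Install the C fallbacks.
+//   const Dsp* dsp = dsp::GetDspTable(8);  // 8bpp function table.
+//   LoopRestorationFunc wiener = dsp->loop_restorations[0];  // [1] is SGR.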
+
+template <typename T>
+void Circulate3PointersBy1(T* p[3]) {
+  T* const p0 = p[0];
+  p[0] = p[1];
+  p[1] = p[2];
+  p[2] = p0;
+}
+
+template <typename T>
+void Circulate4PointersBy2(T* p[4]) {
+  std::swap(p[0], p[2]);
+  std::swap(p[1], p[3]);
+}
+
+template <typename T>
+void Circulate5PointersBy2(T* p[5]) {
+  T* const p0 = p[0];
+  T* const p1 = p[1];
+  p[0] = p[2];
+  p[1] = p[3];
+  p[2] = p[4];
+  p[3] = p0;
+  p[4] = p1;
+}
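+
+// Illustration of the rotations above (pointer names are illustrative only):
+// given T* p[5] = {a, b, c, d, e}, Circulate5PointersBy2(p) leaves
+// p == {c, d, e, a, b}, moving the two oldest row buffers to the back for
+// reuse. Likewise, Circulate3PointersBy1 rotates {a, b, c} into {b, c, a}.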
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
diff --git a/src/dsp/loop_restoration_test.cc b/src/dsp/loop_restoration_test.cc
new file mode 100644
index 0000000..d6dcd9c
--- /dev/null
+++ b/src/dsp/loop_restoration_test.cc
@@ -0,0 +1,702 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// The dimensions below are in units of Pixel.
+constexpr int kBorder = 16;
+constexpr int kWidth = 256;
+constexpr int kHeight = 255;
+constexpr int kStride = kWidth + 2 * kBorder;
+constexpr int kOffset = kBorder * kStride + kBorder;
+constexpr int kMaxBlockSize = 288 * kStride;
+constexpr int kUnitWidths[] = {32, 64, 128, 256};
+
+constexpr int kNumRadiusTypes = 3;
+constexpr int kNumWienerOrders = 4;
+constexpr int kWienerOrders[] = {7, 5, 3, 1};
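+// Maps a Wiener filter order (1, 3, 5 or 7) to its position in kWienerOrders;
+// e.g. kWienerOrderIdLookup[7] == 0 and kWienerOrderIdLookup[1] == 3. Used to
+// index the digest tables in TestRandomValues().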
+constexpr int kWienerOrderIdLookup[] = {0, 3, 0, 2, 0, 1, 0, 0};
+
+template <int bitdepth, typename Pixel>
+class SelfGuidedFilterTest : public testing::TestWithParam<int>,
+                             public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  SelfGuidedFilterTest() = default;
+  SelfGuidedFilterTest(const SelfGuidedFilterTest&) = delete;
+  SelfGuidedFilterTest& operator=(const SelfGuidedFilterTest&) = delete;
+  ~SelfGuidedFilterTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    LoopRestorationInit_C();
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
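+      // LoopRestorationInit_C() above is all the C suites require.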
+    } else if (absl::StartsWith(test_case, "AVX2/")) {
+      if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+      LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_AVX2();
+#endif
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      LoopRestorationInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_SSE4_1();
+#endif
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      LoopRestorationInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_NEON();
+#endif
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    target_self_guided_filter_func_ = dsp->loop_restorations[1];
+    restoration_info_.type = kLoopRestorationTypeSgrProj;
+    memset(dst_, 0, sizeof(dst_));
+  }
+
+  void SetInputData(int type, Pixel value, int radius_index,
+                    libvpx_test::ACMRandom* rnd);
+  void TestFixedValues(int test_index, Pixel value);
+  void TestRandomValues(bool speed);
+
+ protected:
+  const int unit_width_ = GetParam();
+  const int unit_height_ = kRestorationUnitHeight;
+
+ private:
+  alignas(kMaxAlignment) Pixel src_[kMaxBlockSize];
+  alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize];
+  RestorationUnitInfo restoration_info_;
+  RestorationBuffer restoration_buffer_;
+  LoopRestorationFunc target_self_guided_filter_func_;
+};
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilterTest<bitdepth, Pixel>::SetInputData(
+    int type, Pixel value, int radius_index,
+    libvpx_test::ACMRandom* const rnd) {
+  const int mask = (1 << bitdepth) - 1;
+  if (type == 0) {  // Set fixed values
+    for (auto& s : src_) s = value;
+  } else {  // Set random values
+    for (auto& s : src_) s = rnd->Rand16() & mask;
+  }
+  for (auto& d : dst_) d = rnd->Rand16() & mask;
+  restoration_info_.sgr_proj_info.multiplier[0] =
+      kSgrProjMultiplierMin[0] +
+      rnd->PseudoUniform(kSgrProjMultiplierMax[0] - kSgrProjMultiplierMin[0] +
+                         1);
+  restoration_info_.sgr_proj_info.multiplier[1] =
+      kSgrProjMultiplierMin[1] +
+      rnd->PseudoUniform(kSgrProjMultiplierMax[1] - kSgrProjMultiplierMin[1] +
+                         1);
+  // Regulate the multipliers so that they match libaom. A valid self-guided
+  // filter doesn't allow r0 and r1 to be 0 at the same time. When r0 or r1 is
+  // zero, libaom sets the corresponding multiplier to zero.
+  int index;
+  if (radius_index == 0) {
+    index = 0;  // r0 = 2, r1 = 1
+  } else if (radius_index == 1) {
+    index = 10;  // r0 = 0, r1 = 1
+  } else /* if (radius_index == 2) */ {
+    index = 14;  // r0 = 2, r1 = 0
+  }
+  const uint8_t r0 = kSgrProjParams[index][0];
+  const uint8_t r1 = kSgrProjParams[index][2];
+  static constexpr int kMultiplier[2] = {0, 95};
+  restoration_info_.sgr_proj_info.index = index;
+  if (r0 == 0) {
+    restoration_info_.sgr_proj_info.multiplier[0] = kMultiplier[0];
+  } else if (r1 == 0) {
+    restoration_info_.sgr_proj_info.multiplier[1] = kMultiplier[1];
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilterTest<bitdepth, Pixel>::TestFixedValues(int test_index,
+                                                            Pixel value) {
+  static const char* const kDigest[][3][kNumRadiusTypes] = {
+      {{"7b78783ff4f03625a50c2ebfd574adca", "4faa0810639016f11a9f761ce28c38b0",
+        "a03314fc210bee68c7adbb44d2bbdac7"},
+       {"fce031d1339cfef5016e76a643538a71", "d439e1060de3f07b5b29c9b0b7c08e54",
+        "a6583fe9359877f4a259c81d900fc4fb"},
+       {"8f9b6944c8965f34d444a667da3b0ebe", "84fa62c491c67c3a435fd5140e7a4f82",
+        "d04b62d97228789e5c6928d40d5d900e"}},
+      {{"948ea16a90c4cefef87ce5b0ee105fc6", "76740629877b721432b84dbbdb4e352a",
+        "27100f37b3e42a5f2a051e1566edb6f8"},
+       {"dd320de3bc82f4ba69738b2190ea9f85", "bf82f271e30a1aca91e53b086e133fb3",
+        "69c274ac59c99999e1bfbf2fc4586ebd"},
+       {"86ff2318bf8a584b8d5edd710681d621", "f6e1c104a764d6766cc278d5b216855a",
+        "6d928703526ab114efba865ff5b11886"}},
+      {{"9fbf1b246011250f38532a543cc6dd74", "d5c1e0142390ebb51b075c49f8ee9ff4",
+        "92f31086ba2f9e1508983b22d93a4e5c"},
+       {"2198321e6b95e7199738e60f5ddc6966", "34f74626027ffca010c824ddf0942b13",
+        "43dd7df2c2a601262c68cd8af1c61b82"},
+       {"1ab6138c3a82ac8ccd840f0553fdfb58", "be3bf92633f7165d3ad9c327d2dd53fe",
+        "41115efff3adeb541e04db23faa22f23"}},
+      {{"42364ff8dbdbd6706fa3b8855a4258be", "a7843fdfd4d3c0d80ba812b353b4d6b4",
+        "f8a6a025827f29f857bed3e28ba3ea33"},
+       {"b83c1f8d7712e37f9b21b033822e37ed", "589daf2e3e6f8715873920515cfc1b42",
+        "20dcbe8e317a4373bebf11d56adc5f02"},
+       {"7971a60337fcdb662c92db051bd0bb41", "75f89f346c2a37bf0c6695c0482531e6",
+        "1595eeacd62cdce4d2fb094534c22c1e"}}};
+  if (target_self_guided_filter_func_ == nullptr) return;
+  ASSERT_LT(value, 1 << bitdepth);
+  constexpr int bd_index = (bitdepth - 8) / 2;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  const Pixel* const src = src_ + kOffset;
+  Pixel* const dst = dst_ + kOffset;
+  for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) {
+    SetInputData(0, value, radius_index, &rnd);
+    const absl::Time start = absl::Now();
+    for (int y = 0; y < kHeight; y += unit_height_) {
+      const int height = std::min(unit_height_, kHeight - y);
+      for (int x = 0; x < kWidth; x += unit_width_) {
+        const int width = std::min(unit_width_, kWidth - x);
+        const Pixel* const source = src + y * kStride + x;
+        target_self_guided_filter_func_(
+            restoration_info_, source, kStride,
+            source - kRestorationVerticalBorder * kStride, kStride,
+            source + height * kStride, kStride, width, height,
+            &restoration_buffer_, dst + y * kStride + x);
+      }
+    }
+    const absl::Duration elapsed_time = absl::Now() - start;
+    test_utils::CheckMd5Digest(
+        "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(),
+        kDigest[test_index][bd_index][radius_index], dst_ + kBorder * kStride,
+        kHeight * kStride * sizeof(*dst_), elapsed_time);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) {
+  static const char* const kDigest[][3][kNumRadiusTypes] = {
+      {{"9f8358ed820943fa0abe3a8ebb5887db", "fb5d48870165522341843bcbfa8674fb",
+        "ca67159cd29475ac5d52ca4a0df3ea10"},
+       {"a78641886ea0cf8757057d1d91e01434", "1b95172a5f2f9c514c78afa4cf8e5678",
+        "a8ba988283d9e1ad1f0dcdbf6bbdaade"},
+       {"d95e98d031f9ba290e5183777d1e4905", "f806853cfadb50e6dbd4898412b92934",
+        "741fbfdb79cda695afedda3d51dbb27f"}},
+      {{"f219b445e5c80ffb5dd0359cc2cb4dd4", "699b2c9ddca1cbb0d4fc24cbcbe951e9",
+        "a4005899fa8d3c3c4669910f93ff1290"},
+       {"10a75cab3c78b891c8c6d92d55f685d1", "d46f158f57c628136f6f298ee8ca6e0e",
+        "07203ad761775d5d317f2b7884afd9fe"},
+       {"76b9ef906090fa81af64cce3bba0a54a", "8eecc59acdef8953aa9a96648c0ccd2c",
+        "6e45a0ef60e0475f470dc93552047f07"}},
+      {{"000d4e382be4003b514c9135893d0a37", "8fb082dca975be363bfc9c2d317ae084",
+        "475bcb6a58f87da7723f6227bc2aca0e"},
+       {"4d589683f69ccc5b416149dcc5c835d5", "986b6832df1f6020d50be61ae121e42f",
+        "7cb5c5dbdb3d1c54cfa00def450842dc"},
+       {"0e3dc23150d18c9d366d15e174727311", "8495122917770d822f1842ceff987b03",
+        "4aeb9db902072cefd6af0aff8aaabd24"}},
+      {{"fd43bfe34d63614554dd29fb24b12173", "5c1ba74ba3062c769d5c3c86a85ac9b9",
+        "f1eda6d15b37172199d9949c2315832f"},
+       {"a11be3117fb77e8fe113581b06f98bd1", "df94d12b774ad5cf744c871e707c36c8",
+        "b23dc0b54c3500248d53377030428a61"},
+       {"9c331f2b9410354685fe904f6c022dfa", "b540b0045b7723fbe962fd675db4b077",
+        "3cecd1158126c9c9cc2873ecc8c1a135"}},
+      {{"f3079b3b21d8dc6fce7bb1fd104be359", "c6fcbc686cfb97ab3a64f445d73aad36",
+        "23966cba3e0e7803eeb951905861e0dd"},
+       {"7210391a6fe26e5ca5ea205bc38aa035", "4c3e6eccad3ea152d320ecd1077169de",
+        "dcee48f94126a2132963e86e93dd4903"},
+       {"beb3dd8a2dbc5f83ef171b0ffcead3ab", "c373bd9c46bdb89a3d1e41759c315025",
+        "cd407b212ab46fd4a451d5dc93a0ce4a"}}};
+  if (target_self_guided_filter_func_ == nullptr) return;
+  constexpr int bd_index = (bitdepth - 8) / 2;
+  const int num_inputs = speed ? 1 : 5;
+#if LIBGAV1_ENABLE_NEON
+  const int num_tests = speed ? 4000 : 1;
+#else
+  const int num_tests = speed ? 10000 : 1;
+#endif
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  const Pixel* const src = src_ + kOffset;
+  Pixel* const dst = dst_ + kOffset;
+  for (int i = 0; i < num_inputs; ++i) {
+    for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) {
+      SetInputData(1, 0, radius_index, &rnd);
+      const absl::Time start = absl::Now();
+      for (int k = 0; k < num_tests; ++k) {
+        for (int y = 0; y < kHeight; y += unit_height_) {
+          const int height = std::min(unit_height_, kHeight - y);
+          for (int x = 0; x < kWidth; x += unit_width_) {
+            const int width = std::min(unit_width_, kWidth - x);
+            const Pixel* const source = src + y * kStride + x;
+            target_self_guided_filter_func_(
+                restoration_info_, source, kStride,
+                source - kRestorationVerticalBorder * kStride, kStride,
+                source + height * kStride, kStride, width, height,
+                &restoration_buffer_, dst + y * kStride + x);
+          }
+        }
+      }
+      const absl::Duration elapsed_time = absl::Now() - start;
+      test_utils::CheckMd5Digest(
+          "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(),
+          kDigest[i][bd_index][radius_index], dst_ + kBorder * kStride,
+          kHeight * kStride * sizeof(*dst_), elapsed_time);
+    }
+  }
+}
+
+using SelfGuidedFilterTest8bpp = SelfGuidedFilterTest<8, uint8_t>;
+
+TEST_P(SelfGuidedFilterTest8bpp, Correctness) {
+  TestFixedValues(0, 0);
+  TestFixedValues(1, 1);
+  TestFixedValues(2, 128);
+  TestFixedValues(3, 255);
+  TestRandomValues(false);
+}
+
+TEST_P(SelfGuidedFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest8bpp,
+                         testing::ValuesIn(kUnitWidths));
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest8bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest8bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest8bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using SelfGuidedFilterTest10bpp = SelfGuidedFilterTest<10, uint16_t>;
+
+TEST_P(SelfGuidedFilterTest10bpp, Correctness) {
+  TestFixedValues(0, 0);
+  TestFixedValues(1, 1);
+  TestFixedValues(2, 512);
+  TestFixedValues(3, 1023);
+  TestRandomValues(false);
+}
+
+TEST_P(SelfGuidedFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest10bpp,
+                         testing::ValuesIn(kUnitWidths));
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest10bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest10bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest10bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using SelfGuidedFilterTest12bpp = SelfGuidedFilterTest<12, uint16_t>;
+
+TEST_P(SelfGuidedFilterTest12bpp, Correctness) {
+  TestFixedValues(0, 0);
+  TestFixedValues(1, 1);
+  TestFixedValues(2, 2048);
+  TestFixedValues(3, 4095);
+  TestRandomValues(false);
+}
+
+TEST_P(SelfGuidedFilterTest12bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest12bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+template <int bitdepth, typename Pixel>
+class WienerFilterTest : public testing::TestWithParam<int>,
+                         public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  WienerFilterTest() = default;
+  WienerFilterTest(const WienerFilterTest&) = delete;
+  WienerFilterTest& operator=(const WienerFilterTest&) = delete;
+  ~WienerFilterTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    LoopRestorationInit_C();
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_wiener_filter_func_ = dsp->loop_restorations[0];
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
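+      // The C suites need no initialization beyond LoopRestorationInit_C().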
+    } else if (absl::StartsWith(test_case, "AVX2/")) {
+      if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+      LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_AVX2();
+#endif
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      LoopRestorationInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_SSE4_1();
+#endif
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      LoopRestorationInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_NEON();
+#endif
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    target_wiener_filter_func_ = dsp->loop_restorations[0];
+    restoration_info_.type = kLoopRestorationTypeWiener;
+    memset(dst_, 0, sizeof(dst_));
+    memset(tmp_, 0, sizeof(tmp_));
+    memset(buffer_, 0, sizeof(buffer_));
+  }
+
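+  // Zeroes the outer taps so that only |order| taps remain active: a 7-tap
+  // filter keeps filter[0..2], while a 1-tap filter zeroes all three.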
+  static void CleanFilterByOrder(const int order,
+                                 int16_t filter[kWienerFilterTaps]) {
+    if (order <= 5) filter[0] = 0;
+    if (order <= 3) filter[1] = 0;
+    if (order <= 1) filter[2] = 0;
+  }
+
+  void SetInputData(int type, Pixel value, int vertical_order,
+                    int horizontal_order);
+  void TestFixedValues(int digest_id, Pixel value);
+  void TestRandomValues(bool speed);
+  void TestCompare2C();
+
+ protected:
+  const int unit_width_ = GetParam();
+  const int unit_height_ = kRestorationUnitHeight;
+
+ private:
+  alignas(kMaxAlignment)
+      uint16_t buffer_[(kRestorationUnitWidth + kWienerFilterTaps - 1) *
+                       kRestorationUnitHeight];
+  alignas(kMaxAlignment) Pixel src_[kMaxBlockSize];
+  alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize];
+  alignas(kMaxAlignment) Pixel tmp_[kMaxBlockSize];
+  RestorationUnitInfo restoration_info_;
+  RestorationBuffer restoration_buffer_;
+  LoopRestorationFunc base_wiener_filter_func_;
+  LoopRestorationFunc target_wiener_filter_func_;
+};
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::SetInputData(
+    int type, Pixel value, const int vertical_order,
+    const int horizontal_order) {
+  const int mask = (1 << bitdepth) - 1;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  if (type == 0) {
+    for (auto& s : src_) s = value;
+  } else {
+    for (auto& s : src_) s = rnd.Rand16() & mask;
+  }
+  int order = vertical_order;
+  for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+    auto& filter = restoration_info_.wiener_info.filter[i];
+    filter[3] = 128;
+    for (int j = 0; j < 3; ++j) {
+      filter[j] = kWienerTapsMin[j] +
+                  rnd.PseudoUniform(kWienerTapsMax[j] - kWienerTapsMin[j] + 1);
+    }
+    CleanFilterByOrder(order, filter);
+    filter[3] -= 2 * (filter[0] + filter[1] + filter[2]);
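+    // The 7 taps are symmetric (f0 f1 f2 f3 f2 f1 f0) and must sum to 128,
+    // hence the center-tap adjustment above.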
+    restoration_info_.wiener_info.number_leading_zero_coefficients[i] =
+        (kWienerFilterTaps - order) / 2;
+    order = horizontal_order;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::TestFixedValues(int digest_id,
+                                                        Pixel value) {
+  static const char* const kDigest[3][4] = {
+      {"74fc90760a14b13340cb718f200ba350", "5bacaca0128cd36f4805330b3787771d",
+       "1109e17545cc4fbd5810b8b77e19fc36", "e7f914ec9d065aba92338016e17a526c"},
+      {"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916",
+       "193b19065899c835cb513149eb36d135", "f1dff65e3e53558b303ef0a2e3f3ba98"},
+      {"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916",
+       "961eeb92bd9d85eb47e3961ee93d279a", "039a279232bc90eebc0ec2fe3e18a7e1"},
+  };
+  if (target_wiener_filter_func_ == nullptr) return;
+  ASSERT_LT(value, 1 << bitdepth);
+  constexpr int bd_index = (bitdepth - 8) / 2;
+  const Pixel* const src = src_ + kOffset;
+  Pixel* const dst = dst_ + kOffset;
+  for (const auto vertical_order : kWienerOrders) {
+    for (const auto horizontal_order : kWienerOrders) {
+      SetInputData(0, value, vertical_order, horizontal_order);
+      memset(dst_, 0, sizeof(dst_));
+      const absl::Time start = absl::Now();
+      for (int y = 0; y < kHeight; y += unit_height_) {
+        const int height = std::min(unit_height_, kHeight - y);
+        for (int x = 0; x < kWidth; x += unit_width_) {
+          const int width = std::min(unit_width_, kWidth - x);
+          const Pixel* const source = src + y * kStride + x;
+          target_wiener_filter_func_(
+              restoration_info_, source, kStride,
+              source - kRestorationVerticalBorder * kStride, kStride,
+              source + height * kStride, kStride, width, height,
+              &restoration_buffer_, dst + y * kStride + x);
+        }
+      }
+      const absl::Duration elapsed_time = absl::Now() - start;
+      test_utils::CheckMd5Digest(
+          "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(),
+          kDigest[bd_index][digest_id], dst_, sizeof(dst_), elapsed_time);
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) {
+  static const char* const kDigest[3][kNumWienerOrders][kNumWienerOrders] = {
+      {{"40d0cf56d2ffb4f581e68b0fc97f547f", "5c04745209b684ba98004ebb0f64e70b",
+        "545ed7d3f7e7ca3b86b4ada31f7aaee7", "0d6b2967f1bd1d99b720e563fe0cf03f"},
+       {"44b37076f0cf27f6eb506aca50c1d3e4", "e927d64dc9249e05a65e10ee75baa7d9",
+        "6136ecb4e29b17c9566504148943fd47", "c5ee2da81d44dc8cb2ac8021f724eb7a"},
+       {"125cbb227313ec91a2683f26e6f049d1", "77671b6529c806d23b749f304b548f59",
+        "28d53a1b486881895b8f73fa64486df1", "f5e32165bafe575d7ee7a6fbae75f36d"},
+       {"e832c41f2566ab542b32abba9d4f27bd", "ab1336ee6b85cba651f35ee5d3b3cc5c",
+        "52a673b6d14fbdca5ebdb1a34ee3326f",
+        "ebb42c7c9111f2e39f21e2158e801d9e"}},
+      {{"8cd9c6bd9983bd49564a58ed4af9098a", "f71f333c9d71237ed4e46f0ef2283196",
+        "375b43abc1d6682d62f91c1841b8b0fc", "71e2444822ae9c697ddfc96e07c6e8a1"},
+       {"d9ed3a66ceef405c08c87f6e91b71059", "c171fcff5fb7bb919f13ead7a4917a4c",
+        "8fbd1edb82fcd78d4d286886f65a700a", "fe14a143e6b261c5bb07b179d40be5a2"},
+       {"1c995f4e7f117857de73211b81093bd0", "5ab1ee3bb14adcd66d66802d58bee068",
+        "d77430783e173ebd1b30e5d9336c8b69", "e159a3620747458dff7ed3d20da1a4b7"},
+       {"5346fa07d195c257548a332753b057a3", "c77674bc0a638abc4d38d58e494fc7cf",
+        "7cbc1562a9dd08e1973b3b9ac1afc765",
+        "3c91bf1a34672cd40bf261c5820d3ec3"}},
+      {{"501b57370c781372b514accd03d161af", "a4569b5eff7f7e8b696934d192619be5",
+        "24eb2aa43118a8822f7a6a7384ab9ea7", "edd7ac227733b5a4496bfdbdf4eb34d7"},
+       {"77624cf73299a1bd928eae3eb8945dbe", "b3f311cacbf45fa892761462d31b2598",
+        "977c063d93a4b95cb365363763faa4da", "02313c9d360a1e0180ed05d3e4444c3d"},
+       {"f499655ecdcbe0ac48553f1eee758589", "a009c83c03e47cbd05c1243e28579bd9",
+        "d5f0b4fd761ff51efce949e6c5ec4833", "e3a9a57aacd2e6cfe0f792a885b3e0e3"},
+       {"b4cf906e9bb02ffca15c1e9575962ca2", "d0ca9f933978c0c31175ba1b28a44ae8",
+        "81ac1475530ffbd1c8d3ce7da87ffe6b",
+        "b96412949c2e31b29388222ac8914fa2"}},
+  };
+  if (target_wiener_filter_func_ == nullptr) return;
+  constexpr int bd_index = (bitdepth - 8) / 2;
+#if LIBGAV1_ENABLE_NEON
+  const int num_tests = speed ? 5000 : 1;
+#else
+  const int num_tests = speed ? 10000 : 1;
+#endif
+  const Pixel* const src = src_ + kOffset;
+  Pixel* const dst = dst_ + kOffset;
+  for (const auto vertical_order : kWienerOrders) {
+    for (const auto horizontal_order : kWienerOrders) {
+      SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order);
+      memset(dst_, 0, sizeof(dst_));
+      const absl::Time start = absl::Now();
+      for (int i = 0; i < num_tests; ++i) {
+        for (int y = 0; y < kHeight; y += unit_height_) {
+          const int height = std::min(unit_height_, kHeight - y);
+          for (int x = 0; x < kWidth; x += unit_width_) {
+            const int width = std::min(unit_width_, kWidth - x);
+            const Pixel* const source = src + y * kStride + x;
+            target_wiener_filter_func_(
+                restoration_info_, source, kStride,
+                source - kRestorationVerticalBorder * kStride, kStride,
+                source + height * kStride, kStride, width, height,
+                &restoration_buffer_, dst + y * kStride + x);
+          }
+        }
+      }
+      const absl::Duration elapsed_time = absl::Now() - start;
+      test_utils::CheckMd5Digest(
+          "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(),
+          kDigest[bd_index][kWienerOrderIdLookup[vertical_order]]
+                 [kWienerOrderIdLookup[horizontal_order]],
+          dst_, sizeof(dst_), elapsed_time);
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::TestCompare2C() {
+  if (base_wiener_filter_func_ == nullptr) return;
+  if (target_wiener_filter_func_ == nullptr) return;
+  if (base_wiener_filter_func_ == target_wiener_filter_func_) return;
+  const Pixel* const src = src_ + kOffset;
+  Pixel* const dst = dst_ + kOffset;
+  Pixel* const tmp = tmp_ + kOffset;
+  for (const auto vertical_order : kWienerOrders) {
+    for (const auto horizontal_order : kWienerOrders) {
+      SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order);
+      for (int x = 0; x < 2; ++x) {
+        // Prepare min/max filter coefficients.
+        int order = vertical_order;
+        for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+          auto& filter = restoration_info_.wiener_info.filter[i];
+          for (int j = 0; j < 3; ++j) {
+            filter[j] = (x == 0) ? kWienerTapsMin[j] : kWienerTapsMax[j];
+          }
+          CleanFilterByOrder(order, filter);
+          filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]);
+          restoration_info_.wiener_info.number_leading_zero_coefficients[i] =
+              (kWienerFilterTaps - order) / 2;
+          order = horizontal_order;
+        }
+        base_wiener_filter_func_(restoration_info_, src, kStride,
+                                 src - kRestorationVerticalBorder * kStride,
+                                 kStride, src + unit_height_ * kStride, kStride,
+                                 unit_width_, unit_height_,
+                                 &restoration_buffer_, dst);
+        target_wiener_filter_func_(restoration_info_, src, kStride,
+                                   src - kRestorationVerticalBorder * kStride,
+                                   kStride, src + unit_height_ * kStride,
+                                   kStride, unit_width_, unit_height_,
+                                   &restoration_buffer_, tmp);
+        if (!test_utils::CompareBlocks(dst, tmp, unit_width_, unit_height_,
+                                       kStride, kStride, false, true)) {
+          ADD_FAILURE() << "Mismatch -- wiener taps min/max";
+        }
+      }
+    }
+  }
+}
+
+using WienerFilterTest8bpp = WienerFilterTest<8, uint8_t>;
+
+TEST_P(WienerFilterTest8bpp, Correctness) {
+  TestFixedValues(0, 0);
+  TestFixedValues(1, 1);
+  TestFixedValues(2, 128);
+  TestFixedValues(3, 255);
+  TestRandomValues(false);
+}
+
+TEST_P(WienerFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+TEST_P(WienerFilterTest8bpp, TestCompare2C) { TestCompare2C(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest8bpp,
+                         testing::ValuesIn(kUnitWidths));
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest8bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest8bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest8bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using WienerFilterTest10bpp = WienerFilterTest<10, uint16_t>;
+
+TEST_P(WienerFilterTest10bpp, Correctness) {
+  TestFixedValues(0, 0);
+  TestFixedValues(1, 1);
+  TestFixedValues(2, 512);
+  TestFixedValues(3, 1023);
+  TestRandomValues(false);
+}
+
+TEST_P(WienerFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+TEST_P(WienerFilterTest10bpp, TestCompare2C) { TestCompare2C(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest10bpp,
+                         testing::ValuesIn(kUnitWidths));
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest10bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest10bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest10bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using WienerFilterTest12bpp = WienerFilterTest<12, uint16_t>;
+
+TEST_P(WienerFilterTest12bpp, Correctness) {
+  TestFixedValues(0, 0);
+  TestFixedValues(1, 1);
+  TestFixedValues(2, 2048);
+  TestFixedValues(3, 4095);
+  TestRandomValues(false);
+}
+
+TEST_P(WienerFilterTest12bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+TEST_P(WienerFilterTest12bpp, TestCompare2C) { TestCompare2C(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest12bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/mask_blend.cc b/src/dsp/mask_blend.cc
new file mode 100644
index 0000000..34d7fe8
--- /dev/null
+++ b/src/dsp/mask_blend.cc
@@ -0,0 +1,258 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>  // std::conditional in MaskBlend_C.
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+uint8_t GetMaskValue(const uint8_t* LIBGAV1_RESTRICT mask,
+                     const uint8_t* LIBGAV1_RESTRICT mask_next_row, int x,
+                     int subsampling_x, int subsampling_y) {
+  if ((subsampling_x | subsampling_y) == 0) {
+    return mask[x];
+  }
+  if (subsampling_x == 1 && subsampling_y == 0) {
+    return static_cast<uint8_t>(RightShiftWithRounding(
+        mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1], 1));
+  }
+  assert(subsampling_x == 1 && subsampling_y == 1);
+  return static_cast<uint8_t>(RightShiftWithRounding(
+      mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1] +
+          mask_next_row[MultiplyBy2(x)] + mask_next_row[MultiplyBy2(x) + 1],
+      2));
+}
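+
+// For example, with 4:2:0 subsampling (subsampling_x == 1 and
+// subsampling_y == 1), the mask value for output pixel x is the rounded
+// average of a 2x2 mask block:
+//   (mask[2 * x] + mask[2 * x + 1] +
+//    mask_next_row[2 * x] + mask_next_row[2 * x + 1] + 2) >> 2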
+
+template <int bitdepth, typename Pixel, bool is_inter_intra, int subsampling_x,
+          int subsampling_y>
+void MaskBlend_C(const void* LIBGAV1_RESTRICT prediction_0,
+                 const void* LIBGAV1_RESTRICT prediction_1,
+                 const ptrdiff_t prediction_stride_1,
+                 const uint8_t* LIBGAV1_RESTRICT mask,
+                 const ptrdiff_t mask_stride, const int width, const int height,
+                 void* LIBGAV1_RESTRICT dest, const ptrdiff_t dest_stride) {
+  static_assert(!(bitdepth == 8 && is_inter_intra), "");
+  assert(mask != nullptr);
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+  const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+  auto* dst = static_cast<Pixel*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+  constexpr int step_y = subsampling_y ? 2 : 1;
+  const uint8_t* mask_next_row = mask + mask_stride;
+  // AV1 spec, Section 7.11.3.2 (Rounding variables derivation process):
+  //   2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
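+  //   For 8bpp/10bpp this is 14 - (3 + 7) = 4; for 12bpp, 14 - (5 + 7) = 2.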
+  constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const uint8_t mask_value =
+          GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y);
+      if (is_inter_intra) {
+        dst[x] = static_cast<Pixel>(RightShiftWithRounding(
+            mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6));
+      } else {
+        assert(prediction_stride_1 == width);
+        int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+        res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+        dst[x] = static_cast<Pixel>(
+            Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+                  (1 << bitdepth) - 1));
+      }
+    }
+    dst += dst_stride;
+    mask += mask_stride * step_y;
+    mask_next_row += mask_stride * step_y;
+    pred_0 += width;
+    pred_1 += prediction_stride_1;
+  }
+}
+
+template <int subsampling_x, int subsampling_y>
+void InterIntraMaskBlend8bpp_C(const uint8_t* LIBGAV1_RESTRICT prediction_0,
+                               uint8_t* LIBGAV1_RESTRICT prediction_1,
+                               const ptrdiff_t prediction_stride_1,
+                               const uint8_t* LIBGAV1_RESTRICT mask,
+                               const ptrdiff_t mask_stride, const int width,
+                               const int height) {
+  assert(mask != nullptr);
+  constexpr int step_y = subsampling_y ? 2 : 1;
+  const uint8_t* mask_next_row = mask + mask_stride;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const uint8_t mask_value =
+          GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y);
+      prediction_1[x] = static_cast<uint8_t>(RightShiftWithRounding(
+          mask_value * prediction_1[x] + (64 - mask_value) * prediction_0[x],
+          6));
+    }
+    mask += mask_stride * step_y;
+    mask_next_row += mask_stride * step_y;
+    prediction_0 += width;
+    prediction_1 += prediction_stride_1;
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>;
+  dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>;
+  dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>;
+  // The is_inter_intra index of mask_blend[][] is replaced by
+  // inter_intra_mask_blend_8bpp[] in 8-bit.
+  dsp->mask_blend[0][1] = nullptr;
+  dsp->mask_blend[1][1] = nullptr;
+  dsp->mask_blend[2][1] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>;
+  dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>;
+  dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+  dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+  dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+  dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>;
+#endif
+  // The is_inter_intra index of mask_blend[][] is replaced by
+  // inter_intra_mask_blend_8bpp[] in 8-bit.
+  dsp->mask_blend[0][1] = nullptr;
+  dsp->mask_blend[1][1] = nullptr;
+  dsp->mask_blend[2][1] = nullptr;
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+  dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+  dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+  dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
+#endif
+  static_cast<void>(GetMaskValue);
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>;
+  dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>;
+  dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>;
+  dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>;
+  dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>;
+  dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>;
+  // These are only used with 8-bit.
+  dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend444
+  dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend422
+  dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend420
+  dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444
+  dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422
+  dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420
+  dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>;
+#endif
+  // These are only used with 8-bit.
+  dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>;
+  dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>;
+  dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>;
+  dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>;
+  dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>;
+  dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>;
+  // These are only used with 8-bit.
+  dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend444
+  dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend422
+  dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend420
+  dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra444
+  dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra422
+  dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra420
+  dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>;
+#endif
+  // These are only used with 8-bit.
+  dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void MaskBlendInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/mask_blend.h b/src/dsp/mask_blend.h
new file mode 100644
index 0000000..41f5e5b
--- /dev/null
+++ b/src/dsp/mask_blend.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MASK_BLEND_H_
+#define LIBGAV1_SRC_DSP_MASK_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/mask_blend_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/mask_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend and Dsp::inter_intra_mask_blend_8bpp. This
+// function is not thread-safe.
+void MaskBlendInit_C();
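+// As with the other DSP modules, call this before reading Dsp::mask_blend or
+// Dsp::inter_intra_mask_blend_8bpp from GetDspTable(); architecture-specific
+// init functions may then overwrite individual entries.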
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_MASK_BLEND_H_
diff --git a/src/dsp/mask_blend_test.cc b/src/dsp/mask_blend_test.cc
new file mode 100644
index 0000000..06793e5
--- /dev/null
+++ b/src/dsp/mask_blend_test.cc
@@ -0,0 +1,602 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 50000;
+// mask_blend is applied to compound prediction values when is_inter_intra is
+// false. This implies a range far exceeding that of pixel values. The ranges
+// include kCompoundOffset in 10bpp and 12bpp.
+// See src/dsp/convolve.cc and src/dsp/warp.cc.
+constexpr int kCompoundPredictionRange[3][2] = {
+    // 8bpp
+    {-5132, 9212},
+    // 10bpp
+    {3988, 61532},
+    // 12bpp
+    {3974, 61559},
+};
+
+const char* GetDigest8bpp(int id) {
+  static const char* const kDigest[] = {
+      "4b70d5ef5ac7554b4b2660a4abe14a41", "64adb36f07e4a2c4ea4f05cfd715ff58",
+      "2cd162cebf99724a3fc22d501bd8c8e4", "c490478208374a43765900ef7115c264",
+      "b98f222eb70ef8589da2d6c839ca22b8", "54752ca05f67b5af571bc311aa4e3de3",
+      "5ae48814dd285bfca4f5ee8e339dca99", "383f3f4f47563f065d1b6068e5931a24",
+      "344b2dab7accd8bd0a255bee16207336", "0b2f6f755d1547eea7e0172f8133ea01",
+      "310dc6364fdacba186c01f0e8ac4fcb7", "c2ee4673078d34971319c77ca77b23d1",
+      "b0c9f08b73d9e5c16eaf5abdbca1fdc0", "eaad805999d949fa1e1bbbb63b4b7827",
+      "6eb2a80d212df89403efb50db7a81b08", "c30730aa799dba78a2ebd3f729af82c7",
+      "4346c2860b23f0072b6b288f14c1df36", "1cdace53543063e129a125c4084ca5d7",
+      "1ae5328e0c0f4f2bec640d1af03b2978", "3860e040fbee0c5f68f0b4af769209b3",
+      "e9480ded15d9c38ee19bf5fa816dd296", "4e17c222b64f428df29938a8120ca256",
+      "2a943bc6de9b29c8bcae189ad3bec276", "b5a6bc02c76fa61040678fb2c6c112d2",
+      "2c11bb9bd29c5577194edb77cfd1c614", "31ed1832810ae385f4ad8f57795dde1e",
+      "eb87d647839c33984dfb25bac0e7cdb3", "f652ec2b1478e35acb19cf28042ee849",
+      "0cfb18ac0cb94af1447bcac32ac20c36", "e152bbbf5ee4b40b7b41ec1f2e901aaa",
+      "f17f78fd485f7beafa8126c1cda801d7", "9f9fbee0cc9d99435efd3dff644be273",
+      "9b498843d66440c1e68dc7ab04f57d42", "2f2b0beceb31b79ccb9179991629e4b8",
+      "e06a6ebb6791529bb23fe5b0a9914220", "2b3d1ff19812a17c17b1be1f1727815e",
+      "d0bbdecec414950ed63a8a35c2bae397", "8e53906c6513058d7f17013fe0d32bf1",
+      "be0690efd31f0bf3c2adcd27ca011ed5", "c2b26243c5f147fdeadf52735aa68fb5",
+      "94bb83e774d9189c5ee04fb361855e19", "dad6441e723791a91f31a56b2136fd33",
+      "10ccac76a2debb842a0685a527b6a659", "346fb0a4914b64dda3ca0f521412b999",
+      "d7e400b855502bbb4f2b8294e207bb96", "3487503f2d73ec52f25b5e8d06c81da4",
+      "3f49c096acfcf46d44ce18b48debca7c", "8ed6a745a2b5457ac7f3ac145ce57e72",
+      "21f9dda5ef934a5ee6274b22cc22f93b", "507b60611afeb373384d9b7606f7ea46",
+      "ac766fadcdb85a47ad14a6846b9e5c36", "fde149bc2162e02bbc5fa85cc41641a5",
+      "f5f094b5742d0a920ba734b017452d24", "c90d06b0c76a0983bd1428df2a1b64b3",
+      "3649e6a6ed9f69e3f78e0b75160fb82a", "1d44b7649497e651216db50d325e3073",
+      "948fa112e90e3ca4d15f3d2f2acfab9a", "9bb54c0f7d07c0b44c44ba09379a04ff",
+      "228261ab6f098f489a8968cff1e1f7ae", "5e128db7462164f7327d1d8feeb2e4c7",
+      "9e8b97f6d9d482d5770b138bd1077747", "81563d505a4e8dd779a089abf2a28b77",
+      "b7157451de7cfa161dff1afd7f9b8622", "6a25cc0a4aaf8a315d1158dbb0ec2966",
+      "303867ee010ba51da485ee10149c6f9b", "63b64b7527d2476e9ae5139b8166e8c9",
+      "cfa93c2aeeb27a1190a445a6fee61e15", "804bcff8709665eed6830e24346101be",
+      "829947ed3e90776cda4ae82918461497", "1df10a1cb80c1a81f521e7e0f80b4f99",
+      "3c9593e42ac574f3555bb8511d438a54", "eecef71492c0626685815e646f728f79",
+      "0c43d59f456ddca2449e016ae4e34be7", "207d4ac2579f1271fc9eca8d743917b3",
+      "3c472bb0b1c891ffda19077ebb659e48", "a4ae7a0d25113bc0238fa27409f9c0dd",
+      "e8ad037ca81f46774bb01d20f46671ce", "b22741e4fe0e4062e40a2decec102ffd",
+      "c72f9e7bc0170163cb94da0faa0d3ffb", "accaf5d475d155cbd3a8c113f90718bc",
+      "2fd31e72444ea258380c16881580de81", "8a6a2a253f6f5b0ff75ba39488e6b082",
+      "c5e8159c0f3ebb7536e84ab3dadac1b3", "ef7ec20b46c7dcf16591835642bd68ef",
+      "0c3425399dc64870d726c2837666a55e", "0365029ffbfc4cedf3bf2d757ea5b9df",
+      "836aa403254af2e04d4b7a7c4db8bfc5", "7f2f3f9c91677b233795169f9a88b2b2",
+      "9fc8bbe787244dac638c367b9c611d13", "f66ef45fae8e163ab0f0f393531dad26",
+      "beb984e88b6f9b96ae6efe5da23ad16b", "1083b829ea766b1d4eb0bb96e9fb3bff",
+      "be8abad1da69e4d238a45fc02a0061cf",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+  static const char* const kDigest[] = {
+      "1af3cbd1616941b59e6a3f6a417b6312", "1d8b3f4b9d5d2f4ff5be8e81b7243121",
+      "e767350f150a84ac5a06dc348e815d62", "53a3a76bf2bcd5761cd15fc739a4f4e1",
+      "7597f69dc19a584280be0d67911db6a6", "e1221c172843dc6c1b345bcd370771cc",
+      "1a640c71ff9bb45505d89761f19efa8f", "e192f64322e0edb250b52f63aaa4de97",
+      "2ccbe012ca167114b14c3ba70befa960", "0f68632d7e5faddb4554ca430d1df822",
+      "8caa0061a26e142b783951d5abd7bf5d", "b01eeed3ec549e4a593100d9c5ba587a",
+      "1cce6acdbd8ca8d2546ba937584730bf", "022913e87a3c1a86aaefe2c2d4f89882",
+      "48f8ab636ba15a06731d869b603cbe58", "ba1616c990d224c20de123c3ccf19952",
+      "346a797b7cb4de10759e329f8b49e077", "d4929154275255f2d786d6fc42c7c5d3",
+      "18a6af6f36ca1ea4ab6f5a76505de040", "0c43e68414bfc02f9b20e796506f643b",
+      "9f483f543f6b1d58e23abf9337ed6fe6", "e114860c2538b63f1be4a23560420cdc",
+      "da8680798f96572c46155c7838b452c3", "20b47a27617297231843c0f2ed7b559b",
+      "16fa4a4f33a32e28c79da83dca63fd41", "76e2c1d3c323777a3c478e11e1ba6bf2",
+      "dccdfd52a71855cc4da18af52bda4c03", "121befbd6c246e85a34225241b8bcaf1",
+      "5780757555fd87ca1ff3f1b498a1d6e9", "6b0be2256285694b1edc0201608e1326",
+      "b7ef338c58d17f69426b5a99170c7295", "b92b84b5b3d01afac02fb9c092e84b06",
+      "e6ef7fea8b183f871c4306c4f49370c5", "c1bf95c05774d8471504e57a3efa66e4",
+      "bbacdbdafc625a139361ec22fe2cf003", "5fbbb2d6ca8fc6d07ca8d4105fda4a01",
+      "c1cbb295d9f00aa865d91a95e96f99b2", "1490e4f2c874a76ecc2bbf35dce446c3",
+      "c3bd73daaeec39895a8b64812773c93c", "6d385068ef3afbd821183d36851f709b",
+      "a34c52ef7f2fd04d1cd420238641ef48", "45d10029358c6835cf968a30605659ea",
+      "a72c1bb18cf9312c5713ce0de370743d", "df7368db2a7515a1c06a4c9dd9e32ebf",
+      "52782632271caccfa9a35ed7533e2052", "6f0ef9b62d2b9956a6464694b7a86b79",
+      "814dbc176f7201725a1cfd1cf668b4b9", "065ffbee984f4b9343c8acb0eb04fcbe",
+      "0915d76ce458d5164e3c90c1ce150795", "bf2b431d9bfa7a9925ea6f6509267ae9",
+      "d3df8c0c940a01b7bf3c3afb80b6dcd4", "15ab86216c9856a8427a51fe599258a3",
+      "2cb078484472c88e26b7401c9f11cf51", "7c5f68cc098c8adabc9e26f9cd549151",
+      "a8e47da1fcc91c2bc74d030892621576", "71af422ba2d86a401f8278591c0ef540",
+      "964c902bb4698ce82f4aa0a1edc80cd6", "78271c37d62af86576dab72ed59746b3",
+      "7247c3a7534a41137027e7d3f255f5ef", "8e529ab964f5f9d0f7c3ced98239cfc8",
+      "2481ed50bff6b36a3cac6dca2aca5ae5", "78a1ff18bf217d45f5170675dee26948",
+      "00fc534119c13aa7af4b818cad9218a2", "67501a83c93f2f9debfa86955bdffde5",
+      "2a512ef738e33a4d8476f72654deffb4", "f4eef28078bbc12de9cfb5bc2fef6238",
+      "b7ac3a35205a978bed587356155bae0e", "51ea101f09c4de2f754b61ab5aff1526",
+      "2bd689d7ec964ee8c8f6f0682f93f5ca", "eecac8dbdaa73b8b3c2234892c444147",
+      "cb7086f44ef70ef919086a3d200d8c13", "0abe35e3c796c2de1e550426b2b19441",
+      "0eb140561e1ea3843464a5247d8ecb18", "d908f7317f00daacbe3dd43495db64ad",
+      "d4d677c4b347de0a13ccab7bc16b8e6e", "26523c2c2df7f31896a3ae5aa24d5ada",
+      "0ebb9f816684769816b2ae0b1f94e3a4", "fd938d0577e3687b0a810e199f69f0bb",
+      "eb8fb832e72030e2aa214936ae0effe4", "56631887763f7daf6e1e73783e5ff656",
+      "590a25cc722c2aa4d885eede5ef09f20", "80944a218ed9b9b0374cde72914449eb",
+      "d9cbc2f1e0e56cdd6722310932db1981", "a88eb213b7a6767bbe639cda120a4ab6",
+      "9972ecbadfdf3ed0b3fedf435c5a804f", "01fdf7e22405a1b17a8d275b7451094f",
+      "6a7824e10406fade0d032e886bbc76b6", "76fefadd793ec3928e915d92782bc7e1",
+      "0fbd6b076752c9f5c926ca5c1df892ac", "aac9457239f07ad633fcd45c1465af2a",
+      "56823ef9a8e21c9c7441cc9ed870d648", "52f4c7a0b7177175302652cbc482f442",
+      "f4a4f4d7c8b93c0486cf3cbaa26fbc19",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+  static const char* const kDigest[] = {
+      "79a505b3877177197c94f0faeb0c9ec6", "cd22657d242f30c88bb83eae9efbbcce",
+      "c4c60a60976d119df3832ff6956e0181", "796bd78bf2346e8dfd61cecbf508ea0e",
+      "79e06cc6f880daf6cdb59b9b3a8efe1c", "f0643108e6b57bd566bc0d47b2dc64a1",
+      "8272a471e538ca469eaf5c997309589c", "3094741b63a29925da83dc1dc187654a",
+      "d0141df80f2335ed6051397cb2a5bc61", "33d9fd317b74f4572afbe004f991ca83",
+      "ea2413cd11bf1da93de9285381b471df", "c4f78ae2b994a3a999cb3f5dac2bb498",
+      "44804ec226453bc5f688506b56ad2a8a", "9de9c12a5f3bb8d4af13da8807dfe53f",
+      "c190dac15c08f2e591b222e1d75b60c2", "c46889b58b44d242e24b91ef531e9176",
+      "b6697e1256b60b3426a8980c7c6f9a80", "1e0eb156152fbb74b0cff41bdbdf98b5",
+      "98ab6c0abc45fd44565f84e66dc71133", "f2f2126fac1b7c0c7b7ff511c6f3c91e",
+      "0cc720e878cfa35f9b72762d08adb1bf", "6efee9ce87e098122dd05525f4c74a2f",
+      "187270514a93bd7065d2cfdb02146959", "947be7f2921b5a192d4296b2060a215c",
+      "42f02b046eda2a94133032184fdaa26d", "487e94b20867e7021dd1f10d477c3acf",
+      "9f9eac4394d8821f5c14857a28c5549b", "75d781b60c1f4aa44ceb6bc65f597a52",
+      "779f9ac3c01a86812964ccc38da2711a", "16dc8824efbd7a47808ccdbf8e37df56",
+      "e72899a8ddf6cc816e1917c25739a512", "96a4bcaedae79b55399d931fecd64312",
+      "5c5e8f4a4f0153315133e4e86a02c3a6", "d1c339b6f6cc0eabdd6674028e1f4260",
+      "4ef5868adaf6712d033dce9e51837c0b", "ed90a4ddfc463dddfe71314bc3415b4e",
+      "2312299492a47246269d6d37e67c8c0c", "56baf1c4453c5cf5ce3d6857cff4aa8f",
+      "d534ce3430377b355c3f59695cfb188b", "f40248f1a6fac4299c9645350138f598",
+      "f2e3cbbd066d9d28304667d82312d950", "e8a7784eb367b72b96486bec856b873c",
+      "02941ae2cf8272b353268a30cf9c2ee0", "8f6273a5fa62b9a4225ebdbf2ce44e27",
+      "85bb0aaba73fe8c89dcee6b5c55d5cfc", "c28c63a4e46ee2a98dd2b58379971c8c",
+      "4af35738c29d27ca9930a488bacdffe6", "34a419cc3e6ab21cf099d244169d253e",
+      "7c5b8d19ac8a81b37011fabac10143d0", "e582811e05def83270d8f65060fe8966",
+      "24662536326615a3c325409e780f65bf", "717a7f7e99d329a74391477ef3c6d738",
+      "e0f38a3dba4c6e060b6ca12a18d75fc2", "fbd0cba6a27eb06e74c5ed376187e05c",
+      "14dfb487c4a7e989629a195810b814ee", "3cf6d595317ec46e08f6eaa0f0e99b43",
+      "b3cb98c418ea854e433b612fc532bac5", "262206cee670c082361497e51cbd0f43",
+      "84c11b103a9b0a61f07493dcd269e6fd", "bd9bd9994057371252398bf52c7586f0",
+      "72e5537ba5f04fe17b7a371bd12ca0e2", "5986a20b406ceed273f9e41bc0c4c775",
+      "d5eb9ea00ce19079b49562ba4a8cb574", "3205e6f3c532a63f8d5d939fa46bc444",
+      "cfb21ac467f21954903948d4e6c9a2a1", "bd9fd6aab18bbba8096746f9ed35a640",
+      "d42ec4f13f042014c5b4af5f03d19034", "8a7fdee2b57ac641e03365625850f5d6",
+      "d18638521275b3aa9dd463d067d6a390", "a7a71c433d85576198b52608c99cab47",
+      "96e2a2443bf8cfe32d7590c5011c7523", "6fbe7cd83208937229c11a8e3be5e1e9",
+      "ecf66dac310e332a108be639171b5cf3", "327b1656c61d795c30a914f52e3d7629",
+      "157d26190bde1a6f34680708bff5d02e", "d927bba0073263a7914a4076a5edfe29",
+      "b88930ec68e5e49da8204ef21635cea2", "58e174ed0036b1ac1f5a9bdd44860222",
+      "415055dfa80c6fe7c12e4d16cac22168", "9058939bfb5998d6ecd71d87a52be893",
+      "847894efa35f1528732ec3584f62f86f", "8aa9b33c0d9695690cb4088c32f31214",
+      "11e28ab9a3192a2bc9ffd3fd0a466a13", "f246009c5efafd9310fa8e365d23cab4",
+      "2381fcd9ee0ffceba5509879d9f5709d", "1cf1dc7c7c6ecf1f3381455c99e2239e",
+      "e74601883b53791045f50bbcbbbcc803", "22926eecefa94f9f39b9bb9dbb183e5b",
+      "128c24f5a5342aebb21bdaa87907daf7", "11c39f844a2e51cc4c80ffe1afa58e70",
+      "2c0548cff2145031e304d8f97abfd751", "66e1a3daf84029341b999b18bf86e5b3",
+      "0f790f210d5366bbad7eb352b4909dd9",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+struct MaskBlendTestParam {
+  MaskBlendTestParam(BlockSize block_size, int subsampling_x, int subsampling_y,
+                     bool is_inter_intra, bool is_wedge_inter_intra)
+      : block_size(block_size),
+        width(kBlockWidthPixels[block_size]),
+        height(kBlockHeightPixels[block_size]),
+        subsampling_x(subsampling_x),
+        subsampling_y(subsampling_y),
+        is_inter_intra(is_inter_intra),
+        is_wedge_inter_intra(is_wedge_inter_intra) {}
+  BlockSize block_size;
+  int width;
+  int height;
+  int subsampling_x;
+  int subsampling_y;
+  bool is_inter_intra;
+  bool is_wedge_inter_intra;
+};
+
+std::ostream& operator<<(std::ostream& os, const MaskBlendTestParam& param) {
+  return os << ToString(param.block_size)
+            << ", subsampling(x/y): " << param.subsampling_x << "/"
+            << param.subsampling_y
+            << ", is_inter_intra: " << param.is_inter_intra
+            << ", is_wedge_inter_intra: " << param.is_wedge_inter_intra;
+}
+
+template <int bitdepth, typename Pixel>
+class MaskBlendTest : public testing::TestWithParam<MaskBlendTestParam>,
+                      public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  MaskBlendTest() = default;
+  ~MaskBlendTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    MaskBlendInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
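+      // The plain C functions were installed by MaskBlendInit_C() above.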
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      MaskBlendInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      MaskBlendInit_SSE4_1();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    func_ = (param_.is_inter_intra && !param_.is_wedge_inter_intra)
+                ? dsp->mask_blend[0][param_.is_inter_intra]
+                : dsp->mask_blend[param_.subsampling_x + param_.subsampling_y]
+                                 [param_.is_inter_intra];
+    func_8bpp_ = dsp->inter_intra_mask_blend_8bpp[param_.is_wedge_inter_intra
+                                                      ? param_.subsampling_x +
+                                                            param_.subsampling_y
+                                                      : 0];
+  }
+
+ protected:
+  int GetDigestIdOffset() const {
+    // id retrieves the corresponding digest from the lookup table for a given
+    // set of input parameters. It is derived from the block size plus an
+    // offset (id_offset).
+    // For example, in kMaskBlendTestParam, the parameter set
+    // (8, 8, 0, 0, false, false) corresponds to the first entry in the
+    // digest lookup table, where id == 0.
+    // (8, 8, 1, 0, false, false) corresponds to id == 17.
+    // (8, 8, 1, 1, false, false) corresponds to id == 34.
+    // (8, 8, 0, 0, true, false) corresponds to id == 51.
+    // id_offset denotes the offset for the different mode combinations
+    // (is_inter_intra, is_wedge_inter_intra).
+    // ...
+    if (!param_.is_inter_intra && !param_.is_wedge_inter_intra) {
+      return param_.subsampling_x * 17 + param_.subsampling_y * 17;
+    }
+    if (param_.is_inter_intra && !param_.is_wedge_inter_intra) {
+      return 51 + param_.subsampling_x * 7 + param_.subsampling_y * 7;
+    }
+    if (param_.is_inter_intra && param_.is_wedge_inter_intra) {
+      return 72 + param_.subsampling_x * 7 + param_.subsampling_y * 7;
+    }
+    return 0;
+  }
+
+  int GetDigestId() const {
+    // Only 8x8 and larger blocks are tested.
+    int block_size_adjustment =
+        static_cast<int>(param_.block_size > kBlock16x4);
+    if (param_.is_inter_intra || param_.is_wedge_inter_intra) {
+      // 4:1/1:4 blocks are invalid for these modes.
+      block_size_adjustment += static_cast<int>(param_.block_size > kBlock8x32);
+      block_size_adjustment +=
+          static_cast<int>(param_.block_size > kBlock16x64);
+      block_size_adjustment += static_cast<int>(param_.block_size > kBlock32x8);
+      block_size_adjustment +=
+          static_cast<int>(param_.block_size > kBlock64x16);
+    }
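+    // For example, when is_inter_intra and is_wedge_inter_intra are false,
+    // kBlock8x8 maps to id_offset + 0 and kBlock8x16 to id_offset + 1, while
+    // kBlock16x8 maps to id_offset + 3: the untested kBlock16x4, which sits
+    // between kBlock8x32 and kBlock16x8 in the BlockSize enum, is skipped via
+    // block_size_adjustment.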
+    return GetDigestIdOffset() + param_.block_size - kBlock8x8 -
+           block_size_adjustment;
+  }
+
+  void Test(const char* digest, int num_runs);
+
+ private:
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  static constexpr int kStride = kMaxSuperBlockSizeInPixels;
+  static constexpr int kDestStride = kMaxSuperBlockSizeInPixels * sizeof(Pixel);
+  const MaskBlendTestParam param_ = GetParam();
+  alignas(kMaxAlignment) PredType
+      source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+  uint8_t source1_8bpp_[kMaxSuperBlockSizeInPixels *
+                        kMaxSuperBlockSizeInPixels] = {};
+  alignas(kMaxAlignment) PredType
+      source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+  uint8_t source2_8bpp_[kMaxSuperBlockSizeInPixels *
+                        kMaxSuperBlockSizeInPixels] = {};
+  uint8_t source2_8bpp_cache_[kMaxSuperBlockSizeInPixels *
+                              kMaxSuperBlockSizeInPixels] = {};
+  uint8_t mask_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+  uint8_t dest_[sizeof(Pixel) * kMaxSuperBlockSizeInPixels *
+                kMaxSuperBlockSizeInPixels] = {};
+  dsp::MaskBlendFunc func_;
+  dsp::InterIntraMaskBlendFunc8bpp func_8bpp_;
+};
+
+template <int bitdepth, typename Pixel>
+void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest,
+                                          const int num_runs) {
+  if (func_ == nullptr && func_8bpp_ == nullptr) return;
+  const int width = param_.width >> param_.subsampling_x;
+  const int height = param_.height >> param_.subsampling_y;
+
+  // Add id offset to seed just to add more randomness to input blocks.
+  // If we use the same seed for different block sizes, the generated input
+  // blocks are repeated. For example, if input size is 8x8, the generated
+  // block is exactly the upper left half of the generated 16x16 block.
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() +
+                             GetDigestIdOffset());
+  PredType* src_1 = source1_;
+  uint8_t* src_1_8bpp = source1_8bpp_;
+  PredType* src_2 = source2_;
+  uint8_t* src_2_8bpp = source2_8bpp_;
+  const ptrdiff_t src_2_stride = param_.is_inter_intra ? kStride : width;
+  const ptrdiff_t mask_stride = param_.width;
+  uint8_t* mask_row = mask_;
+  const int range_mask = (1 << (bitdepth)) - 1;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      src_1[x] = static_cast<PredType>(rnd.Rand16() & range_mask);
+      src_2[x] = static_cast<PredType>(rnd.Rand16() & range_mask);
+      if (param_.is_inter_intra && bitdepth == 8) {
+        src_1_8bpp[x] = src_1[x];
+        src_2_8bpp[x] = src_2[x];
+      }
+      if (!param_.is_inter_intra) {
+        // Implies isCompound == true.
+        constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+        const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+        const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+        src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+        src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+      }
+    }
+    src_1 += width;
+    src_1_8bpp += width;
+    src_2 += src_2_stride;
+    src_2_8bpp += src_2_stride;
+  }
+  // The mask should be set up regardless of subsampling.
+  for (int y = 0; y < param_.height; ++y) {
+    for (int x = 0; x < param_.width; ++x) {
+      mask_row[x] = rnd.Rand8() & 63;
+      mask_row[x] += rnd.Rand8() & 1;  // Range of mask is [0, 64].
+    }
+    mask_row += mask_stride;
+  }
+
+  absl::Duration elapsed_time;
+  for (int i = 0; i < num_runs; ++i) {
+    const absl::Time start = absl::Now();
+    if (param_.is_inter_intra && bitdepth == 8) {
+      ASSERT_EQ(func_, nullptr);
+      static_assert(sizeof(source2_8bpp_cache_) == sizeof(source2_8bpp_), "");
+      // source2_8bpp_ is modified in the call.
+      memcpy(source2_8bpp_cache_, source2_8bpp_, sizeof(source2_8bpp_));
+      func_8bpp_(source1_8bpp_, source2_8bpp_, src_2_stride, mask_, mask_stride,
+                 width, height);
+      for (int y = 0; y < height; ++y) {
+        for (int x = 0; x < width; ++x) {
+          dest_[y * kDestStride + x] = source2_8bpp_[y * src_2_stride + x];
+        }
+      }
+      memcpy(source2_8bpp_, source2_8bpp_cache_, sizeof(source2_8bpp_));
+    } else {
+      if (bitdepth != 8) {
+        ASSERT_EQ(func_8bpp_, nullptr);
+      }
+      ASSERT_NE(func_, nullptr);
+      func_(source1_, source2_, src_2_stride, mask_, mask_stride, width, height,
+            dest_, kDestStride);
+    }
+    elapsed_time += absl::Now() - start;
+  }
+
+  test_utils::CheckMd5Digest("MaskBlend", ToString(param_.block_size), digest,
+                             dest_, sizeof(dest_), elapsed_time);
+}
+
+const MaskBlendTestParam kMaskBlendTestParam[] = {
+    // is_inter_intra = false, is_wedge_inter_intra = false.
+    // Block size range is from 8x8 to 128x128.
+    MaskBlendTestParam(kBlock8x8, 0, 0, false, false),
+    MaskBlendTestParam(kBlock8x16, 0, 0, false, false),
+    MaskBlendTestParam(kBlock8x32, 0, 0, false, false),
+    MaskBlendTestParam(kBlock16x8, 0, 0, false, false),
+    MaskBlendTestParam(kBlock16x16, 0, 0, false, false),
+    MaskBlendTestParam(kBlock16x32, 0, 0, false, false),
+    MaskBlendTestParam(kBlock16x64, 0, 0, false, false),
+    MaskBlendTestParam(kBlock32x8, 0, 0, false, false),
+    MaskBlendTestParam(kBlock32x16, 0, 0, false, false),
+    MaskBlendTestParam(kBlock32x32, 0, 0, false, false),
+    MaskBlendTestParam(kBlock32x64, 0, 0, false, false),
+    MaskBlendTestParam(kBlock64x16, 0, 0, false, false),
+    MaskBlendTestParam(kBlock64x32, 0, 0, false, false),
+    MaskBlendTestParam(kBlock64x64, 0, 0, false, false),
+    MaskBlendTestParam(kBlock64x128, 0, 0, false, false),
+    MaskBlendTestParam(kBlock128x64, 0, 0, false, false),
+    MaskBlendTestParam(kBlock128x128, 0, 0, false, false),
+    MaskBlendTestParam(kBlock8x8, 1, 0, false, false),
+    MaskBlendTestParam(kBlock8x16, 1, 0, false, false),
+    MaskBlendTestParam(kBlock8x32, 1, 0, false, false),
+    MaskBlendTestParam(kBlock16x8, 1, 0, false, false),
+    MaskBlendTestParam(kBlock16x16, 1, 0, false, false),
+    MaskBlendTestParam(kBlock16x32, 1, 0, false, false),
+    MaskBlendTestParam(kBlock16x64, 1, 0, false, false),
+    MaskBlendTestParam(kBlock32x8, 1, 0, false, false),
+    MaskBlendTestParam(kBlock32x16, 1, 0, false, false),
+    MaskBlendTestParam(kBlock32x32, 1, 0, false, false),
+    MaskBlendTestParam(kBlock32x64, 1, 0, false, false),
+    MaskBlendTestParam(kBlock64x16, 1, 0, false, false),
+    MaskBlendTestParam(kBlock64x32, 1, 0, false, false),
+    MaskBlendTestParam(kBlock64x64, 1, 0, false, false),
+    MaskBlendTestParam(kBlock64x128, 1, 0, false, false),
+    MaskBlendTestParam(kBlock128x64, 1, 0, false, false),
+    MaskBlendTestParam(kBlock128x128, 1, 0, false, false),
+    MaskBlendTestParam(kBlock8x8, 1, 1, false, false),
+    MaskBlendTestParam(kBlock8x16, 1, 1, false, false),
+    MaskBlendTestParam(kBlock8x32, 1, 1, false, false),
+    MaskBlendTestParam(kBlock16x8, 1, 1, false, false),
+    MaskBlendTestParam(kBlock16x16, 1, 1, false, false),
+    MaskBlendTestParam(kBlock16x32, 1, 1, false, false),
+    MaskBlendTestParam(kBlock16x64, 1, 1, false, false),
+    MaskBlendTestParam(kBlock32x8, 1, 1, false, false),
+    MaskBlendTestParam(kBlock32x16, 1, 1, false, false),
+    MaskBlendTestParam(kBlock32x32, 1, 1, false, false),
+    MaskBlendTestParam(kBlock32x64, 1, 1, false, false),
+    MaskBlendTestParam(kBlock64x16, 1, 1, false, false),
+    MaskBlendTestParam(kBlock64x32, 1, 1, false, false),
+    MaskBlendTestParam(kBlock64x64, 1, 1, false, false),
+    MaskBlendTestParam(kBlock64x128, 1, 1, false, false),
+    MaskBlendTestParam(kBlock128x64, 1, 1, false, false),
+    MaskBlendTestParam(kBlock128x128, 1, 1, false, false),
+    // is_inter_intra = true, is_wedge_inter_intra = false.
+    // Block size range is from 8x8 to 32x32 (no 4:1/1:4 blocks; Section
+    // 5.11.28 Read inter intra syntax).
+    MaskBlendTestParam(kBlock8x8, 0, 0, true, false),
+    MaskBlendTestParam(kBlock8x16, 0, 0, true, false),
+    MaskBlendTestParam(kBlock16x8, 0, 0, true, false),
+    MaskBlendTestParam(kBlock16x16, 0, 0, true, false),
+    MaskBlendTestParam(kBlock16x32, 0, 0, true, false),
+    MaskBlendTestParam(kBlock32x16, 0, 0, true, false),
+    MaskBlendTestParam(kBlock32x32, 0, 0, true, false),
+    MaskBlendTestParam(kBlock8x8, 1, 0, true, false),
+    MaskBlendTestParam(kBlock8x16, 1, 0, true, false),
+    MaskBlendTestParam(kBlock16x8, 1, 0, true, false),
+    MaskBlendTestParam(kBlock16x16, 1, 0, true, false),
+    MaskBlendTestParam(kBlock16x32, 1, 0, true, false),
+    MaskBlendTestParam(kBlock32x16, 1, 0, true, false),
+    MaskBlendTestParam(kBlock32x32, 1, 0, true, false),
+    MaskBlendTestParam(kBlock8x8, 1, 1, true, false),
+    MaskBlendTestParam(kBlock8x16, 1, 1, true, false),
+    MaskBlendTestParam(kBlock16x8, 1, 1, true, false),
+    MaskBlendTestParam(kBlock16x16, 1, 1, true, false),
+    MaskBlendTestParam(kBlock16x32, 1, 1, true, false),
+    MaskBlendTestParam(kBlock32x16, 1, 1, true, false),
+    MaskBlendTestParam(kBlock32x32, 1, 1, true, false),
+    // is_inter_intra = true, is_wedge_inter_intra = true.
+    // Block size range is from 8x8 to 32x32 (no 4:1/1:4 blocks; Section
+    // 5.11.28 Read inter intra syntax).
+    MaskBlendTestParam(kBlock8x8, 0, 0, true, true),
+    MaskBlendTestParam(kBlock8x16, 0, 0, true, true),
+    MaskBlendTestParam(kBlock16x8, 0, 0, true, true),
+    MaskBlendTestParam(kBlock16x16, 0, 0, true, true),
+    MaskBlendTestParam(kBlock16x32, 0, 0, true, true),
+    MaskBlendTestParam(kBlock32x16, 0, 0, true, true),
+    MaskBlendTestParam(kBlock32x32, 0, 0, true, true),
+    MaskBlendTestParam(kBlock8x8, 1, 0, true, true),
+    MaskBlendTestParam(kBlock8x16, 1, 0, true, true),
+    MaskBlendTestParam(kBlock16x8, 1, 0, true, true),
+    MaskBlendTestParam(kBlock16x16, 1, 0, true, true),
+    MaskBlendTestParam(kBlock16x32, 1, 0, true, true),
+    MaskBlendTestParam(kBlock32x16, 1, 0, true, true),
+    MaskBlendTestParam(kBlock32x32, 1, 0, true, true),
+    MaskBlendTestParam(kBlock8x8, 1, 1, true, true),
+    MaskBlendTestParam(kBlock8x16, 1, 1, true, true),
+    MaskBlendTestParam(kBlock16x8, 1, 1, true, true),
+    MaskBlendTestParam(kBlock16x16, 1, 1, true, true),
+    MaskBlendTestParam(kBlock16x32, 1, 1, true, true),
+    MaskBlendTestParam(kBlock32x16, 1, 1, true, true),
+    MaskBlendTestParam(kBlock32x32, 1, 1, true, true),
+};
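+// In total: 51 compound entries (17 block sizes x 3 subsampling combinations),
+// plus 21 inter-intra and 21 wedge inter-intra entries (7 block sizes x 3
+// subsampling combinations each), matching the 93 digests per bitdepth and the
+// offsets 51 and 72 used in GetDigestIdOffset().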
+
+using MaskBlendTest8bpp = MaskBlendTest<8, uint8_t>;
+
+TEST_P(MaskBlendTest8bpp, Blending) { Test(GetDigest8bpp(GetDigestId()), 1); }
+
+TEST_P(MaskBlendTest8bpp, DISABLED_Speed) {
+  Test(GetDigest8bpp(GetDigestId()), kNumSpeedTests);
+}
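+// Note: the DISABLED_Speed tests are skipped by default; pass
+// --gtest_also_run_disabled_tests (optionally with --gtest_filter) to run
+// them.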
+
+INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest8bpp,
+                         testing::ValuesIn(kMaskBlendTestParam));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MaskBlendTest8bpp,
+                         testing::ValuesIn(kMaskBlendTestParam));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest8bpp,
+                         testing::ValuesIn(kMaskBlendTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using MaskBlendTest10bpp = MaskBlendTest<10, uint16_t>;
+
+TEST_P(MaskBlendTest10bpp, Blending) { Test(GetDigest10bpp(GetDigestId()), 1); }
+
+TEST_P(MaskBlendTest10bpp, DISABLED_Speed) {
+  Test(GetDigest10bpp(GetDigestId()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest10bpp,
+                         testing::ValuesIn(kMaskBlendTestParam));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest10bpp,
+                         testing::ValuesIn(kMaskBlendTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MaskBlendTest10bpp,
+                         testing::ValuesIn(kMaskBlendTestParam));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using MaskBlendTest12bpp = MaskBlendTest<12, uint16_t>;
+
+TEST_P(MaskBlendTest12bpp, Blending) { Test(GetDigest12bpp(GetDigestId()), 1); }
+
+TEST_P(MaskBlendTest12bpp, DISABLED_Speed) {
+  Test(GetDigest12bpp(GetDigestId()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest12bpp,
+                         testing::ValuesIn(kMaskBlendTestParam));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/motion_field_projection.cc b/src/dsp/motion_field_projection.cc
new file mode 100644
index 0000000..7c17b8e
--- /dev/null
@@ -0,0 +1,116 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Silence unused function warnings when MotionFieldProjectionKernel_C is
+// not used.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+    !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel)
+
+// 7.9.2.
+void MotionFieldProjectionKernel_C(const ReferenceInfo& reference_info,
+                                   int reference_to_current_with_sign,
+                                   int dst_sign, int y8_start, int y8_end,
+                                   int x8_start, int x8_end,
+                                   TemporalMotionField* motion_field) {
+  const ptrdiff_t stride = motion_field->mv.columns();
+  // The column range has to be widened by kProjectionMvMaxHorizontalOffset on
+  // both sides, since coordinates in the widened range can project into
+  // [x8_start, x8_end) as position_x8.
+  const int adjusted_x8_start =
+      std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+  const int adjusted_x8_end = std::min(
+      x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+  const int8_t* const reference_offsets =
+      reference_info.relative_distance_to.data();
+  const bool* const skip_references = reference_info.skip_references.data();
+  const int16_t* const projection_divisions =
+      reference_info.projection_divisions.data();
+  const ReferenceFrameType* source_reference_types =
+      &reference_info.motion_field_reference_frame[y8_start][0];
+  const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+  int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+  MotionVector* dst_mv = motion_field->mv[y8_start];
+  assert(stride == motion_field->reference_offset.columns());
+  assert((y8_start & 7) == 0);
+
+  int y8 = y8_start;
+  do {
+    const int y8_floor = (y8 & ~7) - y8;
+    const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8);
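+    // position_y8 below is an offset relative to the current row y8, so the
+    // bounds [y8_floor, y8_ceiling) confine projected rows to the enclosing
+    // 8-aligned row group.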
+    int x8 = adjusted_x8_start;
+    do {
+      const int source_reference_type = source_reference_types[x8];
+      if (skip_references[source_reference_type]) continue;
+      MotionVector projection_mv;
+      // reference_to_current_with_sign could be 0.
+      GetMvProjection(mv[x8], reference_to_current_with_sign,
+                      projection_divisions[source_reference_type],
+                      &projection_mv);
+      // Do not update the motion vector if the block position is not valid,
+      // or if position_x8 is outside the current range [x8_start, x8_end).
+      // Note that position_y8 will always be within the range of y8_start and
+      // y8_end.
+      const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+      if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue;
+      const int x8_base = x8 & ~7;
+      const int x8_floor =
+          std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+      const int x8_ceiling =
+          std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+      const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+      if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+      dst_mv[position_y8 * stride + position_x8] = mv[x8];
+      dst_reference_offset[position_y8 * stride + position_x8] =
+          reference_offsets[source_reference_type];
+    } while (++x8 < adjusted_x8_end);
+    source_reference_types += stride;
+    mv += stride;
+    dst_reference_offset += stride;
+    dst_mv += stride;
+  } while (++y8 < y8_end);
+}
+
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+        // !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel)
+
+}  // namespace
+
+void MotionFieldProjectionInit_C() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+    !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel)
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_C;
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/motion_field_projection.h b/src/dsp/motion_field_projection.h
new file mode 100644
index 0000000..36de459
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
+#define LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/motion_field_projection_neon.h"
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_field_projection_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
diff --git a/src/dsp/motion_field_projection_test.cc b/src/dsp/motion_field_projection_test.cc
new file mode 100644
index 0000000..8a57696
--- /dev/null
@@ -0,0 +1,212 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMotionFieldWidth = 160;
+constexpr int kMotionFieldHeight = 120;
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
+class MotionFieldProjectionTest : public testing::TestWithParam<int> {
+ public:
+  MotionFieldProjectionTest() = default;
+  MotionFieldProjectionTest(const MotionFieldProjectionTest&) = delete;
+  MotionFieldProjectionTest& operator=(const MotionFieldProjectionTest&) =
+      delete;
+  ~MotionFieldProjectionTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(8);
+    MotionFieldProjectionInit_C();
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      MotionFieldProjectionInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      MotionFieldProjectionInit_SSE4_1();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    const Dsp* const dsp = GetDspTable(8);
+    ASSERT_NE(dsp, nullptr);
+    target_motion_field_projection_kernel_func_ =
+        dsp->motion_field_projection_kernel;
+  }
+
+  void SetInputData(int motion_field_width, libvpx_test::ACMRandom* rnd);
+  void TestRandomValues(bool speed);
+
+ private:
+  MotionFieldProjectionKernelFunc target_motion_field_projection_kernel_func_;
+  ReferenceInfo reference_info_;
+  TemporalMotionField motion_field_;
+};
+
+void MotionFieldProjectionTest::SetInputData(
+    const int motion_field_width, libvpx_test::ACMRandom* const rnd) {
+  ASSERT_TRUE(reference_info_.Reset(kMotionFieldHeight, motion_field_width));
+  ASSERT_TRUE(motion_field_.mv.Reset(kMotionFieldHeight, motion_field_width,
+                                     /*zero_initialize=*/false));
+  ASSERT_TRUE(motion_field_.reference_offset.Reset(kMotionFieldHeight,
+                                                   motion_field_width,
+                                                   /*zero_initialize=*/false));
+  constexpr int order_hint_bits = 6;
+  const unsigned int order_hint_shift_bits = Mod32(32 - order_hint_bits);
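+  // With 6 order hint bits, GetRelativeDistance() sign-extends the 6-bit
+  // difference, producing relative distances in [-32, 31].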
+  const unsigned int current_frame_order_hint =
+      rnd->Rand8() & ((1 << order_hint_bits) - 1);  // [0, 63]
+  uint8_t reference_frame_order_hint = 0;
+  reference_info_.relative_distance_to[0] = 0;
+  reference_info_.skip_references[kReferenceFrameIntra] = true;
+  reference_info_.projection_divisions[kReferenceFrameIntra] = 0;
+  for (int i = kReferenceFrameLast; i < kNumReferenceFrameTypes; ++i) {
+    reference_frame_order_hint =
+        rnd->Rand8() & ((1 << order_hint_bits) - 1);  // [0, 63]
+    const int relative_distance_to =
+        GetRelativeDistance(current_frame_order_hint,
+                            reference_frame_order_hint, order_hint_shift_bits);
+    reference_info_.relative_distance_to[i] = relative_distance_to;
+    reference_info_.skip_references[i] =
+        relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0;
+    reference_info_.projection_divisions[i] =
+        reference_info_.skip_references[i]
+            ? 0
+            : kProjectionMvDivisionLookup[relative_distance_to];
+  }
+  for (int y = 0; y < kMotionFieldHeight; ++y) {
+    for (int x = 0; x < motion_field_width; ++x) {
+      reference_info_.motion_field_reference_frame[y][x] =
+          static_cast<ReferenceFrameType>(rnd->Rand16() &
+                                          kReferenceFrameAlternate);
+      reference_info_.motion_field_mv[y][x].mv[0] = rnd->Rand16Signed() / 512;
+      reference_info_.motion_field_mv[y][x].mv[1] = rnd->Rand16Signed() / 512;
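+      // The division by 512 above keeps the random MVs in roughly [-64, 63]
+      // (1/8-pel units) so that projected positions stay close to the block.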
+    }
+  }
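+  // Prefill the output motion field with kInvalidMvValue and -128 so that the
+  // digests are sensitive to exactly which positions the kernel writes.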
+  MotionVector invalid_mv;
+  invalid_mv.mv[0] = kInvalidMvValue;
+  invalid_mv.mv[1] = kInvalidMvValue;
+  MotionVector* const motion_field_mv = &motion_field_.mv[0][0];
+  int8_t* const motion_field_reference_offset =
+      &motion_field_.reference_offset[0][0];
+  std::fill(motion_field_mv, motion_field_mv + motion_field_.mv.size(),
+            invalid_mv);
+  std::fill(
+      motion_field_reference_offset,
+      motion_field_reference_offset + motion_field_.reference_offset.size(),
+      -128);
+}
+
+void MotionFieldProjectionTest::TestRandomValues(bool speed) {
+  static const char* const kDigestMv[8] = {
+      "87c2a74538f5c015809492ac2e521075", "ba7b4a5d82c6083b13a5b02eb7655ab7",
+      "8c37d96bf1744d5553860bf44a4f60a3", "720aa644f85e48995db9785e87cd02e3",
+      "9289c0c66524bb77a605870d78285f35", "f0326509885c2b2c89feeac53698cd47",
+      "6b9ad1d672dec825cb1803063d35badc", "dfe06c57cc9c70d27246df7fd0afa0b2"};
+  static const char* const kDigestReferenceOffset[8] = {
+      "d8d1384268d7cf5c4514b39c329f94fb", "7f30e79ceb064befbad64a20d206a540",
+      "61e2eb5644edbd3a91b939403edc891e", "7a018f1bf88193e86934241af445dc36",
+      "2d6166bf8bbe1db77baf687ecf71d028", "95fee61f0219e06076d6f0e1073b1a4e",
+      "64d0a63751267bdc573cab761f1fe685", "906a99e0e791dbcb9183c9b68ecc4ea3"};
+  const int num_tests = speed ? 2000 : 1;
+  if (target_motion_field_projection_kernel_func_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  for (int width_idx = 0; width_idx < 8; ++width_idx) {
+    const int motion_field_width = kMotionFieldWidth + width_idx;
+    SetInputData(motion_field_width, &rnd);
+    const int dst_sign = ((rnd.Rand16() & 1) != 0) ? 0 : -1;
+    const int reference_to_current_with_sign =
+        rnd.PseudoUniform(2 * kMaxFrameDistance + 1) - kMaxFrameDistance;
+    assert(std::abs(reference_to_current_with_sign) <= kMaxFrameDistance);
+    // y8 and x8 advance in steps of at least 16; only the final chunk may be
+    // smaller.
+    for (int step = 16; step <= 80; step += 16) {
+      const absl::Time start = absl::Now();
+      for (int k = 0; k < num_tests; ++k) {
+        for (int y8 = 0; y8 < kMotionFieldHeight; y8 += step) {
+          const int y8_end = std::min(y8 + step, kMotionFieldHeight);
+          for (int x8 = 0; x8 < motion_field_width; x8 += step) {
+            const int x8_end = std::min(x8 + step, motion_field_width);
+            target_motion_field_projection_kernel_func_(
+                reference_info_, reference_to_current_with_sign, dst_sign, y8,
+                y8_end, x8, x8_end, &motion_field_);
+          }
+        }
+      }
+      const absl::Duration elapsed_time = absl::Now() - start;
+      test_utils::CheckMd5Digest(
+          "MotionFieldProjectionKernel",
+          absl::StrFormat("(mv) width %d  step %d", motion_field_width, step)
+              .c_str(),
+          kDigestMv[width_idx], motion_field_.mv[0],
+          sizeof(motion_field_.mv[0][0]) * motion_field_.mv.size(),
+          elapsed_time);
+      test_utils::CheckMd5Digest(
+          "MotionFieldProjectionKernel",
+          absl::StrFormat("(ref offset) width %d  step %d", motion_field_width,
+                          step)
+              .c_str(),
+          kDigestReferenceOffset[width_idx], motion_field_.reference_offset[0],
+          sizeof(motion_field_.reference_offset[0][0]) *
+              motion_field_.reference_offset.size(),
+          elapsed_time);
+    }
+  }
+}
+
+TEST_P(MotionFieldProjectionTest, Correctness) { TestRandomValues(false); }
+
+TEST_P(MotionFieldProjectionTest, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, MotionFieldProjectionTest, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MotionFieldProjectionTest, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MotionFieldProjectionTest, testing::Values(0));
+#endif
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/motion_vector_search.cc b/src/dsp/motion_vector_search.cc
new file mode 100644
index 0000000..205a1b6
--- /dev/null
@@ -0,0 +1,187 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Silence unused function warnings when the C functions are not used.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+    !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch)
+
+void MvProjectionCompoundLowPrecision_C(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+  // Make a local copy of |reference_offsets| to help the compiler.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  int index = 0;
+  do {
+    candidate_mvs[index].mv64 = 0;
+    for (int i = 0; i < 2; ++i) {
+      // The |offsets| non-zero check is usually true and could be omitted.
+      if (offsets[i] != 0) {
+        GetMvProjection(
+            temporal_mvs[index], offsets[i],
+            kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+            &candidate_mvs[index].mv[i]);
+        for (auto& mv : candidate_mvs[index].mv[i].mv) {
+          // The next line is equivalent to:
+          // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
+          mv = (mv - (mv >> 15)) & ~1;
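+          // (mv >> 15) is 0 for non-negative and -1 for negative values, so
+          // the subtraction adds 1 to negative values before the low bit is
+          // cleared; odd values are thus rounded toward zero.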
+        }
+      }
+    }
+  } while (++index < count);
+}
+
+void MvProjectionCompoundForceInteger_C(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+  // Make a local copy of |reference_offsets| to help the compiler.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  int index = 0;
+  do {
+    candidate_mvs[index].mv64 = 0;
+    for (int i = 0; i < 2; ++i) {
+      // The |offsets| non-zero check is usually true and could be omitted.
+      if (offsets[i] != 0) {
+        GetMvProjection(
+            temporal_mvs[index], offsets[i],
+            kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+            &candidate_mvs[index].mv[i]);
+        for (auto& mv : candidate_mvs[index].mv[i].mv) {
+          // The next line is equivalent to:
+          // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+          // const int sign = mv >> 15;
+          // mv = ApplySign(value, sign);
+          mv = (mv + 3 - (mv >> 15)) & ~7;
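+          // Rounds the magnitude to a multiple of 8 (an integer-pel value in
+          // 1/8-pel units); -(mv >> 15) adds 1 for negative values so the
+          // two's-complement masking matches ApplySign on the magnitude.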
+        }
+      }
+    }
+  } while (++index < count);
+}
+
+void MvProjectionCompoundHighPrecision_C(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+  // Make a local copy of |reference_offsets| to help the compiler.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  int index = 0;
+  do {
+    candidate_mvs[index].mv64 = 0;
+    for (int i = 0; i < 2; ++i) {
+      // The |offsets| non-zero check is usually true and could be omitted.
+      if (offsets[i] != 0) {
+        GetMvProjection(
+            temporal_mvs[index], offsets[i],
+            kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+            &candidate_mvs[index].mv[i]);
+      }
+    }
+  } while (++index < count);
+}
+
+void MvProjectionSingleLowPrecision_C(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+  int index = 0;
+  do {
+    GetMvProjection(
+        temporal_mvs[index], reference_offset,
+        kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+        &candidate_mvs[index]);
+    for (auto& mv : candidate_mvs[index].mv) {
+      // The next line is equivalent to:
+      // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
+      mv = (mv - (mv >> 15)) & ~1;
+    }
+  } while (++index < count);
+}
+
+void MvProjectionSingleForceInteger_C(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+  int index = 0;
+  do {
+    GetMvProjection(
+        temporal_mvs[index], reference_offset,
+        kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+        &candidate_mvs[index]);
+    for (auto& mv : candidate_mvs[index].mv) {
+      // The next line is equivalent to:
+      // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+      // const int sign = mv >> 15;
+      // mv = ApplySign(value, sign);
+      mv = (mv + 3 - (mv >> 15)) & ~7;
+    }
+  } while (++index < count);
+}
+
+void MvProjectionSingleHighPrecision_C(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+  int index = 0;
+  do {
+    GetMvProjection(
+        temporal_mvs[index], reference_offset,
+        kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+        &candidate_mvs[index]);
+  } while (++index < count);
+}
+
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+        // !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch)
+
+}  // namespace
+
+void MotionVectorSearchInit_C() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+    !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch)
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_C;
+  dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_C;
+  dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_C;
+  dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_C;
+  dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_C;
+  dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_C;
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/motion_vector_search.h b/src/dsp/motion_vector_search.h
new file mode 100644
index 0000000..ae16726
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
+#define LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/motion_vector_search_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_vector_search_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
diff --git a/src/dsp/motion_vector_search_test.cc b/src/dsp/motion_vector_search_test.cc
new file mode 100644 (file)
index 0000000..5c680d6
--- /dev/null
@@ -0,0 +1,196 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+
+#include <cstdint>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
+class MotionVectorSearchTest : public testing::TestWithParam<int>,
+                               public test_utils::MaxAlignedAllocable {
+ public:
+  MotionVectorSearchTest() = default;
+  MotionVectorSearchTest(const MotionVectorSearchTest&) = delete;
+  MotionVectorSearchTest& operator=(const MotionVectorSearchTest&) = delete;
+  ~MotionVectorSearchTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(8);
+    MotionVectorSearchInit_C();
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      MotionVectorSearchInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      MotionVectorSearchInit_SSE4_1();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    const Dsp* const dsp = GetDspTable(8);
+    ASSERT_NE(dsp, nullptr);
+    mv_projection_compound_[0] = dsp->mv_projection_compound[0];
+    mv_projection_compound_[1] = dsp->mv_projection_compound[1];
+    mv_projection_compound_[2] = dsp->mv_projection_compound[2];
+    mv_projection_single_[0] = dsp->mv_projection_single[0];
+    mv_projection_single_[1] = dsp->mv_projection_single[1];
+    mv_projection_single_[2] = dsp->mv_projection_single[2];
+  }
+
+  void SetInputData(libvpx_test::ACMRandom* rnd);
+  void TestRandomValues(bool speed);
+
+ private:
+  MvProjectionCompoundFunc mv_projection_compound_[3];
+  MvProjectionSingleFunc mv_projection_single_[3];
+  int reference_offsets_[2];
+  alignas(kMaxAlignment)
+      MotionVector temporal_mvs_[kMaxTemporalMvCandidatesWithPadding];
+  int8_t temporal_reference_offsets_[kMaxTemporalMvCandidatesWithPadding];
+  CompoundMotionVector compound_mv_org_[kMaxTemporalMvCandidates + 1]
+                                       [kMaxTemporalMvCandidatesWithPadding];
+  alignas(kMaxAlignment)
+      CompoundMotionVector compound_mv_[kMaxTemporalMvCandidates + 1]
+                                       [kMaxTemporalMvCandidatesWithPadding];
+  MotionVector single_mv_org_[kMaxTemporalMvCandidates + 1]
+                             [kMaxTemporalMvCandidatesWithPadding];
+  alignas(kMaxAlignment)
+      MotionVector single_mv_[kMaxTemporalMvCandidates + 1]
+                             [kMaxTemporalMvCandidatesWithPadding];
+};
+
+void MotionVectorSearchTest::SetInputData(libvpx_test::ACMRandom* const rnd) {
+  reference_offsets_[0] =
+      Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance);
+  reference_offsets_[1] =
+      Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance);
+  for (int i = 0; i < kMaxTemporalMvCandidatesWithPadding; ++i) {
+    temporal_reference_offsets_[i] = rnd->RandRange(kMaxFrameDistance);
+    for (auto& mv : temporal_mvs_[i].mv) {
+      mv = rnd->Rand16Signed() / 8;
+    }
+  }
+  for (int i = 0; i <= kMaxTemporalMvCandidates; ++i) {
+    for (int j = 0; j < kMaxTemporalMvCandidatesWithPadding; ++j) {
+      for (int k = 0; k < 2; ++k) {
+        single_mv_[i][j].mv[k] = rnd->Rand16Signed();
+        for (auto& mv : compound_mv_[i][j].mv[k].mv) {
+          mv = rnd->Rand16Signed();
+        }
+      }
+      compound_mv_org_[i][j] = compound_mv_[i][j];
+      single_mv_org_[i][j] = single_mv_[i][j];
+    }
+  }
+}
+
+void MotionVectorSearchTest::TestRandomValues(bool speed) {
+  static const char* const kDigestCompound[3] = {
+      "74c055b06c3701b2e50f2c964a6130b9", "cab21dd54f0a1bf6e80b58cdcf1fe0a9",
+      "e42de30cd84fa4e7b8581a330ed08a8b"};
+  static const char* const kDigestSingle[3] = {
+      "265ffbb59d0895183f8e2d90b6652c71", "5068d980c4ce42ed3f11963b8aece6cc",
+      "7e699d58df3954a38ff11c8e34151e66"};
+  const int num_tests = speed ? 1000000 : 1;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  for (int function_index = 0; function_index < 3; ++function_index) {
+    SetInputData(&rnd);
+    if (mv_projection_compound_[function_index] == nullptr) continue;
+    const absl::Time start = absl::Now();
+    for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) {
+      const int total_count = count + (count & 1);
+      for (int i = 0; i < num_tests; ++i) {
+        mv_projection_compound_[function_index](
+            temporal_mvs_, temporal_reference_offsets_, reference_offsets_,
+            count, compound_mv_[count]);
+      }
+      // SIMD implementations may compute one element beyond |count|. Restore
+      // any original values that were overwritten.
+      for (int i = count; i < total_count; ++i) {
+        compound_mv_[count][i] = compound_mv_org_[count][i];
+      }
+    }
+    const absl::Duration elapsed_time = absl::Now() - start;
+    test_utils::CheckMd5Digest(
+        "MvProjectionCompound",
+        absl::StrFormat("function_index %d", function_index).c_str(),
+        kDigestCompound[function_index], compound_mv_, sizeof(compound_mv_),
+        elapsed_time);
+  }
+  for (int function_index = 0; function_index < 3; ++function_index) {
+    SetInputData(&rnd);
+    if (mv_projection_single_[function_index] == nullptr) continue;
+    const absl::Time start = absl::Now();
+    for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) {
+      const int total_count = (count + 3) & ~3;
+      for (int i = 0; i < num_tests; ++i) {
+        mv_projection_single_[function_index](
+            temporal_mvs_, temporal_reference_offsets_, reference_offsets_[0],
+            count, single_mv_[count]);
+      }
+      // SIMD implementations may compute up to three elements beyond |count|.
+      // Restore any original values that were overwritten.
+      for (int i = count; i < total_count; ++i) {
+        single_mv_[count][i] = single_mv_org_[count][i];
+      }
+    }
+    const absl::Duration elapsed_time = absl::Now() - start;
+    test_utils::CheckMd5Digest(
+        "MvProjectionSingle",
+        absl::StrFormat("function_index %d", function_index).c_str(),
+        kDigestSingle[function_index], single_mv_, sizeof(single_mv_),
+        elapsed_time);
+  }
+}
+
+TEST_P(MotionVectorSearchTest, Correctness) { TestRandomValues(false); }
+
+TEST_P(MotionVectorSearchTest, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, MotionVectorSearchTest, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MotionVectorSearchTest, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MotionVectorSearchTest, testing::Values(0));
+#endif
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/obmc.cc b/src/dsp/obmc.cc
new file mode 100644 (file)
index 0000000..479cb1d
--- /dev/null
@@ -0,0 +1,155 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+// 7.11.3.10 (from top samples).
+template <typename Pixel>
+void OverlapBlendVertical_C(void* LIBGAV1_RESTRICT const prediction,
+                            const ptrdiff_t prediction_stride, const int width,
+                            const int height,
+                            const void* LIBGAV1_RESTRICT const obmc_prediction,
+                            const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<Pixel*>(prediction);
+  const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
+  const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
+  const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
+  const uint8_t* const mask = kObmcMask + height - 2;
+  assert(width >= 4);
+  assert(height >= 2);
+
+  for (int y = 0; y < height; ++y) {
+    const uint8_t mask_value = mask[y];
+    for (int x = 0; x < width; ++x) {
+      pred[x] = static_cast<Pixel>(RightShiftWithRounding(
+          mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6));
+    }
+    pred += pred_stride;
+    obmc_pred += obmc_pred_stride;
+  }
+}
+
+// 7.11.3.10 (from left samples).
+template <typename Pixel>
+void OverlapBlendHorizontal_C(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<Pixel*>(prediction);
+  const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
+  const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
+  const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
+  const uint8_t* const mask = kObmcMask + width - 2;
+  assert(width >= 2);
+  assert(height >= 4);
+
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const uint8_t mask_value = mask[x];
+      pred[x] = static_cast<Pixel>(RightShiftWithRounding(
+          mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6));
+    }
+    pred += pred_stride;
+    obmc_pred += obmc_pred_stride;
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint8_t>;
+  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C<uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
+  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C<uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+  dsp->obmc_blend[kObmcDirectionHorizontal] =
+      OverlapBlendHorizontal_C<uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_ObmcVertical
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal
+  dsp->obmc_blend[kObmcDirectionHorizontal] =
+      OverlapBlendHorizontal_C<uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+  dsp->obmc_blend[kObmcDirectionHorizontal] =
+      OverlapBlendHorizontal_C<uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_ObmcVertical
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ObmcHorizontal
+  dsp->obmc_blend[kObmcDirectionHorizontal] =
+      OverlapBlendHorizontal_C<uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void ObmcInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/obmc.h b/src/dsp/obmc.h
new file mode 100644 (file)
index 0000000..3b826c7
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_OBMC_H_
+#define LIBGAV1_SRC_DSP_OBMC_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
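+// For example, when obmc_sse4.h defines LIBGAV1_Dsp8bpp_ObmcVertical, the
+// #ifndef guard in obmc.cc skips assigning the C fallback for that entry.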
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/obmc_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order is important because each header checks whether a more capable
+// version has already been selected before setting the base define.
+// clang-format off
+#include "src/dsp/x86/obmc_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend. This function is not thread-safe.
+void ObmcInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_OBMC_H_
diff --git a/src/dsp/obmc.inc b/src/dsp/obmc.inc
new file mode 100644 (file)
index 0000000..001c6ee
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for overlap blend implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+// This is a flat array of masks for each block dimension from 2 to 32. The
+// starting index for each length is length-2.
+constexpr uint8_t kObmcMask[62] = {
+    // Obmc Mask 2
+    45, 64,
+    // Obmc Mask 4
+    39, 50, 59, 64,
+    // Obmc Mask 8
+    36, 42, 48, 53, 57, 61, 64, 64,
+    // Obmc Mask 16
+    34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64,
+    // Obmc Mask 32
+    33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
+    59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
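+//
+// For example, a vertical blend of height 4 reads kObmcMask + 4 - 2 =
+// {39, 50, 59, 64}: row 0 is blended as
+//   pred = (39 * pred + (64 - 39) * obmc_pred + 32) >> 6,
+// while row 3 keeps the original prediction because its mask value is 64.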
diff --git a/src/dsp/obmc_test.cc b/src/dsp/obmc_test.cc
new file mode 100644 (file)
index 0000000..289fd66
--- /dev/null
@@ -0,0 +1,414 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+constexpr int kMaxBlendingBlockSize = 64;
+constexpr int kNumSpeedTests = 2e8;
+
+const char* GetDigest8bpp(int id) {
+  static const char* const kDigest[] = {
+      "c8659acd1e8ecdab06be73f0954fa1ae", "e785f31f2723a193fefd534bd6f6c18f",
+      "751fcd8a345fef1c38a25293c9b528c0", "69af412dfa5e96ad43b79c178cb1c58b",
+      "2766a64622e183bb4614f2018f14fa85", "8d98589a5cef6e68ee8fadf19d420e3c",
+      "19eccf31dd8cf1abcee9414128fe4141", "35019f98e30bcbc6ab624682a0628519",
+      "199c551164e73c100045d7ab033ffdcc", "ad5a5eb2906265690c22741b0715f37b",
+      "e2152dea159249149ff4151111b73ed6", "1edd570bec7e63780d83588f6aacda25",
+      "b24ad192e151b1e0f74d1493004cb1b6", "6c1ce7ed3463cc60870e336f990d4f14",
+      "2e6b7a06da21512dfdd9a517d2988655", "971ba1c41ab13bb341c04f936760f546",
+      "55b803239d9f12888c666c5320450937", "3d0838963f8c95dafbfb8e5e25c865d2",
+      "98a9be6245720d4e0da18115c1a1dbd7", "7e7afe3136ad681b5ea05664fe916548",
+      "33971753243f09106173199b7bae1ef5", "65413f33c19a42c112d395121aa4b3b4",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+
+const char* GetDigestSpeed8bpp(int id) {
+  static const char* const kDigest[] = {
+      "5ea519b616cd2998fbb9b25b4c2660cb", "f23d18197a96de48901738d130a147d9",
+      "07b4140c693947a63865f835089766c4", "62547d29bc4dfb2e201e9d907c09e345",
+      "c3988da521be50aeb9944564001b282b", "d5a8ff9ca1bd49f4260bb497c489b06c",
+      "b3e94f1e33c316759ebf47620327168c", "c5e64a34ca7e55f4daed19cbe4c27049",
+      "3b234eb729e8e79db8692c4cbe1b6667", "f9f3060a44c3a575470f9700b3c3a75b",
+      "e3a1960b0a7238db1184a3f9d8e9a4b2", "ba9938553703d520bc0ade427c397140",
+      "31bf64a6ed1e8002d488c0b9dcffb80a", "9ab1f3ae2e7f70cd27452f30cecfd18e",
+      "eaf25ac79ad70fc17ca96d8fcdf0f939", "9aaa88cb5e6b8757e37c3430bd664e70",
+      "8293874b2794df8fd22f5a35c3de7bee", "e9d6ee9106227c2c67ea9e6a4652e4ad",
+      "29f8a6fc2a650f3945a4ea6d3b975b6d", "8f300a257e913a42666b4921b2b0b5c5",
+      "a526265c4b3c8593736a82ddc1fd1603", "76e248f6756ac96343204b0e48d72a9e",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+  static const char* const kDigest[] = {
+      "6f922e4142b644ca3f1eb0f363a1c34e", "84e7c098a9335b36082fec0bc7203075",
+      "40f00ea6884fea23a3b7fae59e3b02c3", "70cb92d08b4fdb6dd9c7d418cb1455d3",
+      "ed550798b56e70439a93cb48c359e873", "55e0d927b984e78cd51a1961e58a431d",
+      "482a6856b87265a82e4ea3fdadb2d95b", "0be46226ff87d74ff2ce68a83eaf9cca",
+      "bb4461f0131a1693a0a76f21d92a480b", "ea24f78d74c7864fb247c9a98c9b97b6",
+      "d2e70b81882aeb3d9fccef89e7552a9d", "f5d882ee6d9ae6f7dfa467ca99301424",
+      "824ddb98eb4129b3d254c0bc7a64cd73", "5eaaafa8ef9b7ba5e2856a947e5b33df",
+      "071de1494e0f1b2f99266b90bdc43ddd", "c33227a96dad506adc32dacfb371ab78",
+      "e8a632f9fff240c439d4ae6e86795046", "26b90d74f18f9df4427b6180d48db1fc",
+      "e4a01e492ddc0398b5c5b60c81468242", "f1b4f7ab5c8b949e51db104f2e33565a",
+      "b1fb9ecc6a552e2b23ee92e2f3e4122a", "a683d20129a91bb20b904aa20c0499b1",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+
+const char* GetDigestSpeed10bpp(int id) {
+  static const char* const kDigest[] = {
+      "80557576299708005111029cef04da53", "24f84f07f53f61cd46bdcfe1e05ff9b5",
+      "4dd6bc62145baa5357a4cbf6d7a6ef15", "0b7aa27cee43b8ae0c02d07887eaa225",
+      "9e28cdae73ca97433499c31ca79e1d07", "1cacd6466a143f88e736fffaf21e2246",
+      "9c7699626660d8965e06a54282a408f3", "eef893efef62b2eb4aaad06fc462819c",
+      "4965d0a3ff750813df85c0082b21bd4b", "ec10fd79fbf552abc595def392e9a863",
+      "a148bbafdc4466fbb700b31acccca8ac", "5da9d960988549f53b817003b93e4d01",
+      "b4c4f88d1fb54869ce7ff452ca7786a6", "d607f785fce62bad85102054539e7089",
+      "b441761ea2817e4618c594aaa11d670a", "1cc5e08e6d5f9315dbc0369b97af941d",
+      "568cc1a3a67ba4e6e77f54602d0ed3e3", "522f14c068f788bc284a7d1e47d623ed",
+      "b543855cbe384b88861c881853c28192", "5faaafc124e94eedc69dc0f5d33dacac",
+      "13ca4d01bd20085459e6126555e1f7b5", "46d46fae3c8a7d9e4725154d8d2b76d8",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+  static const char* const kDigest[] = {
+      "eb18c776d7b56280f01cca40b04a9c44", "058d4a6ed025eac5dcf7aec3203c0882",
+      "8355884d7470e9c6af9309ab23bee859", "2ba330551ac58d1d034b947d7ab9b59f",
+      "0d25cd773c81e4c57f82513e3b031f01", "b9075f7c3b9a240dbb015a24454eeb71",
+      "563ed8683723d1e4f2746280bca3db0a", "d7125306bd8c952d0f85fe1515ca16a7",
+      "5bf99c7e4a918c9b6a7e251484ea6527", "38ac9c685e8d2bd2771b6f2b38268301",
+      "abc39dbde7470e08b15417ee97c704b2", "37e12753d23b7a8df92b1d32f3170d9f",
+      "9a609776cfa31f64826225d0a6b7afdd", "ccdd89e70e94f751fd891b124c1c3210",
+      "2bbf7b095e26ed4f27e7d05e20117084", "9a1b403c3a7c00da5686bcb87f1270e8",
+      "701d651e391043ab8ebbd0023a430980", "0047f10bdd8321494e8e82597fe2f969",
+      "f97e662d139b2811e3d3227de95135a2", "852933b90d4a70f9254157381ed641e0",
+      "cfcda707ec8e4361ef741dc716888348", "95e34eab83b3159f61685db248c6a881",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+
+const char* GetDigestSpeed12bpp(int id) {
+  static const char* const kDigest[] = {
+      "6c0f37c41d72ce40d95545ac0f08d88a", "8a8efeb7d8b2f852d76d0176b6c6878f",
+      "5757c88d1cdc0cd29c47c346474161f0", "fef8cf06d16ba7357bfc061e43080cd3",
+      "6bd11582448532bce8b91cc8807ab6a0", "1e6dd42eada2d636e210f4e20a771102",
+      "377a0472f45fcb42f1712243ea845530", "e3760f2b6e69c1b40e71ecde711d227c",
+      "6721638d1a5dadb96ddd0ca067c737ca", "3d3a23210a8496a76991bcec5045808b",
+      "2cbd26ecf7d4e927ab569083d3ddb4ca", "7d61af2d7841d1a39a2e930bac166804",
+      "dd929506442fb1f2e67130fe8cdf487b", "c0e57f8d2546d5bcb646a24d09d83d7c",
+      "2989c6487456c92eb003c8e17e904f45", "5cfb60a3be6ee5c41e0f655a3020f687",
+      "28f37d47cb07aa382659ff556a55a4c6", "b6478ab317b11f592deb60d02ce62f2f",
+      "bc78e7250c101f82e794d4fa0ee55025", "24304ed23d336a46f205206d3c5d48ef",
+      "dc1e71d95d06c1086bb7f9e05e38bf39", "32606ef72985e7de608df2e8760784b7",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+struct ObmcTestParam {
+  ObmcTestParam(int width, int height, ObmcDirection blending_direction)
+      : width(width), height(height), blending_direction(blending_direction) {}
+  int width;
+  int height;
+  ObmcDirection blending_direction;
+};
+
+std::ostream& operator<<(std::ostream& os, const ObmcTestParam& param) {
+  return os << "BlockSize" << param.width << "x" << param.height
+            << ", blending_direction: " << ToString(param.blending_direction);
+}
+
+template <int bitdepth, typename Pixel>
+class ObmcBlendTest : public testing::TestWithParam<ObmcTestParam> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  ObmcBlendTest() = default;
+  ~ObmcBlendTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    ObmcInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      ObmcInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      ObmcInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    func_ = dsp->obmc_blend[blending_direction_];
+  }
+
+ protected:
+  int GetDigestId() const {
+    // blending_direction_ == kObmcDirectionVertical assigns ids in parameter
+    // order, (width, height):
+    // (4, 2), id = 0. (4, 4), id = 1. (4, 8), id = 2. (8, 4), id = 3.
+    // ...
+    // blending_direction_ == kObmcDirectionHorizontal: ids start at 11. The
+    // vertical list skips (2, 4) while the horizontal list skips (4, 2), so
+    // the horizontal (2, 4) case needs the extra offset of 1 used below.
+    const int id = (blending_direction_ == kObmcDirectionVertical) ? 0
+                   : (width_ == 2)                                 ? 12
+                                                                   : 11;
+    if (width_ == height_) return id + 3 * (FloorLog2(width_) - 1) - 2;
+    if (width_ < height_) return id + 3 * (FloorLog2(width_) - 1) - 1;
+    return id + 3 * (FloorLog2(height_) - 1);
+  }
+
+  // Note |digest| is only used when |use_fixed_values| is false.
+  void Test(const char* digest, bool use_fixed_values, int value);
+  void TestSpeed(const char* digest, int num_runs);
+
+ private:
+  const int width_ = GetParam().width;
+  const int height_ = GetParam().height;
+  const ObmcDirection blending_direction_ = GetParam().blending_direction;
+  Pixel source1_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {};
+  Pixel source2_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {};
+  dsp::ObmcBlendFunc func_;
+};
+
+template <int bitdepth, typename Pixel>
+void ObmcBlendTest<bitdepth, Pixel>::Test(const char* const digest,
+                                          const bool use_fixed_values,
+                                          const int value) {
+  if (func_ == nullptr) return;
+  if (use_fixed_values) {
+    std::fill(source1_,
+              source1_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value);
+    std::fill(source2_,
+              source2_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value);
+  } else {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    Pixel* src_1 = source1_;
+    Pixel* src_2 = source2_;
+    const int mask = (1 << bitdepth) - 1;
+    for (int y = 0; y < height_; ++y) {
+      for (int x = 0; x < width_; ++x) {
+        src_1[x] = rnd.Rand16() & mask;
+        src_2[x] = rnd.Rand16() & mask;
+      }
+      src_1 += kMaxBlendingBlockSize;
+      src_2 += width_;
+    }
+  }
+  const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel);
+  func_(source1_, stride, width_, height_, source2_,
+        width_ * sizeof(source2_[0]));
+  if (use_fixed_values) {
+    const bool success = test_utils::CompareBlocks(
+        source1_, source2_, width_, height_, kMaxBlendingBlockSize,
+        kMaxBlendingBlockSize, false);
+    EXPECT_TRUE(success);
+  } else {
+    test_utils::CheckMd5Digest(
+        ToString(blending_direction_),
+        absl::StrFormat("%dx%d", width_, height_).c_str(), digest, source1_,
+        sizeof(source1_), absl::Duration());
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ObmcBlendTest<bitdepth, Pixel>::TestSpeed(const char* const digest,
+                                               const int num_runs) {
+  if (func_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  Pixel* src_1 = source1_;
+  Pixel* src_2 = source2_;
+  const int mask = (1 << bitdepth) - 1;
+  for (int y = 0; y < height_; ++y) {
+    for (int x = 0; x < width_; ++x) {
+      src_1[x] = rnd.Rand16() & mask;
+      src_2[x] = rnd.Rand16() & mask;
+    }
+    src_1 += kMaxBlendingBlockSize;
+    src_2 += width_;
+  }
+  const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel);
+  uint8_t dest[sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize];
+  absl::Duration elapsed_time;
+  for (int i = 0; i < num_runs; ++i) {
+    memcpy(dest, source1_,
+           sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize);
+    const absl::Time start = absl::Now();
+    func_(dest, stride, width_, height_, source2_,
+          width_ * sizeof(source2_[0]));
+    elapsed_time += absl::Now() - start;
+  }
+  memcpy(source1_, dest,
+         sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize);
+  test_utils::CheckMd5Digest(ToString(blending_direction_),
+                             absl::StrFormat("%dx%d", width_, height_).c_str(),
+                             digest, source1_, sizeof(source1_), elapsed_time);
+}
+
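+// (4, 2) appears only in the vertical list (which requires width >= 4 and
+// height >= 2) and (2, 4) only in the horizontal list (width >= 2 and
+// height >= 4), matching the asserts in the C implementations in obmc.cc.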
+const ObmcTestParam kObmcTestParam[] = {
+    ObmcTestParam(4, 2, kObmcDirectionVertical),
+    ObmcTestParam(4, 4, kObmcDirectionVertical),
+    ObmcTestParam(4, 8, kObmcDirectionVertical),
+    ObmcTestParam(8, 4, kObmcDirectionVertical),
+    ObmcTestParam(8, 8, kObmcDirectionVertical),
+    ObmcTestParam(8, 16, kObmcDirectionVertical),
+    ObmcTestParam(16, 8, kObmcDirectionVertical),
+    ObmcTestParam(16, 16, kObmcDirectionVertical),
+    ObmcTestParam(16, 32, kObmcDirectionVertical),
+    ObmcTestParam(32, 16, kObmcDirectionVertical),
+    ObmcTestParam(32, 32, kObmcDirectionVertical),
+    ObmcTestParam(2, 4, kObmcDirectionHorizontal),
+    ObmcTestParam(4, 4, kObmcDirectionHorizontal),
+    ObmcTestParam(4, 8, kObmcDirectionHorizontal),
+    ObmcTestParam(8, 4, kObmcDirectionHorizontal),
+    ObmcTestParam(8, 8, kObmcDirectionHorizontal),
+    ObmcTestParam(8, 16, kObmcDirectionHorizontal),
+    ObmcTestParam(16, 8, kObmcDirectionHorizontal),
+    ObmcTestParam(16, 16, kObmcDirectionHorizontal),
+    ObmcTestParam(16, 32, kObmcDirectionHorizontal),
+    ObmcTestParam(32, 16, kObmcDirectionHorizontal),
+    ObmcTestParam(32, 32, kObmcDirectionHorizontal),
+};
+
+using ObmcBlendTest8bpp = ObmcBlendTest<8, uint8_t>;
+
+TEST_P(ObmcBlendTest8bpp, Blending) {
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0);
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1);
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128);
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 255);
+  Test(GetDigest8bpp(GetDigestId()), /*use_fixed_values=*/false, -1);
+}
+
+TEST_P(ObmcBlendTest8bpp, DISABLED_Speed) {
+  TestSpeed(GetDigestSpeed8bpp(GetDigestId()),
+            kNumSpeedTests / (GetParam().height * GetParam().width));
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest8bpp,
+                         testing::ValuesIn(kObmcTestParam));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ObmcBlendTest8bpp,
+                         testing::ValuesIn(kObmcTestParam));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcBlendTest8bpp,
+                         testing::ValuesIn(kObmcTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ObmcBlendTest10bpp = ObmcBlendTest<10, uint16_t>;
+
+TEST_P(ObmcBlendTest10bpp, Blending) {
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0);
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1);
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128);
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, (1 << 10) - 1);
+  Test(GetDigest10bpp(GetDigestId()), /*use_fixed_values=*/false, -1);
+}
+
+TEST_P(ObmcBlendTest10bpp, DISABLED_Speed) {
+  TestSpeed(GetDigestSpeed10bpp(GetDigestId()),
+            kNumSpeedTests / (GetParam().height * GetParam().width));
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest10bpp,
+                         testing::ValuesIn(kObmcTestParam));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ObmcBlendTest10bpp,
+                         testing::ValuesIn(kObmcTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcBlendTest10bpp,
+                         testing::ValuesIn(kObmcTestParam));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ObmcBlendTest12bpp = ObmcBlendTest<12, uint16_t>;
+
+TEST_P(ObmcBlendTest12bpp, Blending) {
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0);
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1);
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128);
+  Test(/*digest=*/nullptr, /*use_fixed_values=*/true, (1 << 12) - 1);
+  Test(GetDigest12bpp(GetDigestId()), /*use_fixed_values=*/false, -1);
+}
+
+TEST_P(ObmcBlendTest12bpp, DISABLED_Speed) {
+  TestSpeed(GetDigestSpeed12bpp(GetDigestId()),
+            kNumSpeedTests / (GetParam().height * GetParam().width));
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest12bpp,
+                         testing::ValuesIn(kObmcTestParam));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/smooth_weights.inc b/src/dsp/smooth_weights.inc
new file mode 100644 (file)
index 0000000..d4ee8a6
--- /dev/null
@@ -0,0 +1,35 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Each row below contains weights used for a corresponding block size. Because
+// the block dimensions are adjacent powers of 2, the starting index of each
+// row is the sum of the preceding dimensions, i.e. the block dimension minus 4
+// (e.g. the weights for block dimension 16 start at index 16 - 4 = 12).
+// The weights need to be declared as uint8_t or uint16_t, depending on the
+// bitdepth, so the values are held in a single canonical place.
+// clang-format off
+    // block dimension = 4
+    255, 149, 85, 64,
+    // block dimension = 8
+    255, 197, 146, 105, 73, 50, 37, 32,
+    // block dimension = 16
+    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+    // block dimension = 32
+    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+    // block dimension = 64
+    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4
+    // clang-format on
diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc
new file mode 100644 (file)
index 0000000..7593729
--- /dev/null
@@ -0,0 +1,129 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+
+#include <cassert>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void SuperRes_C(const void* /*coefficients*/,
+                void* LIBGAV1_RESTRICT const source,
+                const ptrdiff_t source_stride, const int height,
+                const int downscaled_width, const int upscaled_width,
+                const int initial_subpixel_x, const int step,
+                void* LIBGAV1_RESTRICT const dest, ptrdiff_t dest_stride) {
+  assert(step <= 1 << kSuperResScaleBits);
+  auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<Pixel*>(dest);
+  int y = height;
+  do {
+    ExtendLine<Pixel>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                      kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    // If the (original) upscaled_width is <= 9, downscaled_width may be
+    // upscaled_width - 1 (i.e. 8 or 9), and the two may become equal (i.e.
+    // both 4) when subsampled via RightShiftWithRounding. This leads to an
+    // edge case where |step| == 1 << 14.
+    int subpixel_x = initial_subpixel_x;
+    int x = 0;
+    do {
+      int sum = 0;
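+      // |subpixel_x| is a fixed-point source position: the bits above
+      // kSuperResScaleBits select the source pixel and the top fractional
+      // bits select the filter phase in kUpscaleFilterUnsigned.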
+      const Pixel* const src_x = &src[subpixel_x >> kSuperResScaleBits];
+      const int src_x_subpixel =
+          (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
+      // The sign of each tap is: - + - + + - + -
+      sum -= src_x[0] * kUpscaleFilterUnsigned[src_x_subpixel][0];
+      sum += src_x[1] * kUpscaleFilterUnsigned[src_x_subpixel][1];
+      sum -= src_x[2] * kUpscaleFilterUnsigned[src_x_subpixel][2];
+      sum += src_x[3] * kUpscaleFilterUnsigned[src_x_subpixel][3];
+      sum += src_x[4] * kUpscaleFilterUnsigned[src_x_subpixel][4];
+      sum -= src_x[5] * kUpscaleFilterUnsigned[src_x_subpixel][5];
+      sum += src_x[6] * kUpscaleFilterUnsigned[src_x_subpixel][6];
+      sum -= src_x[7] * kUpscaleFilterUnsigned[src_x_subpixel][7];
+      dst[x] = Clip3(RightShiftWithRounding(sum, kFilterBits), 0,
+                     (1 << bitdepth) - 1);
+      subpixel_x += step;
+    } while (++x < upscaled_width);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+void Init8bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->super_res = SuperRes_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
+  dsp->super_res = SuperRes_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->super_res = SuperRes_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_SuperRes
+  dsp->super_res = SuperRes_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->super_res = SuperRes_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_SuperRes
+  dsp->super_res = SuperRes_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void SuperResInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/super_res.h b/src/dsp/super_res.h
new file mode 100644 (file)
index 0000000..2ca9d2b
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_SUPER_RES_H_
+#define LIBGAV1_SRC_DSP_SUPER_RES_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/super_res_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order is important because each header checks whether a more capable
+// version has already been selected before setting the base define.
+// clang-format off
+#include "src/dsp/x86/super_res_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res. This function is not thread-safe.
+void SuperResInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_SUPER_RES_H_
diff --git a/src/dsp/super_res_test.cc b/src/dsp/super_res_test.cc
new file mode 100644 (file)
index 0000000..0c3537c
--- /dev/null
@@ -0,0 +1,303 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 5e5;
+
+const char* GetDigest8bpp(int id) {
+  static const char* const kDigestSuperRes[] = {
+      "52eb4eac1df0c51599d57696405b69d0", "ccb07cc8295fd1440ff2e3b9199ec4f9",
+      "baef34cca795b95f3d1fd81d609da679", "03f1579c2773c8ba9c867316a22b94a3"};
+  return kDigestSuperRes[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+  static const char* const kDigestSuperRes[] = {
+      "8fd78e05d944aeb11fac278b47ee60ba", "948eaecb70fa5614ce1c1c95e9942dc3",
+      "126cd7727e787e0625ec3f5ce97f8fa0", "85c806c41d40b841764bcb54f6d3a712"};
+  return kDigestSuperRes[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+  static const char* const kDigestSuperRes[] = {
+      "9a08983d82df4983700976f18919201b", "6e5edbafcb6c38db37258bf79c00ea32",
+      "f5c57e6d3b518f9585f768ed19b91568", "b5de9b93c8a1a50580e7c7c9456fb615"};
+  return kDigestSuperRes[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+struct SuperResTestParam {
+  SuperResTestParam(int downscaled_width, int upscaled_width)
+      : downscaled_width(downscaled_width), upscaled_width(upscaled_width) {}
+  int downscaled_width;
+  int upscaled_width;
+};
+
+template <int bitdepth, typename Pixel, typename Coefficient>
+class SuperResTest : public testing::TestWithParam<SuperResTestParam>,
+                     public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  SuperResTest() = default;
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    SuperResInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const std::vector<std::string> split_test_name =
+        absl::StrSplit(test_info->name(), '/');
+    ASSERT_TRUE(absl::SimpleAtoi(split_test_name[1], &test_id_));
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      SuperResInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      SuperResInit_SSE4_1();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    super_res_coefficients_ = dsp->super_res_coefficients;
+    func_ = dsp->super_res;
+  }
+
+  void TestComputeSuperRes(int fixed_value, int num_runs);
+
+ private:
+  static constexpr int kHeight = 127;
+  // The maximum width that must be allocated.
+  static constexpr int kUpscaledBufferWidth = 192;
+  // Allow room for the filter taps.
+  static constexpr int kStride =
+      ((kUpscaledBufferWidth + 2 * kSuperResHorizontalBorder + 15) & ~15);
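+  // Rounding up to a multiple of 16 because the SIMD paths may write up to
+  // 15 pixels past |kUpscaledWidth| (see the reset loop in
+  // TestComputeSuperRes).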
+  const int kDownscaledWidth = GetParam().downscaled_width;
+  const int kUpscaledWidth = GetParam().upscaled_width;
+  int test_id_;
+  SuperResCoefficientsFunc super_res_coefficients_;
+  SuperResFunc func_;
+  Pixel source_buffer_[kHeight][kStride];
+  alignas(kMaxAlignment) Pixel dest_buffer_[kHeight][kStride];
+  alignas(kMaxAlignment) Coefficient
+      superres_coefficients_[kSuperResFilterTaps * kUpscaledBufferWidth];
+};
+
+template <int bitdepth, typename Pixel, typename Coefficient>
+void SuperResTest<bitdepth, Pixel, Coefficient>::TestComputeSuperRes(
+    int fixed_value, int num_runs) {
+  if (func_ == nullptr) return;
+  const int superres_width = kDownscaledWidth << kSuperResScaleBits;
+  const int step = (superres_width + kUpscaledWidth / 2) / kUpscaledWidth;
+  const int error = step * kUpscaledWidth - superres_width;
+  const int initial_subpixel_x =
+      ((-((kUpscaledWidth - kDownscaledWidth) << (kSuperResScaleBits - 1)) +
+        DivideBy2(kUpscaledWidth)) /
+           kUpscaledWidth +
+       (1 << (kSuperResExtraBits - 1)) - error / 2) &
+      kSuperResScaleMask;
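+  // |step| and |initial_subpixel_x| mirror the fixed-point derivation of the
+  // AV1 upscaling process, so the tested widths exercise the same positions a
+  // decoder would produce.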
+  if (super_res_coefficients_ != nullptr) {
+    super_res_coefficients_(kUpscaledWidth, initial_subpixel_x, step,
+                            superres_coefficients_);
+  }
+  memset(dest_buffer_, 0, sizeof(dest_buffer_));
+  if (fixed_value != 0) {
+    SetBlock<Pixel>(kHeight, kStride, fixed_value, source_buffer_[0], kStride);
+  } else {
+    // Random values.
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    const int bitdepth_mask = (1 << bitdepth) - 1;
+    for (int y = 0; y < kHeight; ++y) {
+      for (int x = 0; x < kStride; ++x) {
+        source_buffer_[y][x] = rnd.Rand16() & bitdepth_mask;
+      }
+    }
+  }
+  // Offset starting point in the buffer to accommodate line extension.
+  Pixel* src_ptr = source_buffer_[0] + kSuperResHorizontalBorder;
+
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    func_(superres_coefficients_, src_ptr, kStride, kHeight, kDownscaledWidth,
+          kUpscaledWidth, initial_subpixel_x, step, dest_buffer_, kStride);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+
+  if (fixed_value != 0) {
+    for (int y = 0; y < kHeight; ++y) {
+      for (int x = 0; x < kUpscaledWidth; ++x) {
+        EXPECT_TRUE(dest_buffer_[y][x] == fixed_value)
+            << "At location [" << y << ", " << x
+            << "]\nexpected: " << fixed_value
+            << "\nactual: " << dest_buffer_[y][x];
+      }
+    }
+  } else if (num_runs == 1) {
+    // Random values.
+    if ((kUpscaledWidth & 15) != 0) {
+      // The SIMD functions overwrite up to 15 pixels in each row. Reset them.
+      for (int y = 0; y < kHeight; ++y) {
+        for (int x = kUpscaledWidth; x < Align(kUpscaledWidth, 16); ++x) {
+          dest_buffer_[y][x] = 0;
+        }
+      }
+    }
+    const char* expected_digest = nullptr;
+    switch (bitdepth) {
+      case 8:
+        expected_digest = GetDigest8bpp(test_id_);
+        break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      case 10:
+        expected_digest = GetDigest10bpp(test_id_);
+        break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+      case 12:
+        expected_digest = GetDigest12bpp(test_id_);
+        break;
+#endif
+    }
+    ASSERT_NE(expected_digest, nullptr);
+    test_utils::CheckMd5Digest(
+        "SuperRes",
+        absl::StrFormat("width %d, step %d, start %d", kUpscaledWidth, step,
+                        initial_subpixel_x)
+            .c_str(),
+        expected_digest, dest_buffer_, sizeof(dest_buffer_), elapsed_time);
+  } else {
+    // Speed test.
+    printf("Mode SuperRes [width %d, step %d, start %d]: %d us\n",
+           kUpscaledWidth, step, initial_subpixel_x,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+}
+
+using SuperResTest8bpp = SuperResTest<8, uint8_t, int8_t>;
+
+TEST_P(SuperResTest8bpp, FixedValues) {
+  TestComputeSuperRes(100, 1);
+  TestComputeSuperRes(255, 1);
+  TestComputeSuperRes(1, 1);
+}
+
+TEST_P(SuperResTest8bpp, RandomValues) { TestComputeSuperRes(0, 1); }
+
+TEST_P(SuperResTest8bpp, DISABLED_Speed) {
+  TestComputeSuperRes(0, kNumSpeedTests);
+}
+
+const SuperResTestParam kSuperResTestParams[] = {
+    SuperResTestParam(96, 192),
+    SuperResTestParam(171, 192),
+    SuperResTestParam(102, 128),
+    SuperResTestParam(61, 121),
+};
+
+INSTANTIATE_TEST_SUITE_P(C, SuperResTest8bpp,
+                         testing::ValuesIn(kSuperResTestParams));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest8bpp,
+                         testing::ValuesIn(kSuperResTestParams));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest8bpp,
+                         testing::ValuesIn(kSuperResTestParams));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using SuperResTest10bpp = SuperResTest<10, uint16_t, int16_t>;
+
+TEST_P(SuperResTest10bpp, FixedValues) {
+  TestComputeSuperRes(100, 1);
+  TestComputeSuperRes(511, 1);
+  TestComputeSuperRes(1, 1);
+}
+
+TEST_P(SuperResTest10bpp, RandomValues) { TestComputeSuperRes(0, 1); }
+
+TEST_P(SuperResTest10bpp, DISABLED_Speed) {
+  TestComputeSuperRes(0, kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, SuperResTest10bpp,
+                         testing::ValuesIn(kSuperResTestParams));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest10bpp,
+                         testing::ValuesIn(kSuperResTestParams));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest10bpp,
+                         testing::ValuesIn(kSuperResTestParams));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using SuperResTest12bpp = SuperResTest<12, uint16_t, int16_t>;
+
+TEST_P(SuperResTest12bpp, FixedValues) {
+  TestComputeSuperRes(100, 1);
+  TestComputeSuperRes(2047, 1);
+  TestComputeSuperRes(1, 1);
+}
+
+TEST_P(SuperResTest12bpp, RandomValues) { TestComputeSuperRes(0, 1); }
+
+TEST_P(SuperResTest12bpp, DISABLED_Speed) {
+  TestComputeSuperRes(0, kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, SuperResTest12bpp,
+                         testing::ValuesIn(kSuperResTestParams));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/warp.cc b/src/dsp/warp.cc
new file mode 100644 (file)
index 0000000..f62f1ed
--- /dev/null
@@ -0,0 +1,496 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+// Warp prediction output ranges from WarpTest.ShowRange.
+// Bitdepth:  8 Input range:            [       0,      255]
+//   8bpp intermediate offset: 16384.
+//   intermediate range:                [    4399,    61009]
+//   first pass output range:           [     550,     7626]
+//   8bpp intermediate offset removal: 262144.
+//   intermediate range:                [ -620566,  1072406]
+//   second pass output range:          [       0,      255]
+//   compound second pass output range: [   -4848,     8378]
+//
+// Bitdepth: 10 Input range:            [       0,     1023]
+//   intermediate range:                [  -48081,   179025]
+//   first pass output range:           [   -6010,    22378]
+//   intermediate range:                [-2103516,  4198620]
+//   second pass output range:          [       0,     1023]
+//   compound second pass output range: [    8142,    57378]
+//
+// Bitdepth: 12 Input range:            [       0,     4095]
+//   intermediate range:                [ -192465,   716625]
+//   first pass output range:           [   -6015,    22395]
+//   intermediate range:                [-2105190,  4201830]
+//   second pass output range:          [       0,     4095]
+//   compound second pass output range: [    8129,    57403]
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
+            const int source_width, const int source_height,
+            const int* LIBGAV1_RESTRICT const warp_params,
+            const int subsampling_x, const int subsampling_y,
+            const int block_start_x, const int block_start_y,
+            const int block_width, const int block_height, const int16_t alpha,
+            const int16_t beta, const int16_t gamma, const int16_t delta,
+            void* LIBGAV1_RESTRICT dest, ptrdiff_t dest_stride) {
+  assert(block_width >= 8 && block_height >= 8);
+  if (is_compound) {
+    assert(dest_stride == block_width);
+  }
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical =
+      is_compound        ? kInterRoundBitsCompoundVertical
+      : (bitdepth == 12) ? kInterRoundBitsVertical12bpp
+                         : kInterRoundBitsVertical;
+
+  // Only used for 8bpp. Allows for keeping the first pass intermediates within
+  // uint16_t. With 10/12bpp the intermediate value will always require int32_t.
+  constexpr int first_pass_offset = (bitdepth == 8) ? 1 << 14 : 0;
+  constexpr int offset_removal =
+      (first_pass_offset >> kRoundBitsHorizontal) * 128;
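+  // The vertical filter taps sum to 128, so after the horizontal rounding the
+  // second pass accumulates 128 * (first_pass_offset >> kRoundBitsHorizontal)
+  // of pure offset; |offset_removal| is that total, precomputed so it can be
+  // cancelled.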
+
+  constexpr int kMaxPixel = (1 << bitdepth) - 1;
+  union {
+    // |intermediate_result| is the output of the horizontal filtering and
+    // rounding. The range is within int16_t.
+    int16_t intermediate_result[15][8];  // 15 rows, 8 columns.
+    // In the simple special cases where the samples in each row are all the
+    // same, store one sample per row in a column vector.
+    int16_t intermediate_result_column[15];
+  };
+  const auto* const src = static_cast<const Pixel*>(source);
+  source_stride /= sizeof(Pixel);
+  using DestType =
+      typename std::conditional<is_compound, uint16_t, Pixel>::type;
+  auto* dst = static_cast<DestType*>(dest);
+  if (!is_compound) dest_stride /= sizeof(dst[0]);
+
+  // The warp process is applied to each 8x8 block (or smaller).
+  for (int start_y = block_start_y; start_y < block_start_y + block_height;
+       start_y += 8) {
+    for (int start_x = block_start_x; start_x < block_start_x + block_width;
+         start_x += 8) {
+      const int src_x = (start_x + 4) << subsampling_x;
+      const int src_y = (start_y + 4) << subsampling_y;
+      const WarpFilterParams filter_params = GetWarpFilterParams(
+          src_x, src_y, subsampling_x, subsampling_y, warp_params);
+
+      // A prediction block may fall outside the frame's boundaries. If a
+      // prediction block is calculated using only samples outside the frame's
+      // boundary, the filtering can be simplified. We can divide the plane
+      // into several regions and handle them differently.
+      //
+      //                |           |
+      //            1   |     3     |   1
+      //                |           |
+      //         -------+-----------+-------
+      //                |***********|
+      //            2   |*****4*****|   2
+      //                |***********|
+      //         -------+-----------+-------
+      //                |           |
+      //            1   |     3     |   1
+      //                |           |
+      //
+      // At the center, region 4 represents the frame and is the general case.
+      //
+      // In regions 1 and 2, the prediction block is outside the frame's
+      // boundary horizontally. Therefore the horizontal filtering can be
+      // simplified. Furthermore, in the region 1 (at the four corners), the
+      // prediction is outside the frame's boundary both horizontally and
+      // vertically, so we get a constant prediction block.
+      //
+      // In region 3, the prediction block is outside the frame's boundary
+      // vertically. Unfortunately because we apply the horizontal filters
+      // first, by the time we apply the vertical filters, they no longer see
+      // simple inputs. So the only simplification is that all the rows are
+      // the same, but we still need to apply all the horizontal and vertical
+      // filters.
+
+      // Check for two simple special cases, where the horizontal filter can
+      // be significantly simplified.
+      //
+      // In general, for each row, the horizontal filter is calculated as
+      // follows:
+      //   for (int x = -4; x < 4; ++x) {
+      //     const int offset = ...;
+      //     int sum = first_pass_offset;
+      //     for (int k = 0; k < 8; ++k) {
+      //       const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+      //       sum += kWarpedFilters[offset][k] * src_row[column];
+      //     }
+      //     ...
+      //   }
+      // The column index before clipping, ix4 + x + k - 3, varies in the range
+      // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+      // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+      // border index (source_width - 1 or 0, respectively). Then for each x,
+      // the inner for loop of the horizontal filter is reduced to multiplying
+      // the border pixel by the sum of the filter coefficients.
+      if (filter_params.ix4 - 7 >= source_width - 1 ||
+          filter_params.ix4 + 7 <= 0) {
+        // Regions 1 and 2.
+        // Points to the left or right border of the first row of |src|.
+        const Pixel* first_row_border =
+            (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
+        // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+        //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+        // In two special cases, iy4 + y is clipped to either 0 or
+        // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+        // bounded and we can avoid clipping iy4 + y by relying on a reference
+        // frame's boundary extension on the top and bottom.
+        if (filter_params.iy4 - 7 >= source_height - 1 ||
+            filter_params.iy4 + 7 <= 0) {
+          // Region 1.
+          // Every sample used to calculate the prediction block has the same
+          // value. So the whole prediction block has the same value.
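+          // The shift below is the two filter passes folded together: each
+          // pass multiplies by a coefficient sum of 1 << kFilterBits (hence
+          // the 14 = 2 * kFilterBits) and rounds by kRoundBitsHorizontal and
+          // kRoundBitsVertical respectively, leaving a net left shift of
+          // (14 - kRoundBitsHorizontal) - kRoundBitsVertical.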
+          const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+          const Pixel row_border_pixel = first_row_border[row * source_stride];
+          DestType* dst_row = dst + start_x - block_start_x;
+          if (is_compound) {
+            int sum = row_border_pixel
+                      << ((14 - kRoundBitsHorizontal) - kRoundBitsVertical);
+            sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+            Memset(dst_row, sum, 8);
+          } else {
+            Memset(dst_row, row_border_pixel, 8);
+          }
+          const DestType* const first_dst_row = dst_row;
+          dst_row += dest_stride;
+          for (int y = 1; y < 8; ++y) {
+            memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
+            dst_row += dest_stride;
+          }
+          // End of region 1. Continue the |start_x| for loop.
+          continue;
+        }
+
+        // Region 2.
+        // Horizontal filter.
+        // The input values in this region are generated by extending the
+        // border, which makes them identical in the horizontal direction.
+        // This computation could be inlined in the vertical pass, but most
+        // implementations will need a transpose of some sort.
+        // It is not necessary to use the offset values here because the
+        // horizontal pass is a simple shift and the vertical pass will always
+        // require using 32 bits.
+        for (int y = -7; y < 8; ++y) {
+          // We may over-read up to 13 pixels above the top source row, or up
+          // to 13 pixels below the bottom source row. This follows from the
+          // bound on iy4 proved in the comments for regions 3 and 4 below.
+          const int row = filter_params.iy4 + y;
+          int sum = first_row_border[row * source_stride];
+          sum <<= kFilterBits - kRoundBitsHorizontal;
+          intermediate_result_column[y + 7] = sum;
+        }
+        // Vertical filter.
+        DestType* dst_row = dst + start_x - block_start_x;
+        int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  MultiplyBy4(delta);
+        for (int y = 0; y < 8; ++y) {
+          int sy = sy4 - MultiplyBy4(gamma);
+          for (int x = 0; x < 8; ++x) {
+            const int offset =
+                RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                kWarpedPixelPrecisionShifts;
+            assert(offset >= 0);
+            assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+            int sum = 0;
+            for (int k = 0; k < 8; ++k) {
+              sum +=
+                  kWarpedFilters[offset][k] * intermediate_result_column[y + k];
+            }
+            sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+            if (is_compound) {
+              sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+              dst_row[x] = static_cast<DestType>(sum);
+            } else {
+              dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
+            }
+            sy += gamma;
+          }
+          dst_row += dest_stride;
+          sy4 += delta;
+        }
+        // End of region 2. Continue the |start_x| for loop.
+        continue;
+      }
+
+      // Regions 3 and 4.
+      // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+      // It follows that -6 <= ix4 <= source_width + 5. This inequality is
+      // used below.
+
+      // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+      //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+      // In two special cases, iy4 + y is clipped to either 0 or
+      // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+      // bounded and we can avoid clipping iy4 + y by relying on a reference
+      // frame's boundary extension on the top and bottom.
+      if (filter_params.iy4 - 7 >= source_height - 1 ||
+          filter_params.iy4 + 7 <= 0) {
+        // Region 3.
+        // Horizontal filter.
+        const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+        const Pixel* const src_row = src + row * source_stride;
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
+        for (int y = -7; y < 8; ++y) {
+          int sx = sx4 - MultiplyBy4(alpha);
+          for (int x = -4; x < 4; ++x) {
+            const int offset =
+                RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+                kWarpedPixelPrecisionShifts;
+            // Since alpha and beta have been validated by SetupShear(), one
+            // can prove that 0 <= offset <= 3 * 2^6.
+            assert(offset >= 0);
+            assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+            // For SIMD optimization:
+            // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
+            // For 10/12 bit, the range of sum requires 32 bits.
+            int sum = first_pass_offset;
+            for (int k = 0; k < 8; ++k) {
+              // We assume the source frame has left and right borders of at
+              // least 13 pixels that extend the frame boundary pixels.
+              //
+              // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
+              // ix4 above, we have
+              //   -13 <= ix4 + x + k - 3 <= source_width + 12,
+              // or
+              //   -13 <= column <= (source_width - 1) + 13.
+              // Therefore we may over-read up to 13 pixels before the source
+              // row, or up to 13 pixels after the source row.
+              const int column = filter_params.ix4 + x + k - 3;
+              sum += kWarpedFilters[offset][k] * src_row[column];
+            }
+            intermediate_result[y + 7][x + 4] =
+                RightShiftWithRounding(sum, kRoundBitsHorizontal);
+            sx += alpha;
+          }
+          sx4 += beta;
+        }
+      } else {
+        // Region 4.
+        // Horizontal filter.
+        // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0.
+        // It follows that -6 <= iy4 <= source_height + 5. This inequality is
+        // used below.
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
+        for (int y = -7; y < 8; ++y) {
+          // We assume the source frame has top and bottom borders of at least
+          // 13 pixels that extend the frame boundary pixels.
+          //
+          // Since -7 <= y <= 7, using the inequality on iy4 above, we have
+          //   -13 <= iy4 + y <= source_height + 12,
+          // or
+          //   -13 <= row <= (source_height - 1) + 13.
+          // Therefore we may over-read up to 13 pixels above the top source
+          // row, or up to 13 pixels below the bottom source row.
+          const int row = filter_params.iy4 + y;
+          const Pixel* const src_row = src + row * source_stride;
+          int sx = sx4 - MultiplyBy4(alpha);
+          for (int x = -4; x < 4; ++x) {
+            const int offset =
+                RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+                kWarpedPixelPrecisionShifts;
+            // Since alpha and beta have been validated by SetupShear(), one
+            // can prove that 0 <= offset <= 3 * 2^6.
+            assert(offset >= 0);
+            assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+            // For SIMD optimization:
+            // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
+            // For 10/12 bit, the range of sum requires 32 bits.
+            int sum = first_pass_offset;
+            for (int k = 0; k < 8; ++k) {
+              // We assume the source frame has left and right borders of at
+              // least 13 pixels that extend the frame boundary pixels.
+              //
+              // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
+              // ix4 above, we have
+              //   -13 <= ix4 + x + k - 3 <= source_width + 12,
+              // or
+              //   -13 <= column <= (source_width - 1) + 13.
+              // Therefore we may over-read up to 13 pixels before the source
+              // row, or up to 13 pixels after the source row.
+              const int column = filter_params.ix4 + x + k - 3;
+              sum += kWarpedFilters[offset][k] * src_row[column];
+            }
+            intermediate_result[y + 7][x + 4] =
+                RightShiftWithRounding(sum, kRoundBitsHorizontal) -
+                offset_removal;
+            sx += alpha;
+          }
+          sx4 += beta;
+        }
+      }
+
+      // Regions 3 and 4.
+      // Vertical filter.
+      DestType* dst_row = dst + start_x - block_start_x;
+      int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                MultiplyBy4(delta);
+      // The spec says we should use the following loop condition:
+      //   y < std::min(4, block_start_y + block_height - start_y - 4);
+      // We can prove that block_start_y + block_height - start_y >= 8, which
+      // implies std::min(4, block_start_y + block_height - start_y - 4) = 4.
+      // So the loop condition is simply y < 4.
+      //
+      //   Proof:
+      //      start_y < block_start_y + block_height
+      //   => block_start_y + block_height - start_y > 0
+      //   => block_height - (start_y - block_start_y) > 0
+      //
+      //   Since block_height >= 8 and is a power of 2, it follows that
+      //   block_height is a multiple of 8. start_y - block_start_y is also a
+      //   multiple of 8. Therefore their difference is a multiple of 8. Since
+      //   their difference is > 0, their difference must be >= 8.
+      //
+      // We then add an offset of 4 to y so that the loop starts with y = 0
+      // and continues if y < 8.
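+      // For example, with block_height = 16 and start_y = block_start_y + 8,
+      // block_start_y + block_height - start_y = 8, so
+      // std::min(4, 8 - 4) = 4 as claimed.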
+      for (int y = 0; y < 8; ++y) {
+        int sy = sy4 - MultiplyBy4(gamma);
+        // The spec says we should use the following loop condition:
+        //   x < std::min(4, block_start_x + block_width - start_x - 4);
+        // Similar to the above, we can prove that the loop condition can be
+        // simplified to x < 4.
+        //
+        // We then add an offset of 4 to x so that the loop starts with x = 0
+        // and continues if x < 8.
+        for (int x = 0; x < 8; ++x) {
+          const int offset =
+              RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+              kWarpedPixelPrecisionShifts;
+          // Since gamma and delta have been validated by SetupShear(), one can
+          // prove that 0 <= offset <= 3 * 2^6.
+          assert(offset >= 0);
+          assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+          int sum = 0;
+          for (int k = 0; k < 8; ++k) {
+            sum += kWarpedFilters[offset][k] * intermediate_result[y + k][x];
+          }
+          sum -= offset_removal;
+          sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+          if (is_compound) {
+            sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+            dst_row[x] = static_cast<DestType>(sum);
+          } else {
+            dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
+          }
+          sy += gamma;
+        }
+        dst_row += dest_stride;
+        sy4 += delta;
+      }
+    }
+    dst += 8 * dest_stride;
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_Warp
+  dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_Warp
+  dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WarpCompound
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>;
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_Warp
+  dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WarpCompound
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void WarpInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/warp.h b/src/dsp/warp.h
new file mode 100644 (file)
index 0000000..9c20f12
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_WARP_H_
+#define LIBGAV1_SRC_DSP_WARP_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/warp_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/warp_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Section 7.11.3.5.
+struct WarpFilterParams {
+  int64_t x4;
+  int64_t y4;
+  int ix4;
+  int iy4;
+};
+
+// Initializes Dsp::warp. This function is not thread-safe.
+void WarpInit_C();
+
+// Section 7.11.3.5.
+inline WarpFilterParams GetWarpFilterParams(int src_x, int src_y,
+                                            int subsampling_x,
+                                            int subsampling_y,
+                                            const int* warp_params) {
+  WarpFilterParams filter_params;
+  // warp_params[2]/[5] require 17 bits (the others 14). With large
+  // resolutions the result of the multiplication will require 33 bits.
+  const int64_t dst_x = static_cast<int64_t>(src_x) * warp_params[2] +
+                        src_y * warp_params[3] + warp_params[0];
+  const int64_t dst_y = src_x * warp_params[4] +
+                        static_cast<int64_t>(src_y) * warp_params[5] +
+                        warp_params[1];
+  filter_params.x4 = dst_x >> subsampling_x;
+  filter_params.y4 = dst_y >> subsampling_y;
+  filter_params.ix4 =
+      static_cast<int>(filter_params.x4 >> kWarpedModelPrecisionBits);
+  filter_params.iy4 =
+      static_cast<int>(filter_params.y4 >> kWarpedModelPrecisionBits);
+  return filter_params;
+}
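+// For example, under the identity model (warp_params[2] and warp_params[5]
+// equal to 1 << kWarpedModelPrecisionBits, all others zero), dst_x is
+// src_x << kWarpedModelPrecisionBits, so ix4 reduces to
+// src_x >> subsampling_x: the unwarped column of the block center.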
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_WARP_H_
diff --git a/src/dsp/warp_test.cc b/src/dsp/warp_test.cc
new file mode 100644 (file)
index 0000000..f93ad8b
--- /dev/null
@@ -0,0 +1,711 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/base/macros.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/post_filter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kSourceBorderHorizontal = 16;
+constexpr int kSourceBorderVertical = 13;
+
+constexpr int kMaxSourceBlockWidth =
+    kMaxSuperBlockSizeInPixels + kSourceBorderHorizontal * 2;
+constexpr int kMaxSourceBlockHeight =
+    kMaxSuperBlockSizeInPixels + kSourceBorderVertical * 2;
+constexpr int kMaxDestBlockWidth =
+    kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2;
+constexpr int kMaxDestBlockHeight =
+    kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2;
+
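+// Each entry approximates (256 << kDivisorLookupPrecisionBits) / (256 + i),
+// running from 16384 at i = 0 down to 8192 at i = 256; these are the
+// fixed-point reciprocals consumed by GenerateApproximateDivisor() below.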
+constexpr uint16_t kDivisorLookup[257] = {
+    16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+    15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+    15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+    14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+    13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+    13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+    13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+    12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+    12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+    11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+    11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+    11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+    10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+    10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+    10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+    9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
+    9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
+    9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
+    9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
+    9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
+    8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
+    8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
+    8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
+    8240,  8224,  8208,  8192};
+
+template <bool is_compound>
+const char* GetDigest8bpp(int id) {
+  static const char* const kDigest[] = {
+      "77ba358a0f5e19a8e69fa0a95712578e", "141b23d13a04e0b84d26d514de76d6b0",
+      "b0265858454b979852ffadae323f0fb7", "9cf38e3579265b656f1f2100ba15b0e9",
+      "ab51d05cc255ef8e37921182df1d89b1", "e3e96f90a4b07ca733e40f057dc01c41",
+      "4eee8c1a52a62a266db9b1c9338e124c", "901a87d8f88f6324dbc0960a6de861ac",
+      "da9cb6faf6adaeeae12b6784f39186c5", "14450ab05536cdb0d2f499716ccb559d",
+      "566b396cbf008bbb869b364fdc81860d", "681a872baf2de4e58d73ea9ab8643a72",
+      "7f17d290d513a7416761b3a01f10fd2f",
+  };
+  static const char* const kCompoundDigest[] = {
+      "7e9339d265b7beac7bbe32fe7bb0fccb", "f747d663b427bb38a3ff36b0815a394c",
+      "858cf54d2253281a919fbdb48fe91c53", "4721dd97a212c6068bd488f400259afc",
+      "36878c7906492bc740112abdea77616f", "89deb68aa35764bbf3024b501a6bed50",
+      "8ac5b08f9b2afd38143c357646af0f82", "bf6e2a64835ea0c9d7467394253d0eb2",
+      "7b0a539acd2a27eff398dd084abad933", "61c8d81b397c1cf727ff8a9fabab90af",
+      "4d412349a25a832c1fb3fb29e3f0e2b3", "2c6dd2a9a4ede9fa00adb567ba646f30",
+      "b2a0ce68db3cadd207299f73112bed74",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return is_compound ? kCompoundDigest[id] : kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <bool is_compound>
+const char* GetDigest10bpp(int id) {
+  static const char* const kDigest[] = {
+      "1fef54f56a0bafccf7f8da1ac3b18b76", "8a65c72f171feafa2f393d31d6b7fe1b",
+      "808019346f2f1f45f8cf2e9fc9a49320", "c28e2f2c6c830a29bcc2452166cba521",
+      "f040674d6f54e8910d655f0d11fd8cdd", "473af9bb1c6023965c2284b716feef97",
+      "e4f6d7babd0813d5afb0f575ebfa8166", "58f96ef8a880963a213624bb0d06d47c",
+      "1ec0995fa4490628b679d03683233388", "9526fb102fde7dc1a7e160e65af6da33",
+      "f0457427d0c0e31d82ea4f612f7f86f1", "ddc82ae298cccebad493ba9de0f69fbd",
+      "5ed615091e2f62df26de7e91a985cb81",
+  };
+  static const char* const kCompoundDigest[] = {
+      "8e6986ae143260e0b8b4887f15a141a1", "0a7f0db8316b8c3569f08834dd0c6f50",
+      "90705b2e7dbe083e8a1f70f29d6f257e", "e428a75bea77d769d21f3f7a1d2b0b38",
+      "a570b13d790c085c4ab50d71dd085d56", "e5d043c6cd6ff6dbab6e38a8877e93bd",
+      "12ea96991e46e3e9aa78ab812ffa0525", "84293a94a53f1cf814fa25e793c3fe27",
+      "b98a7502c84ac8437266f702dcc0a92e", "d8db5d52e9b0a5be0ad2d517d5bd16e9",
+      "f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047",
+      "42eb66e752e9ef289b47053b5c73fdd6",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return is_compound ? kCompoundDigest[id] : kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+template <bool is_compound>
+const char* GetDigest12bpp(int id) {
+  static const char* const kDigest[] = {
+      "cd5d5e2102b8917ad70778f523d24bdf", "374a5f1b53a3fdf2eefa741eb71e6889",
+      "311636841770ec2427084891df96bee5", "c40c537917b1f0d1d84c99dfcecd8219",
+      "a1d9bb920e6c3d20c0cf84adc18e1f15", "13b5659acdb39b717526cb358c6f4026",
+      "f81ea4f6fd1f4ebed1262e3fae37b5bb", "c1452fefcd9b9562fe3a0b7f9302809c",
+      "8fed8a3159dc7b6b59a39ab2be6bee13", "b46458bc0e5cf1cee92aac4f0f608749",
+      "2e6a1039ab111add89f5b44b13565f40", "9c666691860bdc89b03f601b40126196",
+      "418a47157d992b94c302ca2e2f6ee07e",
+  };
+  static const char* const kCompoundDigest[] = {
+      "8e6986ae143260e0b8b4887f15a141a1", "0a7f0db8316b8c3569f08834dd0c6f50",
+      "90705b2e7dbe083e8a1f70f29d6f257e", "e428a75bea77d769d21f3f7a1d2b0b38",
+      "a570b13d790c085c4ab50d71dd085d56", "e5d043c6cd6ff6dbab6e38a8877e93bd",
+      "12ea96991e46e3e9aa78ab812ffa0525", "84293a94a53f1cf814fa25e793c3fe27",
+      "b98a7502c84ac8437266f702dcc0a92e", "d8db5d52e9b0a5be0ad2d517d5bd16e9",
+      "f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047",
+      "42eb66e752e9ef289b47053b5c73fdd6",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return is_compound ? kCompoundDigest[id] : kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+int RandomWarpedParam(int seed_offset, int bits) {
+  libvpx_test::ACMRandom rnd(seed_offset +
+                             libvpx_test::ACMRandom::DeterministicSeed());
+  // 1 in 8 chance of generating zero (arbitrary).
+  const bool zero = (rnd.Rand16() & 7) == 0;
+  if (zero) return 0;
+  // Generate uniform values in the range [-(1 << bits), -1] U
+  // [1, 1 << bits].
+  const int mask = (1 << bits) - 1;
+  const int value = 1 + (rnd.RandRange(1u << 31) & mask);
+  const bool sign = (rnd.Rand16() & 1) != 0;
+  return sign ? value : -value;
+}
+
+// This function is a copy from warp_prediction.cc.
+template <typename T>
+void GenerateApproximateDivisor(T value, int16_t* division_factor,
+                                int16_t* division_shift) {
+  const int n = FloorLog2(std::abs(value));
+  const T e = std::abs(value) - (static_cast<T>(1) << n);
+  const int entry = (n > kDivisorLookupBits)
+                        ? RightShiftWithRounding(e, n - kDivisorLookupBits)
+                        : static_cast<int>(e << (kDivisorLookupBits - n));
+  *division_shift = n + kDivisorLookupPrecisionBits;
+  *division_factor =
+      (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry];
+}
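+// In other words, division_factor / (1 << division_shift) approximates
+// 1 / value; SetupShear() uses this below to divide by params[2] without a
+// hardware divide.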
+
+// This function is a copy from warp_prediction.cc.
+int16_t GetShearParameter(int value) {
+  return static_cast<int16_t>(
+      LeftShift(RightShiftWithRoundingSigned(value, kWarpParamRoundingBits),
+                kWarpParamRoundingBits));
+}
+
+// This function is a copy from warp_prediction.cc.
+// This function is used here to help generate valid warp parameters.
+bool SetupShear(const int* params, int16_t* alpha, int16_t* beta,
+                int16_t* gamma, int16_t* delta) {
+  int16_t division_shift;
+  int16_t division_factor;
+  GenerateApproximateDivisor<int32_t>(params[2], &division_factor,
+                                      &division_shift);
+  const int alpha0 =
+      Clip3(params[2] - (1 << kWarpedModelPrecisionBits), INT16_MIN, INT16_MAX);
+  const int beta0 = Clip3(params[3], INT16_MIN, INT16_MAX);
+  const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits);
+  const int gamma0 =
+      Clip3(RightShiftWithRoundingSigned(v * division_factor, division_shift),
+            INT16_MIN, INT16_MAX);
+  const int64_t w = static_cast<int64_t>(params[3]) * params[4];
+  const int delta0 = Clip3(
+      params[5] -
+          RightShiftWithRoundingSigned(w * division_factor, division_shift) -
+          (1 << kWarpedModelPrecisionBits),
+      INT16_MIN, INT16_MAX);
+
+  *alpha = GetShearParameter(alpha0);
+  *beta = GetShearParameter(beta0);
+  *gamma = GetShearParameter(gamma0);
+  *delta = GetShearParameter(delta0);
+  if ((4 * std::abs(*alpha) + 7 * std::abs(*beta) >=
+       (1 << kWarpedModelPrecisionBits)) ||
+      (4 * std::abs(*gamma) + 4 * std::abs(*delta) >=
+       (1 << kWarpedModelPrecisionBits))) {
+    return false;  // NOLINT (easier condition to understand).
+  }
+
+  return true;
+}
+
+void GenerateWarpedModel(int* params, int16_t* alpha, int16_t* beta,
+                         int16_t* gamma, int16_t* delta, int seed) {
+  do {
+    params[0] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+    params[1] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+    params[2] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+                (1 << kWarpedModelPrecisionBits);
+    params[3] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+    params[4] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+    params[5] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+                (1 << kWarpedModelPrecisionBits);
+    ++seed;
+  } while (params[2] == 0 || !SetupShear(params, alpha, beta, gamma, delta));
+}
+
+struct WarpTestParam {
+  WarpTestParam(int width, int height) : width(width), height(height) {}
+  int width;
+  int height;
+};
+
+template <bool is_compound, int bitdepth, typename Pixel>
+class WarpTest : public testing::TestWithParam<WarpTestParam> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  WarpTest() = default;
+  ~WarpTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    WarpInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      WarpInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      WarpInit_SSE4_1();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    func_ = is_compound ? dsp->warp_compound : dsp->warp;
+  }
+
+ protected:
+  using DestType =
+      typename std::conditional<is_compound, uint16_t, Pixel>::type;
+
+  void SetInputData(bool use_fixed_values, int value);
+  void Test(bool use_fixed_values, int value, int num_runs = 1);
+  void TestFixedValues();
+  void TestRandomValues();
+  void TestSpeed();
+
+  const WarpTestParam param_ = GetParam();
+
+ private:
+  int warp_params_[8];
+  dsp::WarpFunc func_;
+  // Warp filters are 7-tap, which needs 3 pixels (kConvolveBorderLeftTop)
+  // padding. Destination buffer indices are based on subsampling values (x+y):
+  // 0: (4:4:4), 1: (4:2:2), 2: (4:2:0).
+  Pixel source_[kMaxSourceBlockHeight * kMaxSourceBlockWidth] = {};
+  DestType dest_[3][kMaxDestBlockHeight * kMaxDestBlockWidth] = {};
+};
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::SetInputData(bool use_fixed_values,
+                                                          int value) {
+  if (use_fixed_values) {
+    for (int y = 0; y < param_.height; ++y) {
+      const int row = kSourceBorderVertical + y;
+      Memset(source_ + row * kMaxSourceBlockWidth + kSourceBorderHorizontal,
+             value, param_.width);
+    }
+  } else {
+    const int mask = (1 << bitdepth) - 1;
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    for (int y = 0; y < param_.height; ++y) {
+      const int row = kSourceBorderVertical + y;
+      for (int x = 0; x < param_.width; ++x) {
+        const int column = kSourceBorderHorizontal + x;
+        source_[row * kMaxSourceBlockWidth + column] = rnd.Rand16() & mask;
+      }
+    }
+  }
+  PostFilter::ExtendFrame<Pixel>(
+      &source_[kSourceBorderVertical * kMaxSourceBlockWidth +
+               kSourceBorderHorizontal],
+      param_.width, param_.height, kMaxSourceBlockWidth,
+      kSourceBorderHorizontal, kSourceBorderHorizontal, kSourceBorderVertical,
+      kSourceBorderVertical);
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::Test(bool use_fixed_values,
+                                                  int value,
+                                                  int num_runs /*= 1*/) {
+  if (func_ == nullptr) return;
+  SetInputData(use_fixed_values, value);
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  const int source_offset =
+      kSourceBorderVertical * kMaxSourceBlockWidth + kSourceBorderHorizontal;
+  const int dest_offset =
+      kConvolveBorderLeftTop * kMaxDestBlockWidth + kConvolveBorderLeftTop;
+  const Pixel* const src = source_ + source_offset;
+  const ptrdiff_t src_stride = kMaxSourceBlockWidth * sizeof(Pixel);
+  const ptrdiff_t dst_stride =
+      is_compound ? kMaxDestBlockWidth : kMaxDestBlockWidth * sizeof(Pixel);
+
+  absl::Duration elapsed_time;
+  for (int subsampling_x = 0; subsampling_x <= 1; ++subsampling_x) {
+    for (int subsampling_y = 0; subsampling_y <= 1; ++subsampling_y) {
+      if (subsampling_x == 0 && subsampling_y == 1) {
+        // When both are 0: 4:4:4
+        // When both are 1: 4:2:0
+        // When only |subsampling_x| is 1: 4:2:2
+        // Having only |subsampling_y| == 1 is unsupported.
+        continue;
+      }
+      int params[8];
+      int16_t alpha;
+      int16_t beta;
+      int16_t gamma;
+      int16_t delta;
+      GenerateWarpedModel(params, &alpha, &beta, &gamma, &delta, rnd.Rand8());
+
+      const int dest_id = subsampling_x + subsampling_y;
+      DestType* const dst = dest_[dest_id] + dest_offset;
+      const absl::Time start = absl::Now();
+      for (int n = 0; n < num_runs; ++n) {
+        func_(src, src_stride, param_.width, param_.height, params,
+              subsampling_x, subsampling_y, 0, 0, param_.width, param_.height,
+              alpha, beta, gamma, delta, dst, dst_stride);
+      }
+      elapsed_time += absl::Now() - start;
+    }
+  }
+
+  if (use_fixed_values) {
+    // For fixed values, input and output are identical.
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(dest_); ++i) {
+      // |is_compound| holds a few more bits of precision and an offset value.
+      Pixel compensated_dest[kMaxDestBlockWidth * kMaxDestBlockHeight];
+      const int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset;
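+      // For a fixed input pixel p, the compound output is approximately
+      // (p << (kInterRoundBitsVertical - kInterRoundBitsCompoundVertical)) +
+      // compound_offset, so the loop below undoes the offset and the shift
+      // to recover p.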
+      if (is_compound) {
+        for (int y = 0; y < param_.height; ++y) {
+          for (int x = 0; x < param_.width; ++x) {
+            const int compound_value =
+                dest_[i][dest_offset + y * kMaxDestBlockWidth + x];
+            const int remove_offset = compound_value - compound_offset;
+            const int full_shift =
+                remove_offset >>
+                (kInterRoundBitsVertical - kInterRoundBitsCompoundVertical);
+            compensated_dest[y * kMaxDestBlockWidth + x] =
+                Clip3(full_shift, 0, (1 << bitdepth) - 1);
+          }
+        }
+      }
+      Pixel* pixel_dest =
+          is_compound ? compensated_dest
+                      : reinterpret_cast<Pixel*>(dest_[i] + dest_offset);
+      const bool success = test_utils::CompareBlocks(
+          src, pixel_dest, param_.width, param_.height, kMaxSourceBlockWidth,
+          kMaxDestBlockWidth, false);
+      EXPECT_TRUE(success) << "subsampling_x + subsampling_y: " << i;
+    }
+  } else {
+    // (width, height):
+    // (8, 8), id = 0. (8, 16), id = 1. (16, 8), id = 2.
+    // (16, 16), id = 3. (16, 32), id = 4. (32, 16), id = 5.
+    // ...
+    // (128, 128), id = 12.
+    int id;
+    if (param_.width == param_.height) {
+      id = 3 * static_cast<int>(FloorLog2(param_.width) - 3);
+    } else if (param_.width < param_.height) {
+      id = 1 + 3 * static_cast<int>(FloorLog2(param_.width) - 3);
+    } else {
+      id = 2 + 3 * static_cast<int>(FloorLog2(param_.height) - 3);
+    }
+
+    const char* expected_digest = nullptr;
+    switch (bitdepth) {
+      case 8:
+        expected_digest = GetDigest8bpp<is_compound>(id);
+        break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      case 10:
+        expected_digest = GetDigest10bpp<is_compound>(id);
+        break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+      case 12:
+        expected_digest = GetDigest12bpp<is_compound>(id);
+        break;
+#endif
+    }
+    ASSERT_NE(expected_digest, nullptr);
+    test_utils::CheckMd5Digest(
+        "Warp", absl::StrFormat("%dx%d", param_.width, param_.height).c_str(),
+        expected_digest, dest_, sizeof(dest_), elapsed_time);
+  }
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::TestFixedValues() {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, (1 << bitdepth) - 1);
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::TestRandomValues() {
+  Test(false, 0);
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::TestSpeed() {
+  const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+void ApplyFilterToSignedInput(const int min_input, const int max_input,
+                              const int8_t filter[kSubPixelTaps],
+                              int* min_output, int* max_output) {
+  int min = 0, max = 0;
+  for (int i = 0; i < kSubPixelTaps; ++i) {
+    const int tap = filter[i];
+    if (tap > 0) {
+      max += max_input * tap;
+      min += min_input * tap;
+    } else {
+      min += max_input * tap;
+      max += min_input * tap;
+    }
+  }
+  *min_output = min;
+  *max_output = max;
+}
+
+void ApplyFilterToUnsignedInput(const int max_input,
+                                const int8_t filter[kSubPixelTaps],
+                                int* min_output, int* max_output) {
+  ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output);
+}
+
+// Validate the maximum ranges for different parts of the Warp process.
+template <int bitdepth>
+void ShowRange() {
+  constexpr int horizontal_bits = (bitdepth == kBitdepth12)
+                                      ? kInterRoundBitsHorizontal12bpp
+                                      : kInterRoundBitsHorizontal;
+  constexpr int vertical_bits = (bitdepth == kBitdepth12)
+                                    ? kInterRoundBitsVertical12bpp
+                                    : kInterRoundBitsVertical;
+  constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical;
+
+  constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset;
+
+  constexpr int max_input = (1 << bitdepth) - 1;
+
+  const int8_t* worst_warp_filter = kWarpedFilters8[93];
+
+  // First pass.
+  printf("Bitdepth: %2d Input range:            [%8d, %8d]\n", bitdepth, 0,
+         max_input);
+
+  int min = 0, max = 0;
+  ApplyFilterToUnsignedInput(max_input, worst_warp_filter, &min, &max);
+
+  int first_pass_offset;
+  if (bitdepth == 8) {
+    // Derive an offset for 8 bit.
+    for (first_pass_offset = 1; -first_pass_offset > min;
+         first_pass_offset <<= 1) {
+    }
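+    // For example, if min were 255 * -47 = -11985 (the most negative
+    // coefficient sum from WarpedFilterCoefficientSums times the 8bpp
+    // maximum input), the loop would stop at first_pass_offset = 16384, the
+    // smallest power of two exceeding 11985.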
+    printf("  8bpp intermediate offset: %d.\n", first_pass_offset);
+    min += first_pass_offset;
+    max += first_pass_offset;
+    assert(min > 0);
+    assert(max < UINT16_MAX);
+  } else {
+    // 10bpp and 12bpp require int32_t for the intermediate values. Adding an
+    // offset is not required.
+    assert(min > INT32_MIN);
+    assert(max > INT16_MAX && max < INT32_MAX);
+  }
+
+  printf("  intermediate range:                [%8d, %8d]\n", min, max);
+
+  const int first_pass_min = RightShiftWithRounding(min, horizontal_bits);
+  const int first_pass_max = RightShiftWithRounding(max, horizontal_bits);
+
+  printf("  first pass output range:           [%8d, %8d]\n", first_pass_min,
+         first_pass_max);
+
+  // Second pass.
+  if (bitdepth == 8) {
+    ApplyFilterToUnsignedInput(first_pass_max, worst_warp_filter, &min, &max);
+  } else {
+    ApplyFilterToSignedInput(first_pass_min, first_pass_max, worst_warp_filter,
+                             &min, &max);
+  }
+
+  if (bitdepth == 8) {
+    // Remove the offset that was applied in the first pass since we must use
+    // int32_t for this phase anyway. 128 is the sum of the filter taps.
+    const int offset_removal = (first_pass_offset >> horizontal_bits) * 128;
+    printf("  8bpp intermediate offset removal: %d.\n", offset_removal);
+    max -= offset_removal;
+    min -= offset_removal;
+    assert(min < INT16_MIN && min > INT32_MIN);
+    assert(max > INT16_MAX && max < INT32_MAX);
+  } else {
+    // 10bpp and 12bpp require int32_t for the intermediate values. Adding an
+    // offset is not required.
+    assert(min > INT32_MIN);
+    assert(max > INT16_MAX && max < INT32_MAX);
+  }
+
+  printf("  intermediate range:                [%8d, %8d]\n", min, max);
+
+  // Second pass non-compound output is clipped to Pixel values.
+  const int second_pass_min =
+      Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input);
+  const int second_pass_max =
+      Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input);
+  printf("  second pass output range:          [%8d, %8d]\n", second_pass_min,
+         second_pass_max);
+
+  // Output is Pixel so matches Pixel values.
+  assert(second_pass_min == 0);
+  assert(second_pass_max == max_input);
+
+  const int compound_second_pass_min =
+      RightShiftWithRounding(min, compound_vertical_bits) + compound_offset;
+  const int compound_second_pass_max =
+      RightShiftWithRounding(max, compound_vertical_bits) + compound_offset;
+
+  printf("  compound second pass output range: [%8d, %8d]\n",
+         compound_second_pass_min, compound_second_pass_max);
+
+  if (bitdepth == 8) {
+    // 8bpp output is int16_t without an offset.
+    assert(compound_second_pass_min > INT16_MIN);
+    assert(compound_second_pass_max < INT16_MAX);
+  } else {
+    // 10bpp and 12bpp use the offset to fit inside uint16_t.
+    assert(compound_second_pass_min > 0);
+    assert(compound_second_pass_max < UINT16_MAX);
+  }
+
+  printf("\n");
+}
+
+TEST(WarpTest, ShowRange) {
+  ShowRange<kBitdepth8>();
+  ShowRange<kBitdepth10>();
+  ShowRange<kBitdepth12>();
+}
+
+using WarpTest8bpp = WarpTest</*is_compound=*/false, 8, uint8_t>;
+// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via
+// WarpCompoundTest.
+// using WarpCompoundTest8bpp = WarpTest</*is_compound=*/true, 8, uint8_t>;
+
+// Verifies the sum of the warped filter coefficients is 128 for every filter.
+//
+// Verifies the properties used in the calculation of ranges of variables in
+// the block warp process:
+// * The maximum sum of the positive warped filter coefficients is 175.
+// * The minimum (i.e., most negative) sum of the negative warped filter
+//   coefficients is -47.
+//
+// NOTE: This test is independent of the bitdepth and the implementation of the
+// block warp function, so it just needs to be a test in the WarpTest8bpp class
+// and does not need to be defined with TEST_P.
+TEST(WarpTest8bpp, WarpedFilterCoefficientSums) {
+  int max_positive_sum = 0;
+  int min_negative_sum = 0;
+  for (const auto& filter : kWarpedFilters) {
+    int sum = 0;
+    int positive_sum = 0;
+    int negative_sum = 0;
+    for (const auto coefficient : filter) {
+      sum += coefficient;
+      if (coefficient > 0) {
+        positive_sum += coefficient;
+      } else {
+        negative_sum += coefficient;
+      }
+    }
+    EXPECT_EQ(sum, 128);
+    max_positive_sum = std::max(positive_sum, max_positive_sum);
+    min_negative_sum = std::min(negative_sum, min_negative_sum);
+  }
+  EXPECT_EQ(max_positive_sum, 175);
+  EXPECT_EQ(min_negative_sum, -47);
+}
+
+TEST_P(WarpTest8bpp, FixedValues) { TestFixedValues(); }
+
+TEST_P(WarpTest8bpp, RandomValues) { TestRandomValues(); }
+
+TEST_P(WarpTest8bpp, DISABLED_Speed) { TestSpeed(); }
+
+const WarpTestParam warp_test_param[] = {
+    WarpTestParam(8, 8),     WarpTestParam(8, 16),   WarpTestParam(16, 8),
+    WarpTestParam(16, 16),   WarpTestParam(16, 32),  WarpTestParam(32, 16),
+    WarpTestParam(32, 32),   WarpTestParam(32, 64),  WarpTestParam(64, 32),
+    WarpTestParam(64, 64),   WarpTestParam(64, 128), WarpTestParam(128, 64),
+    WarpTestParam(128, 128),
+};
+
+INSTANTIATE_TEST_SUITE_P(C, WarpTest8bpp, testing::ValuesIn(warp_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WarpTest8bpp,
+                         testing::ValuesIn(warp_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WarpTest8bpp,
+                         testing::ValuesIn(warp_test_param));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using WarpTest10bpp = WarpTest</*is_compound=*/false, 10, uint16_t>;
+// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via
+// WarpCompoundTest.
+// using WarpCompoundTest10bpp = WarpTest</*is_compound=*/true, 10, uint16_t>;
+
+TEST_P(WarpTest10bpp, FixedValues) { TestFixedValues(); }
+
+TEST_P(WarpTest10bpp, RandomValues) { TestRandomValues(); }
+
+TEST_P(WarpTest10bpp, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WarpTest10bpp, testing::ValuesIn(warp_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WarpTest10bpp,
+                         testing::ValuesIn(warp_test_param));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using WarpTest12bpp = WarpTest</*is_compound=*/false, 12, uint16_t>;
+// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via
+// WarpCompoundTest.
+// using WarpCompoundTest12bpp = WarpTest</*is_compound=*/true, 12, uint16_t>;
+
+TEST_P(WarpTest12bpp, FixedValues) { TestFixedValues(); }
+
+TEST_P(WarpTest12bpp, RandomValues) { TestRandomValues(); }
+
+TEST_P(WarpTest12bpp, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WarpTest12bpp, testing::ValuesIn(warp_test_param));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+std::ostream& operator<<(std::ostream& os, const WarpTestParam& warp_param) {
+  return os << "BlockSize" << warp_param.width << "x" << warp_param.height;
+}
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/weight_mask.cc b/src/dsp/weight_mask.cc
new file mode 100644 (file)
index 0000000..ee3808b
--- /dev/null
@@ -0,0 +1,310 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/weight_mask.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int width, int height, int bitdepth, bool mask_is_inverse>
+void WeightMask_C(const void* LIBGAV1_RESTRICT prediction_0,
+                  const void* LIBGAV1_RESTRICT prediction_1,
+                  uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+  const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+  static_assert(width >= 8, "");
+  static_assert(height >= 8, "");
+  constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 2 : 4);
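+  // Worked example at 8bpp (rounding_bits == 4): a prediction difference of
+  // 512 gives RightShiftWithRounding(512, 4) = 32 and a mask value of
+  // std::min(DivideBy16(32) + 38, 64) = 40, or 64 - 40 = 24 when
+  // |mask_is_inverse| is true.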
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const int difference = RightShiftWithRounding(
+          std::abs(pred_0[x] - pred_1[x]), rounding_bits);
+      const auto mask_value =
+          static_cast<uint8_t>(std::min(DivideBy16(difference) + 38, 64));
+      mask[x] = mask_is_inverse ? 64 - mask_value : mask_value;
+    }
+    pred_0 += width;
+    pred_1 += width;
+    mask += mask_stride;
+  }
+}
+
+#define INIT_WEIGHT_MASK(width, height, bitdepth, w_index, h_index) \
+  dsp->weight_mask[w_index][h_index][0] =                           \
+      WeightMask_C<width, height, bitdepth, 0>;                     \
+  dsp->weight_mask[w_index][h_index][1] =                           \
+      WeightMask_C<width, height, bitdepth, 1>
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+  INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+  INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+  INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+  INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+  INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+  INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+  INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+  INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+  INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+  INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+  INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+  INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+  INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+  INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+  INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+  INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+  INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+  INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+  INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+  INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+  INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+  INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+  INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+  INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+  INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+  INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+  INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+  INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+  INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+  INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+  INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+  INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+  INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+  INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+  INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+  INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+  INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+  INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+  INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+  INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+  INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+  INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+  INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+  INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+  INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+  INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+  INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+  INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+  INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8
+  INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16
+  INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32
+  INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8
+  INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16
+  INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32
+  INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64
+  INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8
+  INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16
+  INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32
+  INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64
+  INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16
+  INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32
+  INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64
+  INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128
+  INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64
+  INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128
+  INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_WEIGHT_MASK(8, 8, 12, 0, 0);
+  INIT_WEIGHT_MASK(8, 16, 12, 0, 1);
+  INIT_WEIGHT_MASK(8, 32, 12, 0, 2);
+  INIT_WEIGHT_MASK(16, 8, 12, 1, 0);
+  INIT_WEIGHT_MASK(16, 16, 12, 1, 1);
+  INIT_WEIGHT_MASK(16, 32, 12, 1, 2);
+  INIT_WEIGHT_MASK(16, 64, 12, 1, 3);
+  INIT_WEIGHT_MASK(32, 8, 12, 2, 0);
+  INIT_WEIGHT_MASK(32, 16, 12, 2, 1);
+  INIT_WEIGHT_MASK(32, 32, 12, 2, 2);
+  INIT_WEIGHT_MASK(32, 64, 12, 2, 3);
+  INIT_WEIGHT_MASK(64, 16, 12, 3, 1);
+  INIT_WEIGHT_MASK(64, 32, 12, 3, 2);
+  INIT_WEIGHT_MASK(64, 64, 12, 3, 3);
+  INIT_WEIGHT_MASK(64, 128, 12, 3, 4);
+  INIT_WEIGHT_MASK(128, 64, 12, 4, 3);
+  INIT_WEIGHT_MASK(128, 128, 12, 4, 4);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x8
+  INIT_WEIGHT_MASK(8, 8, 12, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x16
+  INIT_WEIGHT_MASK(8, 16, 12, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x32
+  INIT_WEIGHT_MASK(8, 32, 12, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x8
+  INIT_WEIGHT_MASK(16, 8, 12, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x16
+  INIT_WEIGHT_MASK(16, 16, 12, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x32
+  INIT_WEIGHT_MASK(16, 32, 12, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x64
+  INIT_WEIGHT_MASK(16, 64, 12, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x8
+  INIT_WEIGHT_MASK(32, 8, 12, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x16
+  INIT_WEIGHT_MASK(32, 16, 12, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x32
+  INIT_WEIGHT_MASK(32, 32, 12, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x64
+  INIT_WEIGHT_MASK(32, 64, 12, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x16
+  INIT_WEIGHT_MASK(64, 16, 12, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x32
+  INIT_WEIGHT_MASK(64, 32, 12, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x64
+  INIT_WEIGHT_MASK(64, 64, 12, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x128
+  INIT_WEIGHT_MASK(64, 128, 12, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x64
+  INIT_WEIGHT_MASK(128, 64, 12, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x128
+  INIT_WEIGHT_MASK(128, 128, 12, 4, 4);
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+void WeightMaskInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/weight_mask.h b/src/dsp/weight_mask.h
new file mode 100644 (file)
index 0000000..43bef05
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
+#define LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/weight_mask_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/weight_mask_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
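+// For illustration (the 8bpp macro name below is inferred from the
+// 10bpp/12bpp guards in weight_mask.cc): if an optimized header defines
+//   #define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+// then the matching "#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8" guard in
+// weight_mask.cc is skipped and the C version is not installed for that
+// entry, unless LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS forces all C versions in.
+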
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
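+//
+// Rough usage sketch (an illustration only; |pred_0|, |pred_1| and |mask| are
+// hypothetical buffers, and the call shape follows weight_mask_test.cc):
+//   WeightMaskInit_C();
+//   const Dsp* const dsp = GetDspTable(8);
+//   // Table indices are FloorLog2(dimension) - 3, so a 16x8 block is [1][0].
+//   const WeightMaskFunc func = dsp->weight_mask[1][0][/*mask_is_inverse=*/0];
+//   func(pred_0, pred_1, mask, /*mask_stride=*/16);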
+void WeightMaskInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
diff --git a/src/dsp/weight_mask_test.cc b/src/dsp/weight_mask_test.cc
new file mode 100644 (file)
index 0000000..a080ec4
--- /dev/null
@@ -0,0 +1,468 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/weight_mask.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 50000;
+constexpr int kMaxPredictionSize = 128;
+// weight_mask is only used with kCompoundPredictionTypeDiffWeighted, with
+// convolve producing the most extreme ranges.
+// This includes kCompoundOffset in 10bpp and 12bpp.
+// See src/dsp/convolve.cc and src/dsp/warp.cc.
+constexpr int kCompoundPredictionRange[3][2] = {
+    // 8bpp
+    {-5132, 9212},
+    // 10bpp
+    {3988, 61532},
+    // 12bpp
+    {3974, 61559},
+};
+
+const char* GetDigest8bpp(int id) {
+  static const char* const kDigest[] = {
+      "eaca5b6a96dcfe5e44f3926a071b48b3",
+      "1d82c75cfdf8e57925eb1d5301647538",
+      "25bd455d74fb891b97b133c528f8db60",
+      "" /*kBlock4x16*/,
+      "1d82c75cfdf8e57925eb1d5301647538",
+      "25bd455d74fb891b97b133c528f8db60",
+      "62a08776db35a186406a11ab92dee71c",
+      "95131d1dc0e05fcf4bd234d5ce9eea11",
+      "25bd455d74fb891b97b133c528f8db60",
+      "62a08776db35a186406a11ab92dee71c",
+      "95131d1dc0e05fcf4bd234d5ce9eea11",
+      "0b3c75272e0fb0747b9850145d340c4c",
+      "95131d1dc0e05fcf4bd234d5ce9eea11",
+      "0b3c75272e0fb0747b9850145d340c4c",
+      "f26c43d4bc823a89c1ed47ab8708bc06",
+      "0d99bbf31ecddc1c2d5063a68c0e9375",
+      "0d99bbf31ecddc1c2d5063a68c0e9375",
+      "5fb8ec5f582f0ebfe519ed55860f67c4",
+
+      // mask_is_inverse = true.
+      "96811f3b192828ff679e4c9ad8069d7d",
+      "a04dc180c028d55af70240163445523a",
+      "8513e3988233d0a7de316a0179bb6139",
+      "" /*kBlock4x16*/,
+      "a04dc180c028d55af70240163445523a",
+      "8513e3988233d0a7de316a0179bb6139",
+      "f7356d42fb44a6ccb41253ba35b8b3c7",
+      "3d2d61ffc203ee64fe91c9d16168a19d",
+      "8513e3988233d0a7de316a0179bb6139",
+      "f7356d42fb44a6ccb41253ba35b8b3c7",
+      "3d2d61ffc203ee64fe91c9d16168a19d",
+      "87a2011ac69fb597ca4f71bb3c35ebb0",
+      "3d2d61ffc203ee64fe91c9d16168a19d",
+      "87a2011ac69fb597ca4f71bb3c35ebb0",
+      "97100a3639d567046dc8a99fcb84cb2e",
+      "9fabe05a6523da81a45150e19f75acff",
+      "9fabe05a6523da81a45150e19f75acff",
+      "7c0643e4d02421d06d7ca71822a94e1d",
+  };
+  return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+  static const char* const kDigest[] = {
+      "5ae8d64b65a671301a457b8a73368ab5",
+      "61535217f179054d4b76a8d9352a223d",
+      "1aa6614773570e7b021cd509849c4180",
+      "" /*kBlock4x16*/,
+      "61535217f179054d4b76a8d9352a223d",
+      "1aa6614773570e7b021cd509849c4180",
+      "f04c2825cfb6408c7778658f71fa176e",
+      "e1694ea1f026dac7fe7e86a84482cf86",
+      "1aa6614773570e7b021cd509849c4180",
+      "f04c2825cfb6408c7778658f71fa176e",
+      "e1694ea1f026dac7fe7e86a84482cf86",
+      "9c4855d44c013fbddb373b2e9e311080",
+      "e1694ea1f026dac7fe7e86a84482cf86",
+      "9c4855d44c013fbddb373b2e9e311080",
+      "f510e743c3efe3b83374a98ef8a30838",
+      "b6e0bd03c521c5f00e90530daa7d4432",
+      "b6e0bd03c521c5f00e90530daa7d4432",
+      "3270d7f621d488aec5b76bcf121debd0",
+
+      // mask_is_inverse = true.
+      "9aa00fcfe21b71e30c5393699122a020",
+      "4d8ce33262cf6b5375f363530815189a",
+      "428625c51ac1bd4585988f7b36dff1db",
+      "" /*kBlock4x16*/,
+      "4d8ce33262cf6b5375f363530815189a",
+      "428625c51ac1bd4585988f7b36dff1db",
+      "1ef63c06a2d9c42da293fdf924032981",
+      "5dd3f201d755d1c22c126a633bfbb3c0",
+      "428625c51ac1bd4585988f7b36dff1db",
+      "1ef63c06a2d9c42da293fdf924032981",
+      "5dd3f201d755d1c22c126a633bfbb3c0",
+      "fe1e6843e6f214939da516dcbea04a79",
+      "5dd3f201d755d1c22c126a633bfbb3c0",
+      "fe1e6843e6f214939da516dcbea04a79",
+      "240187f27389b5e89f9ec6bdbd7d20a7",
+      "44925dab01011a98b8ab1f0308fa852a",
+      "44925dab01011a98b8ab1f0308fa852a",
+      "6d984b2ccfa056278e2130771127a943",
+  };
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+  static const char* const kDigest[] = {
+      "57629d3872fd52ff4bbec439c5517ec5",
+      "dba421ceeb534756c77167e00ae91a2c",
+      "72e8ac1d450ef0c6c6b03e93856d5cc2",
+      "" /*kBlock4x16*/,
+      "dba421ceeb534756c77167e00ae91a2c",
+      "72e8ac1d450ef0c6c6b03e93856d5cc2",
+      "ae573eb368df04e6a0133b4e15471728",
+      "ceede597b2729357b15e0d08bb9bb760",
+      "72e8ac1d450ef0c6c6b03e93856d5cc2",
+      "ae573eb368df04e6a0133b4e15471728",
+      "ceede597b2729357b15e0d08bb9bb760",
+      "c4976af803d7ad3f92ef26f25b9f3754",
+      "ceede597b2729357b15e0d08bb9bb760",
+      "c4976af803d7ad3f92ef26f25b9f3754",
+      "1d957d49f71bb7f304705a11a597f0cb",
+      "9522d5713fb951b79f42d78fbff914cf",
+      "9522d5713fb951b79f42d78fbff914cf",
+      "422c046013f79a9f46e2c855967570ba",
+
+      // mask_is_inverse = true.
+      "a585cca9bc459d10e081bc0eb847b6e3",
+      "2fa4ec5f74fad2831d216c51c2cdad5a",
+      "d6c9ac69a9eb3059f5bb6e42b486ebcd",
+      "" /*kBlock4x16*/,
+      "2fa4ec5f74fad2831d216c51c2cdad5a",
+      "d6c9ac69a9eb3059f5bb6e42b486ebcd",
+      "2ddd8c8a1841501964011030e2557e20",
+      "97ef2575023dda008711015cf08d7590",
+      "d6c9ac69a9eb3059f5bb6e42b486ebcd",
+      "2ddd8c8a1841501964011030e2557e20",
+      "97ef2575023dda008711015cf08d7590",
+      "d69aff1e0d43395ce305c9be0dfb4c89",
+      "97ef2575023dda008711015cf08d7590",
+      "d69aff1e0d43395ce305c9be0dfb4c89",
+      "48786f640191dcbee5b3321672778519",
+      "6ad4718230353440b01f2bb78348157e",
+      "6ad4718230353440b01f2bb78348157e",
+      "ad49bd7af0ea17c84f434c7dfd0a911d",
+  };
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+struct WeightMaskTestParam {
+  WeightMaskTestParam(int width, int height, bool mask_is_inverse)
+      : width(width), height(height), mask_is_inverse(mask_is_inverse) {}
+  int width;
+  int height;
+  bool mask_is_inverse;
+};
+
+std::ostream& operator<<(std::ostream& os, const WeightMaskTestParam& param) {
+  return os << param.width << "x" << param.height
+            << ", mask_is_inverse: " << param.mask_is_inverse;
+}
+
+template <int bitdepth>
+class WeightMaskTest : public testing::TestWithParam<WeightMaskTestParam>,
+                       public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  WeightMaskTest() = default;
+  ~WeightMaskTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    WeightMaskInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    const int width_index = FloorLog2(width_) - 3;
+    const int height_index = FloorLog2(height_) - 3;
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      WeightMaskInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      WeightMaskInit_SSE4_1();
+    }
+    func_ = dsp->weight_mask[width_index][height_index][mask_is_inverse_];
+  }
+
+ protected:
+  void SetInputData(bool use_fixed_values, int value_1, int value_2);
+  void Test(int num_runs, bool use_fixed_values, int value_1, int value_2);
+
+ private:
+  const int width_ = GetParam().width;
+  const int height_ = GetParam().height;
+  const bool mask_is_inverse_ = GetParam().mask_is_inverse;
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  alignas(
+      kMaxAlignment) PredType block_1_[kMaxPredictionSize * kMaxPredictionSize];
+  alignas(
+      kMaxAlignment) PredType block_2_[kMaxPredictionSize * kMaxPredictionSize];
+  uint8_t mask_[kMaxPredictionSize * kMaxPredictionSize] = {};
+  dsp::WeightMaskFunc func_;
+};
+
+template <int bitdepth>
+void WeightMaskTest<bitdepth>::SetInputData(const bool use_fixed_values,
+                                            const int value_1,
+                                            const int value_2) {
+  if (use_fixed_values) {
+    std::fill(block_1_, block_1_ + kMaxPredictionSize * kMaxPredictionSize,
+              value_1);
+    std::fill(block_2_, block_2_ + kMaxPredictionSize * kMaxPredictionSize,
+              value_2);
+  } else {
+    constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    for (int y = 0; y < height_; ++y) {
+      for (int x = 0; x < width_; ++x) {
+        const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+        const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+        block_1_[y * width_ + x] =
+            static_cast<PredType>(rnd(max_val - min_val) + min_val);
+        block_2_[y * width_ + x] =
+            static_cast<PredType>(rnd(max_val - min_val) + min_val);
+      }
+    }
+  }
+}
+
+BlockSize DimensionsToBlockSize(int width, int height) {
+  if (width == 4) {
+    if (height == 4) return kBlock4x4;
+    if (height == 8) return kBlock4x8;
+    if (height == 16) return kBlock4x16;
+    return kBlockInvalid;
+  }
+  if (width == 8) {
+    if (height == 4) return kBlock8x4;
+    if (height == 8) return kBlock8x8;
+    if (height == 16) return kBlock8x16;
+    if (height == 32) return kBlock8x32;
+    return kBlockInvalid;
+  }
+  if (width == 16) {
+    if (height == 4) return kBlock16x4;
+    if (height == 8) return kBlock16x8;
+    if (height == 16) return kBlock16x16;
+    if (height == 32) return kBlock16x32;
+    if (height == 64) return kBlock16x64;
+    return kBlockInvalid;
+  }
+  if (width == 32) {
+    if (height == 8) return kBlock32x8;
+    if (height == 16) return kBlock32x16;
+    if (height == 32) return kBlock32x32;
+    if (height == 64) return kBlock32x64;
+    return kBlockInvalid;
+  }
+  if (width == 64) {
+    if (height == 16) return kBlock64x16;
+    if (height == 32) return kBlock64x32;
+    if (height == 64) return kBlock64x64;
+    if (height == 128) return kBlock64x128;
+    return kBlockInvalid;
+  }
+  if (width == 128) {
+    if (height == 64) return kBlock128x64;
+    if (height == 128) return kBlock128x128;
+    return kBlockInvalid;
+  }
+  return kBlockInvalid;
+}
+
+template <int bitdepth>
+void WeightMaskTest<bitdepth>::Test(const int num_runs,
+                                    const bool use_fixed_values,
+                                    const int value_1, const int value_2) {
+  if (func_ == nullptr) return;
+  SetInputData(use_fixed_values, value_1, value_2);
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    func_(block_1_, block_2_, mask_, width_);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  if (use_fixed_values) {
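+    // With identical inputs the absolute difference is 0, so the
+    // diff-weighted mask formula bottoms out at its additive bias of 38; with
+    // inputs at opposite extremes it saturates at the maximum weight of 64
+    // (see the mask computation in src/dsp/weight_mask.cc).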
+    int fixed_value = (value_1 - value_2 == 0) ? 38 : 64;
+    if (mask_is_inverse_) fixed_value = 64 - fixed_value;
+    for (int y = 0; y < height_; ++y) {
+      for (int x = 0; x < width_; ++x) {
+        ASSERT_EQ(static_cast<int>(mask_[y * width_ + x]), fixed_value)
+            << "x: " << x << " y: " << y;
+      }
+    }
+  } else {
+    const int id_offset = mask_is_inverse_ ? kMaxBlockSizes - 4 : 0;
+    const int id = id_offset +
+                   static_cast<int>(DimensionsToBlockSize(width_, height_)) - 4;
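+    // BlockSize values below kBlock8x8 (4x4, 4x8, 4x16, 8x4) have no
+    // weight_mask entries, hence the -4. The kBlock16x4 digest slots are
+    // empty because no 16x4 parameter is tested.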
+    const char* expected_digest = nullptr;
+    switch (bitdepth) {
+      case 8:
+        expected_digest = GetDigest8bpp(id);
+        break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      case 10:
+        expected_digest = GetDigest10bpp(id);
+        break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+      case 12:
+        expected_digest = GetDigest12bpp(id);
+        break;
+#endif
+    }
+    ASSERT_NE(expected_digest, nullptr);
+    test_utils::CheckMd5Digest(
+        absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(),
+        "WeightMask", expected_digest, mask_, sizeof(mask_), elapsed_time);
+  }
+}
+
+const WeightMaskTestParam weight_mask_test_param[] = {
+    WeightMaskTestParam(8, 8, false),     WeightMaskTestParam(8, 16, false),
+    WeightMaskTestParam(8, 32, false),    WeightMaskTestParam(16, 8, false),
+    WeightMaskTestParam(16, 16, false),   WeightMaskTestParam(16, 32, false),
+    WeightMaskTestParam(16, 64, false),   WeightMaskTestParam(32, 8, false),
+    WeightMaskTestParam(32, 16, false),   WeightMaskTestParam(32, 32, false),
+    WeightMaskTestParam(32, 64, false),   WeightMaskTestParam(64, 16, false),
+    WeightMaskTestParam(64, 32, false),   WeightMaskTestParam(64, 64, false),
+    WeightMaskTestParam(64, 128, false),  WeightMaskTestParam(128, 64, false),
+    WeightMaskTestParam(128, 128, false), WeightMaskTestParam(8, 8, true),
+    WeightMaskTestParam(8, 16, true),     WeightMaskTestParam(8, 32, true),
+    WeightMaskTestParam(16, 8, true),     WeightMaskTestParam(16, 16, true),
+    WeightMaskTestParam(16, 32, true),    WeightMaskTestParam(16, 64, true),
+    WeightMaskTestParam(32, 8, true),     WeightMaskTestParam(32, 16, true),
+    WeightMaskTestParam(32, 32, true),    WeightMaskTestParam(32, 64, true),
+    WeightMaskTestParam(64, 16, true),    WeightMaskTestParam(64, 32, true),
+    WeightMaskTestParam(64, 64, true),    WeightMaskTestParam(64, 128, true),
+    WeightMaskTestParam(128, 64, true),   WeightMaskTestParam(128, 128, true),
+};
+
+using WeightMaskTest8bpp = WeightMaskTest<8>;
+
+TEST_P(WeightMaskTest8bpp, FixedValues) {
+  const int min = kCompoundPredictionRange[0][0];
+  const int max = kCompoundPredictionRange[0][1];
+  Test(1, true, min, min);
+  Test(1, true, min, max);
+  Test(1, true, max, min);
+  Test(1, true, max, max);
+}
+
+TEST_P(WeightMaskTest8bpp, RandomValues) { Test(1, false, -1, -1); }
+
+TEST_P(WeightMaskTest8bpp, DISABLED_Speed) {
+  Test(kNumSpeedTests, false, -1, -1);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest8bpp,
+                         testing::ValuesIn(weight_mask_test_param));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest8bpp,
+                         testing::ValuesIn(weight_mask_test_param));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest8bpp,
+                         testing::ValuesIn(weight_mask_test_param));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using WeightMaskTest10bpp = WeightMaskTest<10>;
+
+TEST_P(WeightMaskTest10bpp, FixedValues) {
+  const int min = kCompoundPredictionRange[1][0];
+  const int max = kCompoundPredictionRange[1][1];
+  Test(1, true, min, min);
+  Test(1, true, min, max);
+  Test(1, true, max, min);
+  Test(1, true, max, max);
+}
+
+TEST_P(WeightMaskTest10bpp, RandomValues) { Test(1, false, -1, -1); }
+
+TEST_P(WeightMaskTest10bpp, DISABLED_Speed) {
+  Test(kNumSpeedTests, false, -1, -1);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest10bpp,
+                         testing::ValuesIn(weight_mask_test_param));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest10bpp,
+                         testing::ValuesIn(weight_mask_test_param));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest10bpp,
+                         testing::ValuesIn(weight_mask_test_param));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using WeightMaskTest12bpp = WeightMaskTest<12>;
+
+TEST_P(WeightMaskTest12bpp, FixedValues) {
+  const int min = kCompoundPredictionRange[2][0];
+  const int max = kCompoundPredictionRange[2][1];
+  Test(1, true, min, min);
+  Test(1, true, min, max);
+  Test(1, true, max, min);
+  Test(1, true, max, max);
+}
+
+TEST_P(WeightMaskTest12bpp, RandomValues) { Test(1, false, -1, -1); }
+
+TEST_P(WeightMaskTest12bpp, DISABLED_Speed) {
+  Test(kNumSpeedTests, false, -1, -1);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest12bpp,
+                         testing::ValuesIn(weight_mask_test_param));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
new file mode 100644
index 0000000..c08b3d6
--- /dev/null
@@ -0,0 +1,396 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
+
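+// Scalar sketch of the blends below (for orientation; this file vectorizes
+// the same arithmetic): each output pixel is
+//   dest[x] = RightShiftWithRounding(pred_0[x] + pred_1[x],
+//                                    kInterPostRoundBit + 1);
+// packed to 8 bits with unsigned saturation (_mm_packus_epi16).
+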
+inline void AverageBlend4x4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                               const int16_t* LIBGAV1_RESTRICT prediction_1,
+                               uint8_t* LIBGAV1_RESTRICT dest,
+                               const ptrdiff_t dest_stride) {
+  const __m128i pred_00 = LoadAligned16(prediction_0);
+  const __m128i pred_10 = LoadAligned16(prediction_1);
+  __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+  res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+  const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+  const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+  __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+  res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+  const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+  Store4(dest, result_pixels);
+  dest += dest_stride;
+  const int result_1 = _mm_extract_epi32(result_pixels, 1);
+  memcpy(dest, &result_1, sizeof(result_1));
+  dest += dest_stride;
+  const int result_2 = _mm_extract_epi32(result_pixels, 2);
+  memcpy(dest, &result_2, sizeof(result_2));
+  dest += dest_stride;
+  const int result_3 = _mm_extract_epi32(result_pixels, 3);
+  memcpy(dest, &result_3, sizeof(result_3));
+}
+
+inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                             const int16_t* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT dest,
+                             const ptrdiff_t dest_stride) {
+  const __m128i pred_00 = LoadAligned16(prediction_0);
+  const __m128i pred_10 = LoadAligned16(prediction_1);
+  __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+  res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+  const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+  const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+  __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+  res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+  const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+  StoreLo8(dest, result_pixels);
+  StoreHi8(dest + dest_stride, result_pixels);
+}
+
+inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                                 const int16_t* LIBGAV1_RESTRICT prediction_1,
+                                 const int width,
+                                 uint8_t* LIBGAV1_RESTRICT dest) {
+  int x = 0;
+  do {
+    const __m128i pred_00 = LoadAligned16(&prediction_0[x]);
+    const __m128i pred_01 = LoadAligned16(&prediction_1[x]);
+    __m128i res0 = _mm_add_epi16(pred_00, pred_01);
+    res0 = RightShiftWithRounding_S16(res0, kInterPostRoundBit + 1);
+    const __m128i pred_10 = LoadAligned16(&prediction_0[x + 8]);
+    const __m128i pred_11 = LoadAligned16(&prediction_1[x + 8]);
+    __m128i res1 = _mm_add_epi16(pred_10, pred_11);
+    res1 = RightShiftWithRounding_S16(res1, kInterPostRoundBit + 1);
+    StoreUnaligned16(dest + x, _mm_packus_epi16(res0, res1));
+    x += 16;
+  } while (x < width);
+}
+
+void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                         const void* LIBGAV1_RESTRICT prediction_1,
+                         const int width, const int height,
+                         void* LIBGAV1_RESTRICT const dest,
+                         const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y = height;
+
+  if (width == 4) {
+    const ptrdiff_t dest_stride4 = dest_stride << 2;
+    constexpr ptrdiff_t width4 = 4 << 2;
+    do {
+      AverageBlend4x4Row(pred_0, pred_1, dst, dest_stride);
+      dst += dest_stride4;
+      pred_0 += width4;
+      pred_1 += width4;
+
+      y -= 4;
+    } while (y != 0);
+    return;
+  }
+
+  if (width == 8) {
+    const ptrdiff_t dest_stride2 = dest_stride << 1;
+    constexpr ptrdiff_t width2 = 8 << 1;
+    do {
+      AverageBlend8Row(pred_0, pred_1, dst, dest_stride);
+      dst += dest_stride2;
+      pred_0 += width2;
+      pred_1 += width2;
+
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  do {
+    AverageBlendLargeRow(pred_0, pred_1, width, dst);
+    dst += dest_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    AverageBlendLargeRow(pred_0, pred_1, width, dst);
+    dst += dest_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    y -= 2;
+  } while (y != 0);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(AverageBlend)
+  dsp->average_blend = AverageBlend_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kInterPostRoundBitPlusOne = 5;
+
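+// Scalar sketch of the 10bpp blend below (for orientation): each compound
+// prediction carries kCompoundOffset, so every output pixel is
+//   Clip3(RightShiftWithRounding(
+//             pred_0[x] + pred_1[x] - 2 * kCompoundOffset,
+//             kInterPostRoundBitPlusOne),
+//         0, (1 << kBitdepth10) - 1);
+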
+template <const int width, const int offset>
+inline void AverageBlendRow(const uint16_t* LIBGAV1_RESTRICT prediction_0,
+                            const uint16_t* LIBGAV1_RESTRICT prediction_1,
+                            const __m128i& compound_offset,
+                            const __m128i& round_offset, const __m128i& max,
+                            const __m128i& zero, uint16_t* LIBGAV1_RESTRICT dst,
+                            const ptrdiff_t dest_stride) {
+  // pred_0/1 max range is 16b.
+  const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset);
+  const __m128i pred_1 = LoadUnaligned16(prediction_1 + offset);
+  const __m128i pred_00 = _mm_cvtepu16_epi32(pred_0);
+  const __m128i pred_01 = _mm_unpackhi_epi16(pred_0, zero);
+  const __m128i pred_10 = _mm_cvtepu16_epi32(pred_1);
+  const __m128i pred_11 = _mm_unpackhi_epi16(pred_1, zero);
+
+  const __m128i pred_add_0 = _mm_add_epi32(pred_00, pred_10);
+  const __m128i pred_add_1 = _mm_add_epi32(pred_01, pred_11);
+  const __m128i compound_offset_0 = _mm_sub_epi32(pred_add_0, compound_offset);
+  const __m128i compound_offset_1 = _mm_sub_epi32(pred_add_1, compound_offset);
+  // RightShiftWithRounding and Clip3.
+  const __m128i round_0 = _mm_add_epi32(compound_offset_0, round_offset);
+  const __m128i round_1 = _mm_add_epi32(compound_offset_1, round_offset);
+  const __m128i res_0 = _mm_srai_epi32(round_0, kInterPostRoundBitPlusOne);
+  const __m128i res_1 = _mm_srai_epi32(round_1, kInterPostRoundBitPlusOne);
+  const __m128i result = _mm_min_epi16(_mm_packus_epi32(res_0, res_1), max);
+  if (width != 4) {
+    // Store width=8/16/32/64/128.
+    StoreUnaligned16(dst + offset, result);
+    return;
+  }
+  assert(width == 4);
+  StoreLo8(dst, result);
+  StoreHi8(dst + dest_stride, result);
+}
+
+void AverageBlend10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                              const void* LIBGAV1_RESTRICT prediction_1,
+                              const int width, const int height,
+                              void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t dst_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  const __m128i compound_offset =
+      _mm_set1_epi32(kCompoundOffset + kCompoundOffset);
+  const __m128i round_offset =
+      _mm_set1_epi32((1 << kInterPostRoundBitPlusOne) >> 1);
+  const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+  const __m128i zero = _mm_setzero_si128();
+  int y = height;
+
+  if (width == 4) {
+    const ptrdiff_t dest_stride2 = dest_stride << 1;
+    const ptrdiff_t width2 = width << 1;
+    do {
+      // row0,1
+      AverageBlendRow<4, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      dst += dest_stride2;
+      pred_0 += width2;
+      pred_1 += width2;
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+  if (width == 8) {
+    const ptrdiff_t dest_stride2 = dest_stride << 1;
+    const ptrdiff_t width2 = width << 1;
+    do {
+      // row0.
+      AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      // row1.
+      AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+                            round_offset, max, zero, dst + dest_stride,
+                            dest_stride);
+      dst += dest_stride2;
+      pred_0 += width2;
+      pred_1 += width2;
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+  if (width == 16) {
+    const ptrdiff_t dest_stride2 = dest_stride << 1;
+    const ptrdiff_t width2 = width << 1;
+    do {
+      // row0.
+      AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      // row1.
+      AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+                            round_offset, max, zero, dst + dest_stride,
+                            dest_stride);
+      AverageBlendRow<8, 8>(pred_0 + width, pred_1 + width, compound_offset,
+                            round_offset, max, zero, dst + dest_stride,
+                            dest_stride);
+      dst += dest_stride2;
+      pred_0 += width2;
+      pred_1 += width2;
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+  if (width == 32) {
+    do {
+      // pred [0 - 15].
+      AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      // pred [16 - 31].
+      AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      dst += dest_stride;
+      pred_0 += width;
+      pred_1 += width;
+    } while (--y != 0);
+    return;
+  }
+  if (width == 64) {
+    do {
+      // pred [0 - 31].
+      AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      // pred [32 - 63].
+      AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      dst += dest_stride;
+      pred_0 += width;
+      pred_1 += width;
+    } while (--y != 0);
+    return;
+  }
+  assert(width == 128);
+  do {
+    // pred [0 - 31].
+    AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                          zero, dst, dest_stride);
+    AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+                          zero, dst, dest_stride);
+    AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    // pred [32 - 63].
+    AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+
+    // pred [64 - 95].
+    AverageBlendRow<8, 64>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 72>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 80>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 88>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    // pred [96 - 127].
+    AverageBlendRow<8, 96>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 104>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+    AverageBlendRow<8, 112>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+    AverageBlendRow<8, 120>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+    dst += dest_stride;
+    pred_0 += width;
+    pred_1 += width;
+  } while (--y != 0);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(AverageBlend)
+  dsp->average_blend = AverageBlend10bpp_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void AverageBlendInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void AverageBlendInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/average_blend_sse4.h b/src/dsp/x86/average_blend_sse4.h
new file mode 100644
index 0000000..cd07112
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_AverageBlend
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+#define LIBGAV1_Dsp10bpp_AverageBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
diff --git a/src/dsp/x86/cdef_avx2.cc b/src/dsp/x86/cdef_avx2.cc
new file mode 100644
index 0000000..01a2b9f
--- /dev/null
@@ -0,0 +1,788 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds elements 1 3 5 7 7 7 7 7
+alignas(32) constexpr uint32_t kCdefDivisionTableOddPairsPadded[] = {
+    420, 210, 140, 105, 420, 210, 140, 105,
+    105, 105, 105, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+//   for (int j = 0; j < 8; ++j) {
+//     const int x = 1;
+//     partial[0][i + j] += x;
+//     partial[1][i + j / 2] += x;
+//     partial[2][i] += x;
+//     partial[3][3 + i - j / 2] += x;
+//     partial[4][7 + i - j] += x;
+//     partial[5][3 - i / 2 + j] += x;
+//     partial[6][j] += x;
+//     partial[7][i / 2 + j] += x;
+//   }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
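+// For example, in AddPartial_D0_D4() below, row 1 is shifted left by one
+// 16-bit lane (_mm256_slli_si256(v_src_16[1], 2)) so that element 10 lines up
+// under element 01 before the add, matching partial[0][i + j].
+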
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16  17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25  26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34  35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43  44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52  53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61  62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70  71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m256i* v_src_16,
+                                            __m256i* partial_lo,
+                                            __m256i* partial_hi) {
+  // 00 01 02 03 04 05 06 07
+  *partial_lo = v_src_16[0];
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = _mm256_setzero_si256();
+
+  // 00 10 11 12 13 14 15 16
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[1], 2));
+  // 17 00 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[1], 14));
+
+  // 00 00 20 21 22 23 24 25
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[2], 4));
+  // 26 27 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[2], 12));
+
+  // 00 00 00 30 31 32 33 34
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[3], 6));
+  // 35 36 37 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[3], 10));
+
+  // 00 00 00 00 40 41 42 43
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[4], 8));
+  // 44 45 46 47 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[4], 8));
+
+  // 00 00 00 00 00 50 51 52
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[5], 10));
+  // 53 54 55 56 57 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[5], 6));
+
+  // 00 00 00 00 00 00 60 61
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[6], 12));
+  // 62 63 64 65 66 67 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[6], 4));
+
+  // 00 00 00 00 00 00 00 70
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[7], 14));
+  // 71 72 73 74 75 76 77 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00  00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00  00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00  00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00  00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3  00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2  F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1  G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0  H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m256i* v_src_16,
+                                            __m256i* partial_lo,
+                                            __m256i* partial_hi) {
+  __m256i v_d1_temp[8];
+  const __m256i v_zero = _mm256_setzero_si256();
+
+  for (int i = 0; i < 8; ++i) {
+    v_d1_temp[i] = _mm256_hadd_epi16(v_src_16[i], v_zero);
+  }
+
+  *partial_lo = *partial_hi = v_zero;
+  // A0 A1 A2 A3 00 00 00 00
+  *partial_lo = _mm256_add_epi16(*partial_lo, v_d1_temp[0]);
+
+  // 00 B0 B1 B2 B3 00 00 00
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[1], 2));
+
+  // 00 00 C0 C1 C2 C3 00 00
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[2], 4));
+  // 00 00 00 D0 D1 D2 D3 00
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[3], 6));
+  // 00 00 00 00 E0 E1 E2 E3
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[4], 8));
+
+  // 00 00 00 00 00 F0 F1 F2
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[5], 10));
+  // F3 00 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[5], 6));
+
+  // 00 00 00 00 00 00 G0 G1
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[6], 12));
+  // G2 G3 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[6], 4));
+
+  // 00 00 00 00 00 00 00 H0
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[7], 14));
+  // H1 H2 H3 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26  27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36  37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45  46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55  56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64  65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74  75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo,
+                                            __m256i* partial_hi) {
+  __m256i v_pair_add[4];
+  // Add vertical source pairs.
+  v_pair_add[0] = _mm256_add_epi16(v_src[0], v_src[1]);
+  v_pair_add[1] = _mm256_add_epi16(v_src[2], v_src[3]);
+  v_pair_add[2] = _mm256_add_epi16(v_src[4], v_src[5]);
+  v_pair_add[3] = _mm256_add_epi16(v_src[6], v_src[7]);
+
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  *partial_lo = v_pair_add[0];
+  // 00 00 00 00 00 00 00 00
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = _mm256_setzero_si256();
+
+  // 00 20 21 22 23 24 25 26
+  // 00 30 31 32 33 34 35 36
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[1], 2));
+  // 27 00 00 00 00 00 00 00
+  // 37 00 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[1], 14));
+
+  // 00 00 40 41 42 43 44 45
+  // 00 00 50 51 52 53 54 55
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[2], 4));
+  // 46 47 00 00 00 00 00 00
+  // 56 57 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[2], 12));
+
+  // 00 00 00 60 61 62 63 64
+  // 00 00 00 70 71 72 73 74
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[3], 6));
+  // 65 66 67 00 00 00 00 00
+  // 75 76 77 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+                                      ptrdiff_t stride, __m256i* partial) {
+  // 8x8 input
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // 40 41 42 43 44 45 46 47
+  // 50 51 52 53 54 55 56 57
+  // 60 61 62 63 64 65 66 67
+  // 70 71 72 73 74 75 76 77
+  __m256i v_src[8];
+  for (auto& i : v_src) {
+    i = _mm256_castsi128_si256(LoadLo8(src));
+    // Dup lower lane.
+    i = _mm256_permute2x128_si256(i, i, 0x0);
+    src += stride;
+  }
+
+  const __m256i v_zero = _mm256_setzero_si256();
+  // partial for direction 2
+  // --------------------------------------------------------------------------
+  // partial[2][i] += x;
+  // 00 10 20 30 40 50 60 70  xx xx xx xx xx xx xx xx
+  // 01 11 21 31 41 51 61 71  xx xx xx xx xx xx xx xx
+  // 02 12 22 32 42 52 62 72  xx xx xx xx xx xx xx xx
+  // 03 13 23 33 43 53 63 73  xx xx xx xx xx xx xx xx
+  // 04 14 24 34 44 54 64 74  xx xx xx xx xx xx xx xx
+  // 05 15 25 35 45 55 65 75  xx xx xx xx xx xx xx xx
+  // 06 16 26 36 46 56 66 76  xx xx xx xx xx xx xx xx
+  // 07 17 27 37 47 57 67 77  xx xx xx xx xx xx xx xx
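+  // _mm256_sad_epu8 against zero returns, in each 64-bit group, the sum of
+  // that group's eight bytes, i.e. one row sum per half-lane; the unpacks
+  // below interleave those eight row sums into partial[2].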
+  const __m256i v_src_4_0 = _mm256_unpacklo_epi64(v_src[0], v_src[4]);
+  const __m256i v_src_5_1 = _mm256_unpacklo_epi64(v_src[1], v_src[5]);
+  const __m256i v_src_6_2 = _mm256_unpacklo_epi64(v_src[2], v_src[6]);
+  const __m256i v_src_7_3 = _mm256_unpacklo_epi64(v_src[3], v_src[7]);
+  const __m256i v_hsum_4_0 = _mm256_sad_epu8(v_src_4_0, v_zero);
+  const __m256i v_hsum_5_1 = _mm256_sad_epu8(v_src_5_1, v_zero);
+  const __m256i v_hsum_6_2 = _mm256_sad_epu8(v_src_6_2, v_zero);
+  const __m256i v_hsum_7_3 = _mm256_sad_epu8(v_src_7_3, v_zero);
+  const __m256i v_hsum_1_0 = _mm256_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m256i v_hsum_3_2 = _mm256_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+  const __m256i v_hsum_5_4 = _mm256_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m256i v_hsum_7_6 = _mm256_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+  partial[2] =
+      _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+                            _mm256_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+  const __m256i extend_reverse = SetrM128i(
+      _mm_set_epi32(static_cast<int>(0x80078006), static_cast<int>(0x80058004),
+                    static_cast<int>(0x80038002), static_cast<int>(0x80018000)),
+      _mm_set_epi32(static_cast<int>(0x80008001), static_cast<int>(0x80028003),
+                    static_cast<int>(0x80048005),
+                    static_cast<int>(0x80068007)));
+
+  for (auto& i : v_src) {
+    // Zero extend unsigned 8 to 16. The upper lane is reversed.
+    i = _mm256_shuffle_epi8(i, extend_reverse);
+  }
+
+  // partial for direction 6
+  // --------------------------------------------------------------------------
+  // partial[6][j] += x;
+  // 00 01 02 03 04 05 06 07  xx xx xx xx xx xx xx xx
+  // 10 11 12 13 14 15 16 17  xx xx xx xx xx xx xx xx
+  // 20 21 22 23 24 25 26 27  xx xx xx xx xx xx xx xx
+  // 30 31 32 33 34 35 36 37  xx xx xx xx xx xx xx xx
+  // 40 41 42 43 44 45 46 47  xx xx xx xx xx xx xx xx
+  // 50 51 52 53 54 55 56 57  xx xx xx xx xx xx xx xx
+  // 60 61 62 63 64 65 66 67  xx xx xx xx xx xx xx xx
+  // 70 71 72 73 74 75 76 77  xx xx xx xx xx xx xx xx
+  partial[6] = v_src[0];
+  for (int i = 1; i < 8; ++i) {
+    partial[6] = _mm256_add_epi16(partial[6], v_src[i]);
+  }
+
+  AddPartial_D0_D4(v_src, &partial[0], &partial[4]);
+  AddPartial_D1_D3(v_src, &partial[1], &partial[3]);
+  AddPartial_D7_D5(v_src, &partial[7], &partial[5]);
+}
+
+inline __m256i SumVectorPair_S32(__m256i a) {
+  a = _mm256_hadd_epi32(a, a);
+  a = _mm256_add_epi32(a, _mm256_srli_si256(a, 4));
+  return a;
+}
+
+// |cost[0]| and |cost[4]| square the input and sum with the corresponding
+// element from the other end of the vector:
+// |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+//             kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+inline void Cost0Or4_Pair(uint32_t* cost, const __m256i partial_0,
+                          const __m256i partial_4,
+                          const __m256i division_table) {
+  const __m256i division_table_0 =
+      _mm256_permute2x128_si256(division_table, division_table, 0x0);
+  const __m256i division_table_1 =
+      _mm256_permute2x128_si256(division_table, division_table, 0x11);
+
+  // partial_lo
+  const __m256i a = partial_0;
+  // partial_hi
+  const __m256i b = partial_4;
+
+  // Reverse and clear upper 2 bytes.
+  const __m256i reverser = _mm256_broadcastsi128_si256(_mm_set_epi32(
+      static_cast<int>(0x80800100), 0x03020504, 0x07060908, 0x0b0a0d0c));
+
+  // 14 13 12 11 10 09 08 ZZ
+  const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+  // 00 14 01 13 02 12 03 11
+  const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+  // 04 10 05 09 06 08 07 ZZ
+  const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+  // Square(partial[0][i]) + Square(partial[0][14 - i])
+  const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+  const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+  const __m256i c = _mm256_mullo_epi32(square_lo, division_table_0);
+  const __m256i d = _mm256_mullo_epi32(square_hi, division_table_1);
+  const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+  // Copy upper 32bit sum to lower lane.
+  const __m128i sums =
+      _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+  cost[0] = _mm_cvtsi128_si32(sums);
+  cost[4] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+template <int index_a, int index_b>
+inline void CostOdd_Pair(uint32_t* cost, const __m256i partial_a,
+                         const __m256i partial_b,
+                         const __m256i division_table[2]) {
+  // partial_lo
+  const __m256i a = partial_a;
+  // partial_hi
+  const __m256i b = partial_b;
+
+  // Reverse and clear upper 10 bytes.
+  const __m256i reverser = _mm256_broadcastsi128_si256(
+      _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+                    static_cast<int>(0x80800100), 0x03020504));
+
+  // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+  const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+  // 00 10 01 09 02 08 03 ZZ
+  const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+  // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+  const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+  // Square(partial[0][i]) + Square(partial[0][14 - i])
+  const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+  const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+  const __m256i c = _mm256_mullo_epi32(square_lo, division_table[0]);
+  const __m256i d = _mm256_mullo_epi32(square_hi, division_table[1]);
+  const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+  // Copy upper 32bit sum to lower lane.
+  const __m128i sums =
+      _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+  cost[index_a] = _mm_cvtsi128_si32(sums);
+  cost[index_b] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a,
+                           const __m256i partial_b,
+                           const __m256i division_table) {
+  // The upper lane is a "don't care", so only use the lower lane for
+  // calculating cost.
+  const __m256i a = _mm256_permute2x128_si256(partial_a, partial_b, 0x20);
+
+  const __m256i square_a = _mm256_madd_epi16(a, a);
+  const __m256i b = _mm256_mullo_epi32(square_a, division_table);
+  const __m256i c = SumVectorPair_S32(b);
+  // Copy upper 32bit sum to lower lane.
+  const __m128i sums =
+      _mm256_castsi256_si128(_mm256_permute4x64_epi64(c, 0x08));
+  cost[2] = _mm_cvtsi128_si32(sums);
+  cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+void CdefDirection_AVX2(const void* LIBGAV1_RESTRICT const source,
+                        ptrdiff_t stride,
+                        uint8_t* LIBGAV1_RESTRICT const direction,
+                        int* LIBGAV1_RESTRICT const variance) {
+  assert(direction != nullptr);
+  assert(variance != nullptr);
+  const auto* src = static_cast<const uint8_t*>(source);
+  uint32_t cost[8];
+
+  // partial[0] = add partial 0,4 low
+  // partial[1] = add partial 1,3 low
+  // partial[2] = add partial 2 low
+  // partial[3] = add partial 1,3 high
+  // partial[4] = add partial 0,4 high
+  // partial[5] = add partial 7,5 high
+  // partial[6] = add partial 6 low
+  // partial[7] = add partial 7,5 low
+  __m256i partial[8];
+
+  AddPartial(src, stride, partial);
+
+  const __m256i division_table = LoadUnaligned32(kCdefDivisionTable);
+  const __m256i division_table_7 =
+      _mm256_broadcastd_epi32(_mm_cvtsi32_si128(kCdefDivisionTable[7]));
+
+  Cost2And6_Pair(cost, partial[2], partial[6], division_table_7);
+
+  Cost0Or4_Pair(cost, partial[0], partial[4], division_table);
+
+  const __m256i division_table_odd[2] = {
+      LoadUnaligned32(kCdefDivisionTableOddPairsPadded),
+      LoadUnaligned32(kCdefDivisionTableOddPairsPadded + 8)};
+
+  CostOdd_Pair<1, 3>(cost, partial[1], partial[3], division_table_odd);
+  CostOdd_Pair<7, 5>(cost, partial[7], partial[5], division_table_odd);
+
+  uint32_t best_cost = 0;
+  *direction = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      *direction = i;
+    }
+  }
+  *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+                          const ptrdiff_t stride, __m128i* output,
+                          const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each entry. For |direction| == 0 this gives a diagonal
+  // line from top right to bottom left. The first value is y, the second x.
+  // Negative y values move up.
+  //    a       b         c       d
+  // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+  //         c
+  //       a
+  //     0
+  //   b
+  // d
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+  output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+  output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+  output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
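+
+// Per output vector, the loads above are equivalent to this scalar sketch
+// (|dy| and |dx| are illustrative names, not upstream identifiers):
+//   const int dy = kCdefDirections[direction][k][0];
+//   const int dx = kCdefDirections[direction][k][1];
+//   output[2 * k + 0][i] = src[i - dy * stride - dx];  // negated offset
+//   output[2 * k + 1][i] = src[i + dy * stride + dx];
+// for k in {0, 1} and lane i in [0, 8).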
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t stride, __m128i* output,
+                    const int direction) {
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+                      src - y_0 * stride + stride - x_0);
+  output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+                      src + y_0 * stride + stride + x_0);
+  output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+                      src - y_1 * stride + stride - x_1);
+  output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+                      src + y_1 * stride + stride + x_1);
+}
+
+inline __m256i Constrain(const __m256i& pixel, const __m256i& reference,
+                         const __m128i& damping, const __m256i& threshold) {
+  const __m256i diff = _mm256_sub_epi16(pixel, reference);
+  const __m256i abs_diff = _mm256_abs_epi16(diff);
+  // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+  //                    0, std::abs(diff))
+  const __m256i shifted_diff = _mm256_srl_epi16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+  // larger than threshold, so subtracting with saturation returns 0 when
+  // pixel == kCdefLargeValue.
+  static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+  const __m256i thresh_minus_shifted_diff =
+      _mm256_subs_epu16(threshold, shifted_diff);
+  const __m256i clamp_abs_diff =
+      _mm256_min_epi16(thresh_minus_shifted_diff, abs_diff);
+  // Restore the sign.
+  return _mm256_sign_epi16(clamp_abs_diff, diff);
+}
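+
+// A scalar reference for the vector code above (illustrative sketch only;
+// ConstrainScalar is a hypothetical name):
+//   int ConstrainScalar(int pixel, int reference, int damping, int threshold) {
+//     const int diff = pixel - reference;
+//     const int abs_diff = std::abs(diff);
+//     const int clamped =
+//         std::min(std::max(threshold - (abs_diff >> damping), 0), abs_diff);
+//     return (diff < 0) ? -clamped : clamped;
+//   }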
+
+inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val,
+                                    const __m256i& tap, const __m128i& damping,
+                                    const __m256i& threshold) {
+  const __m256i constrained = Constrain(val, pixel, damping, threshold);
+  return _mm256_mullo_epi16(constrained, tap);
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_AVX2(const uint16_t* LIBGAV1_RESTRICT src,
+                     const ptrdiff_t src_stride, const int height,
+                     const int primary_strength, const int secondary_strength,
+                     const int damping, const int direction,
+                     void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
+  static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+  static_assert(enable_primary || enable_secondary, "");
+  constexpr bool clipping_required = enable_primary && enable_secondary;
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i primary_damping_shift, secondary_damping_shift;
+
+  // FloorLog2() requires input to be > 0.
+  // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+  if (enable_primary) {
+    // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+    // for UV filtering.
+    primary_damping_shift =
+        _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+  }
+  if (enable_secondary) {
+    // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+    // necessary.
+    assert(damping - FloorLog2(secondary_strength) >= 0);
+    secondary_damping_shift =
+        _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+  }
+  const __m256i primary_tap_0 = _mm256_broadcastw_epi16(
+      _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][0]));
+  const __m256i primary_tap_1 = _mm256_broadcastw_epi16(
+      _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][1]));
+  const __m256i secondary_tap_0 =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap0));
+  const __m256i secondary_tap_1 =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap1));
+  const __m256i cdef_large_value_mask = _mm256_broadcastw_epi16(
+      _mm_cvtsi32_si128(static_cast<int16_t>(~kCdefLargeValue)));
+  const __m256i primary_threshold =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(primary_strength));
+  const __m256i secondary_threshold =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(secondary_strength));
+
+  int y = height;
+  do {
+    __m128i pixel_128;
+    if (width == 8) {
+      pixel_128 = LoadUnaligned16(src);
+    } else {
+      pixel_128 = LoadHi8(LoadLo8(src), src + src_stride);
+    }
+
+    __m256i pixel = SetrM128i(pixel_128, pixel_128);
+
+    __m256i min = pixel;
+    __m256i max = pixel;
+    __m256i sum_pair;
+
+    if (enable_primary) {
+      // Primary |direction|.
+      __m128i primary_val_128[4];
+      if (width == 8) {
+        LoadDirection(src, src_stride, primary_val_128, direction);
+      } else {
+        LoadDirection4(src, src_stride, primary_val_128, direction);
+      }
+
+      __m256i primary_val[2];
+      primary_val[0] = SetrM128i(primary_val_128[0], primary_val_128[1]);
+      primary_val[1] = SetrM128i(primary_val_128[2], primary_val_128[3]);
+
+      if (clipping_required) {
+        min = _mm256_min_epu16(min, primary_val[0]);
+        min = _mm256_min_epu16(min, primary_val[1]);
+
+        // The source is 16 bits, but we only really care about the lower
+        // 8 bits; the upper 8 bits contain the "large" flag. After the final
+        // primary max has been calculated, zero out the upper 8 bits and use
+        // this to find the "16 bit" max.
+        const __m256i max_p01 = _mm256_max_epu8(primary_val[0], primary_val[1]);
+        max = _mm256_max_epu16(
+            max, _mm256_and_si256(max_p01, cdef_large_value_mask));
+      }
+
+      sum_pair = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+                                      primary_damping_shift, primary_threshold);
+      sum_pair = _mm256_add_epi16(
+          sum_pair,
+          ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_1,
+                               primary_damping_shift, primary_threshold));
+    } else {
+      sum_pair = _mm256_setzero_si256();
+    }
+
+    if (enable_secondary) {
+      // Secondary |direction| values (+/- 2). Clamp |direction|.
+      __m128i secondary_val_128[8];
+      if (width == 8) {
+        LoadDirection(src, src_stride, secondary_val_128, direction + 2);
+        LoadDirection(src, src_stride, secondary_val_128 + 4, direction - 2);
+      } else {
+        LoadDirection4(src, src_stride, secondary_val_128, direction + 2);
+        LoadDirection4(src, src_stride, secondary_val_128 + 4, direction - 2);
+      }
+
+      __m256i secondary_val[4];
+      secondary_val[0] = SetrM128i(secondary_val_128[0], secondary_val_128[1]);
+      secondary_val[1] = SetrM128i(secondary_val_128[2], secondary_val_128[3]);
+      secondary_val[2] = SetrM128i(secondary_val_128[4], secondary_val_128[5]);
+      secondary_val[3] = SetrM128i(secondary_val_128[6], secondary_val_128[7]);
+
+      if (clipping_required) {
+        min = _mm256_min_epu16(min, secondary_val[0]);
+        min = _mm256_min_epu16(min, secondary_val[1]);
+        min = _mm256_min_epu16(min, secondary_val[2]);
+        min = _mm256_min_epu16(min, secondary_val[3]);
+
+        const __m256i max_s01 =
+            _mm256_max_epu8(secondary_val[0], secondary_val[1]);
+        const __m256i max_s23 =
+            _mm256_max_epu8(secondary_val[2], secondary_val[3]);
+        const __m256i max_s = _mm256_max_epu8(max_s01, max_s23);
+        max = _mm256_max_epu8(max,
+                              _mm256_and_si256(max_s, cdef_large_value_mask));
+      }
+
+      sum_pair = _mm256_add_epi16(
+          sum_pair,
+          ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
+                               secondary_damping_shift, secondary_threshold));
+      sum_pair = _mm256_add_epi16(
+          sum_pair,
+          ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_1,
+                               secondary_damping_shift, secondary_threshold));
+      sum_pair = _mm256_add_epi16(
+          sum_pair,
+          ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_0,
+                               secondary_damping_shift, secondary_threshold));
+      sum_pair = _mm256_add_epi16(
+          sum_pair,
+          ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
+                               secondary_damping_shift, secondary_threshold));
+    }
+
+    __m128i sum = _mm_add_epi16(_mm256_castsi256_si128(sum_pair),
+                                _mm256_extracti128_si256(sum_pair, 1));
+
+    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
+    const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
+    // 8 + sum
+    sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
+    // (... - (sum < 0)) >> 4
+    sum = _mm_add_epi16(sum, sum_lt_0);
+    sum = _mm_srai_epi16(sum, 4);
+    // pixel + ...
+    sum = _mm_add_epi16(sum, _mm256_castsi256_si128(pixel));
+    if (clipping_required) {
+      const __m128i min_128 = _mm_min_epu16(_mm256_castsi256_si128(min),
+                                            _mm256_extracti128_si256(min, 1));
+
+      const __m128i max_128 = _mm_max_epu16(_mm256_castsi256_si128(max),
+                                            _mm256_extracti128_si256(max, 1));
+      // Clip3
+      sum = _mm_min_epi16(sum, max_128);
+      sum = _mm_max_epi16(sum, min_128);
+    }
+
+    const __m128i result = _mm_packus_epi16(sum, sum);
+    if (width == 8) {
+      src += src_stride;
+      StoreLo8(dst, result);
+      dst += dst_stride;
+      --y;
+    } else {
+      src += src_stride << 1;
+      Store4(dst, result);
+      dst += dst_stride;
+      Store4(dst, _mm_srli_si128(result, 4));
+      dst += dst_stride;
+      y -= 2;
+    }
+  } while (y != 0);
+}
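+
+// Per pixel, the filter above reduces to this scalar sketch: each primary and
+// secondary tap contributes tap * Constrain(neighbor, pixel, ...), then
+//   offset = (8 + sum - (sum < 0)) >> 4;
+//   dst = Clip3(pixel + offset, min_neighbor, max_neighbor);
+// where the clip is applied only when both primary and secondary taps are
+// enabled (|clipping_required|).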
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+  dsp->cdef_direction = CdefDirection_AVX2;
+
+  dsp->cdef_filters[0][0] = CdefFilter_AVX2<4>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_AVX2<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] = CdefFilter_AVX2<4, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_AVX2<8>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_AVX2<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] = CdefFilter_AVX2<8, /*enable_primary=*/false>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void CdefInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_AVX2() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/cdef_avx2.h b/src/dsp/x86/cdef_avx2.h
new file mode 100644 (file)
index 0000000..41f2d3f
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_AVX2();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_AVX2
+#endif
+
+#endif  // LIBGAV1_TARGETING_AVX2
+
+#endif  // LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc
new file mode 100644 (file)
index 0000000..6c48844
--- /dev/null
@@ -0,0 +1,734 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds elements 1 3 5 7 7 7 7 7
+alignas(16) constexpr uint32_t kCdefDivisionTableOddPadded[] = {
+    420, 210, 140, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+//   for (int j = 0; j < 8; ++j) {
+//     const int x = 1;
+//     partial[0][i + j] += x;
+//     partial[1][i + j / 2] += x;
+//     partial[2][i] += x;
+//     partial[3][3 + i - j / 2] += x;
+//     partial[4][7 + i - j] += x;
+//     partial[5][3 - i / 2 + j] += x;
+//     partial[6][j] += x;
+//     partial[7][i / 2 + j] += x;
+//   }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16  17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25  26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34  35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43  44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52  53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61  62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70  71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m128i* v_src_16,
+                                            __m128i* partial_lo,
+                                            __m128i* partial_hi) {
+  // 00 01 02 03 04 05 06 07
+  *partial_lo = v_src_16[0];
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = _mm_setzero_si128();
+
+  // 00 10 11 12 13 14 15 16
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[1], 2));
+  // 17 00 00 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[1], 14));
+
+  // 00 00 20 21 22 23 24 25
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[2], 4));
+  // 26 27 00 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[2], 12));
+
+  // 00 00 00 30 31 32 33 34
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[3], 6));
+  // 35 36 37 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[3], 10));
+
+  // 00 00 00 00 40 41 42 43
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[4], 8));
+  // 44 45 46 47 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[4], 8));
+
+  // 00 00 00 00 00 50 51 52
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[5], 10));
+  // 53 54 55 56 57 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[5], 6));
+
+  // 00 00 00 00 00 00 60 61
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[6], 12));
+  // 62 63 64 65 66 67 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[6], 4));
+
+  // 00 00 00 00 00 00 00 70
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[7], 14));
+  // 71 72 73 74 75 76 77 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[7], 2));
+}
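+
+// Equivalently (scalar sketch): the two outputs hold the 15 anti-diagonal
+// sums partial[0][k] = Sum over {i, j : i + j == k} of src[i][j], with
+// k = 0..7 in |partial_lo| and k = 8..14 in |partial_hi|.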
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00  00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00  00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00  00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00  00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3  00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2  F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1  G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0  H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m128i* v_src_16,
+                                            __m128i* partial_lo,
+                                            __m128i* partial_hi) {
+  __m128i v_d1_temp[8];
+  const __m128i v_zero = _mm_setzero_si128();
+
+  for (int i = 0; i < 8; ++i) {
+    v_d1_temp[i] = _mm_hadd_epi16(v_src_16[i], v_zero);
+  }
+
+  *partial_lo = *partial_hi = v_zero;
+  // A0 A1 A2 A3 00 00 00 00
+  *partial_lo = _mm_add_epi16(*partial_lo, v_d1_temp[0]);
+
+  // 00 B0 B1 B2 B3 00 00 00
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[1], 2));
+
+  // 00 00 C0 C1 C2 C3 00 00
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[2], 4));
+  // 00 00 00 D0 D1 D2 D3 00
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[3], 6));
+  // 00 00 00 00 E0 E1 E2 E3
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[4], 8));
+
+  // 00 00 00 00 00 F0 F1 F2
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[5], 10));
+  // F3 00 00 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[5], 6));
+
+  // 00 00 00 00 00 00 G0 G1
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[6], 12));
+  // G2 G3 00 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[6], 4));
+
+  // 00 00 00 00 00 00 00 H0
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[7], 14));
+  // H1 H2 H3 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[7], 2));
+}
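+
+// Equivalently (scalar sketch): A0..H3 are the horizontal pair sums, so the
+// outputs hold partial[1][k] = Sum over {i, j : i + j / 2 == k} of src[i][j]
+// for k = 0..10.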
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26  27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36  37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45  46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55  56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64  65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74  75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo,
+                                            __m128i* partial_hi) {
+  __m128i v_pair_add[4];
+  // Add vertical source pairs.
+  v_pair_add[0] = _mm_add_epi16(v_src[0], v_src[1]);
+  v_pair_add[1] = _mm_add_epi16(v_src[2], v_src[3]);
+  v_pair_add[2] = _mm_add_epi16(v_src[4], v_src[5]);
+  v_pair_add[3] = _mm_add_epi16(v_src[6], v_src[7]);
+
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  *partial_lo = v_pair_add[0];
+  // 00 00 00 00 00 00 00 00
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = _mm_setzero_si128();
+
+  // 00 20 21 22 23 24 25 26
+  // 00 30 31 32 33 34 35 36
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[1], 2));
+  // 27 00 00 00 00 00 00 00
+  // 37 00 00 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[1], 14));
+
+  // 00 00 40 41 42 43 44 45
+  // 00 00 50 51 52 53 54 55
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[2], 4));
+  // 46 47 00 00 00 00 00 00
+  // 56 57 00 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[2], 12));
+
+  // 00 00 00 60 61 62 63 64
+  // 00 00 00 70 71 72 73 74
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[3], 6));
+  // 65 66 67 00 00 00 00 00
+  // 75 76 77 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
+}
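+
+// Equivalently (scalar sketch): the vertical pair sums give
+// partial[7][k] = Sum over {i, j : i / 2 + j == k} of src[i][j] for k = 0..10.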
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+                                      ptrdiff_t stride, __m128i* partial_lo,
+                                      __m128i* partial_hi) {
+  // 8x8 input
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // 40 41 42 43 44 45 46 47
+  // 50 51 52 53 54 55 56 57
+  // 60 61 62 63 64 65 66 67
+  // 70 71 72 73 74 75 76 77
+  __m128i v_src[8];
+  for (auto& i : v_src) {
+    i = LoadLo8(src);
+    src += stride;
+  }
+
+  const __m128i v_zero = _mm_setzero_si128();
+  // partial for direction 2
+  // --------------------------------------------------------------------------
+  // partial[2][i] += x;
+  // 00 10 20 30 40 50 60 70  00 00 00 00 00 00 00 00
+  // 01 11 21 31 41 51 61 71  00 00 00 00 00 00 00 00
+  // 02 12 22 32 42 52 62 72  00 00 00 00 00 00 00 00
+  // 03 13 23 33 43 53 63 73  00 00 00 00 00 00 00 00
+  // 04 14 24 34 44 54 64 74  00 00 00 00 00 00 00 00
+  // 05 15 25 35 45 55 65 75  00 00 00 00 00 00 00 00
+  // 06 16 26 36 46 56 66 76  00 00 00 00 00 00 00 00
+  // 07 17 27 37 47 57 67 77  00 00 00 00 00 00 00 00
+  const __m128i v_src_4_0 = _mm_unpacklo_epi64(v_src[0], v_src[4]);
+  const __m128i v_src_5_1 = _mm_unpacklo_epi64(v_src[1], v_src[5]);
+  const __m128i v_src_6_2 = _mm_unpacklo_epi64(v_src[2], v_src[6]);
+  const __m128i v_src_7_3 = _mm_unpacklo_epi64(v_src[3], v_src[7]);
+  const __m128i v_hsum_4_0 = _mm_sad_epu8(v_src_4_0, v_zero);
+  const __m128i v_hsum_5_1 = _mm_sad_epu8(v_src_5_1, v_zero);
+  const __m128i v_hsum_6_2 = _mm_sad_epu8(v_src_6_2, v_zero);
+  const __m128i v_hsum_7_3 = _mm_sad_epu8(v_src_7_3, v_zero);
+  const __m128i v_hsum_1_0 = _mm_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m128i v_hsum_3_2 = _mm_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+  const __m128i v_hsum_5_4 = _mm_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m128i v_hsum_7_6 = _mm_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+  partial_lo[2] =
+      _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+                         _mm_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+  __m128i v_src_16[8];
+  for (int i = 0; i < 8; ++i) {
+    v_src_16[i] = _mm_cvtepu8_epi16(v_src[i]);
+  }
+
+  // partial for direction 6
+  // --------------------------------------------------------------------------
+  // partial[6][j] += x;
+  // 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00 00
+  // 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00 00
+  // 20 21 22 23 24 25 26 27  00 00 00 00 00 00 00 00
+  // 30 31 32 33 34 35 36 37  00 00 00 00 00 00 00 00
+  // 40 41 42 43 44 45 46 47  00 00 00 00 00 00 00 00
+  // 50 51 52 53 54 55 56 57  00 00 00 00 00 00 00 00
+  // 60 61 62 63 64 65 66 67  00 00 00 00 00 00 00 00
+  // 70 71 72 73 74 75 76 77  00 00 00 00 00 00 00 00
+  partial_lo[6] = v_src_16[0];
+  for (int i = 1; i < 8; ++i) {
+    partial_lo[6] = _mm_add_epi16(partial_lo[6], v_src_16[i]);
+  }
+
+  // partial for direction 0
+  AddPartial_D0_D4(v_src_16, &partial_lo[0], &partial_hi[0]);
+
+  // partial for direction 1
+  AddPartial_D1_D3(v_src_16, &partial_lo[1], &partial_hi[1]);
+
+  // partial for direction 7
+  AddPartial_D5_D7(v_src_16, &partial_lo[7], &partial_hi[7]);
+
+  __m128i v_src_reverse[8];
+  const __m128i reverser =
+      _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+  for (int i = 0; i < 8; ++i) {
+    v_src_reverse[i] = _mm_shuffle_epi8(v_src_16[i], reverser);
+  }
+
+  // partial for direction 4
+  AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+  // partial for direction 3
+  AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+  // partial for direction 5
+  AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
+inline uint32_t SumVector_S32(__m128i a) {
+  a = _mm_hadd_epi32(a, a);
+  a = _mm_add_epi32(a, _mm_srli_si128(a, 4));
+  return _mm_cvtsi128_si32(a);
+}
+
+// |cost[0]| and |cost[4]| square each element and sum it with the
+// corresponding element from the other end of the vector, weighting each pair
+// by the matching |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+//             kCdefDivisionTable[i];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[7];
+inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
+                         const __m128i division_table[2]) {
+  // Reverse and clear upper 2 bytes.
+  const __m128i reverser = _mm_set_epi32(static_cast<int>(0x80800100),
+                                         0x03020504, 0x07060908, 0x0b0a0d0c);
+  // 14 13 12 11 10 09 08 ZZ
+  const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+  // 00 14 01 13 02 12 03 11
+  const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+  // 04 10 05 09 06 08 07 ZZ
+  const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+  // Square(partial[0][i]) + Square(partial[0][14 - i])
+  const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+  const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+  const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+  const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+  return SumVector_S32(_mm_add_epi32(c, d));
+}
+
+inline uint32_t CostOdd(const __m128i a, const __m128i b,
+                        const __m128i division_table[2]) {
+  // Reverse and clear upper 10 bytes.
+  const __m128i reverser =
+      _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+                    static_cast<int>(0x80800100), 0x03020504);
+  // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+  const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+  // 00 10 01 09 02 08 03 ZZ
+  const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+  // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+  const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+  // Square(partial[0][i]) + Square(partial[0][10 - i])
+  const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+  const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+  const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+  const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+  return SumVector_S32(_mm_add_epi32(c, d));
+}
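+
+// In scalar terms (sketch), for an odd direction d:
+//   cost[d] = Sum_{j=0..2} (Square(partial[d][j]) + Square(partial[d][10 - j]))
+//                 * kCdefDivisionTableOddPadded[j]
+//           + Sum_{j=3..7} Square(partial[d][j]) * kCdefDivisionTable[7];
+// the padding in the table supplies the kCdefDivisionTable[7] weight for the
+// unpaired middle elements.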
+
+// Sum of squared elements.
+inline uint32_t SquareSum_S16(const __m128i a) {
+  const __m128i square = _mm_madd_epi16(a, a);
+  return SumVector_S32(square);
+}
+
+void CdefDirection_SSE4_1(const void* LIBGAV1_RESTRICT const source,
+                          ptrdiff_t stride,
+                          uint8_t* LIBGAV1_RESTRICT const direction,
+                          int* LIBGAV1_RESTRICT const variance) {
+  assert(direction != nullptr);
+  assert(variance != nullptr);
+  const auto* src = static_cast<const uint8_t*>(source);
+  uint32_t cost[8];
+  __m128i partial_lo[8], partial_hi[8];
+
+  AddPartial(src, stride, partial_lo, partial_hi);
+
+  cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]);
+  cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]);
+
+  const __m128i division_table[2] = {LoadUnaligned16(kCdefDivisionTable),
+                                     LoadUnaligned16(kCdefDivisionTable + 4)};
+
+  cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+  cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+  const __m128i division_table_odd[2] = {
+      LoadAligned16(kCdefDivisionTableOddPadded),
+      LoadAligned16(kCdefDivisionTableOddPadded + 4)};
+
+  cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
+  cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
+  cost[5] = CostOdd(partial_lo[5], partial_hi[5], division_table_odd);
+  cost[7] = CostOdd(partial_lo[7], partial_hi[7], division_table_odd);
+
+  uint32_t best_cost = 0;
+  *direction = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      *direction = i;
+    }
+  }
+  *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+                          const ptrdiff_t stride, __m128i* output,
+                          const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each set. For |direction| == 0 this gives a diagonal line
+  // from top right to bottom left. The first value is y, the second x. Negative
+  // y values move up.
+  //    a       b         c       d
+  // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+  //         c
+  //       a
+  //     0
+  //   b
+  // d
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+  output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+  output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+  output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t stride, __m128i* output,
+                    const int direction) {
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+                      src - y_0 * stride + stride - x_0);
+  output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+                      src + y_0 * stride + stride + x_0);
+  output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+                      src - y_1 * stride + stride - x_1);
+  output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+                      src + y_1 * stride + stride + x_1);
+}
+
+inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
+                         const __m128i& damping, const __m128i& threshold) {
+  const __m128i diff = _mm_sub_epi16(pixel, reference);
+  const __m128i abs_diff = _mm_abs_epi16(diff);
+  // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+  //                    0, std::abs(diff))
+  const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+  // larger than threshold, so subtracting with saturation returns 0 when
+  // pixel == kCdefLargeValue.
+  static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+  const __m128i thresh_minus_shifted_diff =
+      _mm_subs_epu16(threshold, shifted_diff);
+  const __m128i clamp_abs_diff =
+      _mm_min_epi16(thresh_minus_shifted_diff, abs_diff);
+  // Restore the sign.
+  return _mm_sign_epi16(clamp_abs_diff, diff);
+}
+
+inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
+                                    const __m128i& tap, const __m128i& damping,
+                                    const __m128i& threshold) {
+  const __m128i constrained = Constrain(val, pixel, damping, threshold);
+  return _mm_mullo_epi16(constrained, tap);
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_SSE4_1(const uint16_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride, const int height,
+                       const int primary_strength, const int secondary_strength,
+                       const int damping, const int direction,
+                       void* LIBGAV1_RESTRICT dest,
+                       const ptrdiff_t dst_stride) {
+  static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+  static_assert(enable_primary || enable_secondary, "");
+  constexpr bool clipping_required = enable_primary && enable_secondary;
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i primary_damping_shift, secondary_damping_shift;
+
+  // FloorLog2() requires input to be > 0.
+  // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+  if (enable_primary) {
+    // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+    // for UV filtering.
+    primary_damping_shift =
+        _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+  }
+  if (enable_secondary) {
+    // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+    // necessary.
+    assert(damping - FloorLog2(secondary_strength) >= 0);
+    secondary_damping_shift =
+        _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+  }
+
+  const __m128i primary_tap_0 =
+      _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][0]);
+  const __m128i primary_tap_1 =
+      _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]);
+  const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0);
+  const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1);
+  const __m128i cdef_large_value_mask =
+      _mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue));
+  const __m128i primary_threshold = _mm_set1_epi16(primary_strength);
+  const __m128i secondary_threshold = _mm_set1_epi16(secondary_strength);
+
+  int y = height;
+  do {
+    __m128i pixel;
+    if (width == 8) {
+      pixel = LoadUnaligned16(src);
+    } else {
+      pixel = LoadHi8(LoadLo8(src), src + src_stride);
+    }
+
+    __m128i min = pixel;
+    __m128i max = pixel;
+    __m128i sum;
+
+    if (enable_primary) {
+      // Primary |direction|.
+      __m128i primary_val[4];
+      if (width == 8) {
+        LoadDirection(src, src_stride, primary_val, direction);
+      } else {
+        LoadDirection4(src, src_stride, primary_val, direction);
+      }
+
+      if (clipping_required) {
+        min = _mm_min_epu16(min, primary_val[0]);
+        min = _mm_min_epu16(min, primary_val[1]);
+        min = _mm_min_epu16(min, primary_val[2]);
+        min = _mm_min_epu16(min, primary_val[3]);
+
+        // The source is 16 bits, but we only really care about the lower
+        // 8 bits; the upper 8 bits contain the "large" flag. After the final
+        // primary max has been calculated, zero out the upper 8 bits and use
+        // this to find the "16 bit" max.
+        const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]);
+        const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]);
+        const __m128i max_p = _mm_max_epu8(max_p01, max_p23);
+        max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask));
+      }
+
+      sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+                                 primary_damping_shift, primary_threshold);
+      sum = _mm_add_epi16(
+          sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0,
+                                    primary_damping_shift, primary_threshold));
+      sum = _mm_add_epi16(
+          sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1,
+                                    primary_damping_shift, primary_threshold));
+      sum = _mm_add_epi16(
+          sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1,
+                                    primary_damping_shift, primary_threshold));
+    } else {
+      sum = _mm_setzero_si128();
+    }
+
+    if (enable_secondary) {
+      // Secondary |direction| values (+/- 2). Clamp |direction|.
+      __m128i secondary_val[8];
+      if (width == 8) {
+        LoadDirection(src, src_stride, secondary_val, direction + 2);
+        LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+      } else {
+        LoadDirection4(src, src_stride, secondary_val, direction + 2);
+        LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+      }
+
+      if (clipping_required) {
+        min = _mm_min_epu16(min, secondary_val[0]);
+        min = _mm_min_epu16(min, secondary_val[1]);
+        min = _mm_min_epu16(min, secondary_val[2]);
+        min = _mm_min_epu16(min, secondary_val[3]);
+        min = _mm_min_epu16(min, secondary_val[4]);
+        min = _mm_min_epu16(min, secondary_val[5]);
+        min = _mm_min_epu16(min, secondary_val[6]);
+        min = _mm_min_epu16(min, secondary_val[7]);
+
+        const __m128i max_s01 =
+            _mm_max_epu8(secondary_val[0], secondary_val[1]);
+        const __m128i max_s23 =
+            _mm_max_epu8(secondary_val[2], secondary_val[3]);
+        const __m128i max_s45 =
+            _mm_max_epu8(secondary_val[4], secondary_val[5]);
+        const __m128i max_s67 =
+            _mm_max_epu8(secondary_val[6], secondary_val[7]);
+        const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23),
+                                           _mm_max_epu8(max_s45, max_s67));
+        max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask));
+      }
+
+      sum = _mm_add_epi16(
+          sum,
+          ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
+                               secondary_damping_shift, secondary_threshold));
+      sum = _mm_add_epi16(
+          sum,
+          ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0,
+                               secondary_damping_shift, secondary_threshold));
+      sum = _mm_add_epi16(
+          sum,
+          ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1,
+                               secondary_damping_shift, secondary_threshold));
+      sum = _mm_add_epi16(
+          sum,
+          ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
+                               secondary_damping_shift, secondary_threshold));
+      sum = _mm_add_epi16(
+          sum,
+          ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0,
+                               secondary_damping_shift, secondary_threshold));
+      sum = _mm_add_epi16(
+          sum,
+          ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0,
+                               secondary_damping_shift, secondary_threshold));
+      sum = _mm_add_epi16(
+          sum,
+          ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1,
+                               secondary_damping_shift, secondary_threshold));
+      sum = _mm_add_epi16(
+          sum,
+          ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1,
+                               secondary_damping_shift, secondary_threshold));
+    }
+    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
+    const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
+    // 8 + sum
+    sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
+    // (... - (sum < 0)) >> 4
+    sum = _mm_add_epi16(sum, sum_lt_0);
+    sum = _mm_srai_epi16(sum, 4);
+    // pixel + ...
+    sum = _mm_add_epi16(sum, pixel);
+    if (clipping_required) {
+      // Clip3
+      sum = _mm_min_epi16(sum, max);
+      sum = _mm_max_epi16(sum, min);
+    }
+
+    const __m128i result = _mm_packus_epi16(sum, sum);
+    if (width == 8) {
+      src += src_stride;
+      StoreLo8(dst, result);
+      dst += dst_stride;
+      --y;
+    } else {
+      src += src_stride << 1;
+      Store4(dst, result);
+      dst += dst_stride;
+      Store4(dst, _mm_srli_si128(result, 4));
+      dst += dst_stride;
+      y -= 2;
+    }
+  } while (y != 0);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+  dsp->cdef_direction = CdefDirection_SSE4_1;
+  dsp->cdef_filters[0][0] = CdefFilter_SSE4_1<4>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_SSE4_1<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] = CdefFilter_SSE4_1<4, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_SSE4_1<8>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_SSE4_1<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] = CdefFilter_SSE4_1<8, /*enable_primary=*/false>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/cdef_sse4.h b/src/dsp/x86/cdef_sse4.h
new file mode 100644 (file)
index 0000000..6631eb7
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
diff --git a/src/dsp/x86/common_avx2.h b/src/dsp/x86/common_avx2.h
new file mode 100644 (file)
index 0000000..373116a
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <immintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+namespace libgav1 {
+namespace dsp {
+namespace avx2 {
+
+#include "src/dsp/x86/common_avx2.inc"
+#include "src/dsp/x86/common_sse4.inc"
+
+}  // namespace avx2
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_avx2.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+
+// common_sse4.inc
+using avx2::Load2;
+using avx2::Load2x2;
+using avx2::Load4;
+using avx2::Load4x2;
+using avx2::LoadAligned16;
+using avx2::LoadAligned16Msan;
+using avx2::LoadHi8;
+using avx2::LoadHi8Msan;
+using avx2::LoadLo8;
+using avx2::LoadLo8Msan;
+using avx2::LoadUnaligned16;
+using avx2::LoadUnaligned16Msan;
+using avx2::MaskHighNBytes;
+using avx2::RightShiftWithRounding_S16;
+using avx2::RightShiftWithRounding_S32;
+using avx2::RightShiftWithRounding_U16;
+using avx2::RightShiftWithRounding_U32;
+using avx2::Store2;
+using avx2::Store4;
+using avx2::StoreAligned16;
+using avx2::StoreHi8;
+using avx2::StoreLo8;
+using avx2::StoreUnaligned16;
+
+// common_avx2.inc
+using avx2::LoadAligned32;
+using avx2::LoadAligned32Msan;
+using avx2::LoadAligned64;
+using avx2::LoadAligned64Msan;
+using avx2::LoadUnaligned32;
+using avx2::LoadUnaligned32Msan;
+using avx2::SetrM128i;
+using avx2::StoreAligned32;
+using avx2::StoreAligned64;
+using avx2::StoreUnaligned32;
+// NOLINTEND
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_AVX2
+#endif  // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
diff --git a/src/dsp/x86/common_avx2.inc b/src/dsp/x86/common_avx2.inc
new file mode 100644 (file)
index 0000000..53b4e2e
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Compatibility functions.
+
+inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
+  // For compatibility with older gcc toolchains (< 8) use
+  // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations
+  // are implemented similarly to the following, clang uses a different method
+  // but no differences in assembly have been observed.
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
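+
+// Usage sketch: SetrM128i(lo, hi) yields a 256-bit value with |lo| in the low
+// 128-bit lane and |hi| in the high lane, e.g.
+//   const __m256i v = SetrM128i(_mm_set1_epi16(1), _mm_set1_epi16(2));
+// produces eight 1s followed by eight 2s.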
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m256i LoadAligned32(const void* a) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  return _mm256_load_si256(static_cast<const __m256i*>(a));
+}
+
+inline void LoadAligned64(const void* a, __m256i dst[2]) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
+  dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
+}
+
+inline __m256i LoadUnaligned32(const void* a) {
+  return _mm256_loadu_si256(static_cast<const __m256i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m256i MaskOverreads(const __m256i source,
+                             const ptrdiff_t over_read_in_bytes) {
+  __m256i dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
+  if (over_read_in_bytes > 0) {
+    __m128i m = _mm_set1_epi8(-1);
+    for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
+      m = _mm_srli_si128(m, 1);
+    }
+    const __m256i mask = (over_read_in_bytes < 16)
+                             ? SetrM128i(_mm_set1_epi8(-1), m)
+                             : SetrM128i(m, _mm_setzero_si128());
+    dst = _mm256_and_si256(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
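+
+// Example (sketch): for a 32-byte load whose final 5 bytes lie past the valid
+// region, MaskOverreads(v, 5) keeps bytes [0, 27) and zeroes the rest, so
+// MSAN never sees the uninitialized tail propagate into later math.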
+
+inline __m256i LoadAligned32Msan(const void* const source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+}
+
+inline void LoadAligned64Msan(const void* const source,
+                              const ptrdiff_t over_read_in_bytes,
+                              __m256i dst[2]) {
+  dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes - 32);
+  dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
+                         over_read_in_bytes);
+}
+
+inline __m256i LoadUnaligned32Msan(const void* const source,
+                                   const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void StoreAligned32(void* a, const __m256i v) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  _mm256_store_si256(static_cast<__m256i*>(a), v);
+}
+
+inline void StoreAligned64(void* a, const __m256i v[2]) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
+  _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
+}
+
+inline void StoreUnaligned32(void* a, const __m256i v) {
+  _mm256_storeu_si256(static_cast<__m256i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
+  assert(bits <= 16);
+  const __m256i v_bias_d =
+      _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+  const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
+  return _mm256_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m256i RightShiftWithRounding_S32(const __m256i v_val_d, int bits) {
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << bits) >> 1);
+  const __m256i v_tmp_d = _mm256_add_epi32(v_val_d, v_bias_d);
+  return _mm256_srai_epi32(v_tmp_d, bits);
+}
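+
+// Note (scalar model, a sketch): RightShiftWithRounding_S16(v, bits) computes
+// (x + ((1 << bits) >> 1)) >> bits in int16_t arithmetic, so inputs above
+// INT16_MAX - ((1 << bits) >> 1) wrap before the shift; see
+// common_avx2_test.cc for the exact boundary behavior.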
diff --git a/src/dsp/x86/common_avx2_test.cc b/src/dsp/x86/common_avx2_test.cc
new file mode 100644 (file)
index 0000000..4b294b0
--- /dev/null
@@ -0,0 +1,73 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/common_avx2_test.h"
+
+#include "gtest/gtest.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <cstdint>
+
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Show that RightShiftWithRounding_S16() is equal to
+// RightShiftWithRounding() only for values less than or equal to
+// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
+// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
+// negative values.
+void AVX2RightShiftWithRoundingS16Test() {
+  for (int bits = 0; bits < 16; ++bits) {
+    const int bias = (1 << bits) >> 1;
+    for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
+      const __m256i v_val_d = _mm256_set1_epi16(value);
+      const __m256i v_result_d = RightShiftWithRounding_S16(v_val_d, bits);
+      // Note _mm256_extract_epi16 is avoided for compatibility with Visual
+      // Studio < 2017.
+      const int16_t result =
+          _mm_extract_epi16(_mm256_extracti128_si256(v_result_d, 0), 0);
+      const int32_t expected = RightShiftWithRounding(value, bits);
+      if (value <= INT16_MAX - bias) {
+        EXPECT_EQ(result, expected) << "value: " << value << ", bits: " << bits;
+      } else {
+        EXPECT_EQ(expected, 1 << (15 - bits));
+        EXPECT_EQ(result, -expected)
+            << "value: " << value << ", bits: " << bits;
+      }
+    }
+  }
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_AVX2
+
+namespace libgav1 {
+namespace dsp {
+
+void AVX2RightShiftWithRoundingS16Test() {
+  GTEST_SKIP() << "Build this module for x86(-64) with AVX2 enabled to enable "
+                  "the tests.";
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/common_avx2_test.h b/src/dsp/x86/common_avx2_test.h
new file mode 100644 (file)
index 0000000..1124f7f
--- /dev/null
@@ -0,0 +1,26 @@
+// Copyright 2023 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_TEST_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_TEST_H_
+
+namespace libgav1 {
+namespace dsp {
+
+void AVX2RightShiftWithRoundingS16Test();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_TEST_H_
diff --git a/src/dsp/x86/common_sse4.h b/src/dsp/x86/common_sse4.h
new file mode 100644 (file)
index 0000000..41a3a68
--- /dev/null
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#if 0
+#include <cinttypes>
+#include <cstdio>
+
+// Quite useful debugging helpers. Left here for convenience.
+inline void PrintReg(const __m128i r, const char* const name, int size) {
+  int n;
+  union {
+    __m128i r;
+    uint8_t i8[16];
+    uint16_t i16[8];
+    uint32_t i32[4];
+    uint64_t i64[2];
+  } tmp;
+  tmp.r = r;
+  fprintf(stderr, "%s\t: ", name);
+  if (size == 8) {
+    for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
+  } else if (size == 16) {
+    for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
+  } else if (size == 32) {
+    for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
+  } else {
+    for (n = 0; n < 2; ++n)
+      fprintf(stderr, "%.16" PRIx64 " ", static_cast<uint64_t>(tmp.i64[n]));
+  }
+  fprintf(stderr, "\n");
+}
+
+inline void PrintReg(const int r, const char* const name) {
+  fprintf(stderr, "%s: %d\n", name, r);
+}
+
+inline void PrintRegX(const int r, const char* const name) {
+  fprintf(stderr, "%s: %.8x\n", name, r);
+}
+
+#define PR(var, N) PrintReg(var, #var, N)
+#define PD(var) PrintReg(var, #var);
+#define PX(var) PrintRegX(var, #var);
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+
+inline void PrintShadow(const void* r, const char* const name,
+                        const size_t size) {
+  fprintf(stderr, "Shadow for %s:\n", name);
+  __msan_print_shadow(r, size);
+}
+#define PS(var, N) PrintShadow(var, #var, N)
+
+#endif  // LIBGAV1_MSAN
+
+#endif  // 0
+
+namespace libgav1 {
+namespace dsp {
+namespace sse4 {
+
+#include "src/dsp/x86/common_sse4.inc"
+
+}  // namespace sse4
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_sse4.cc files only. This scheme exists to distinguish
+// between two possible implementations of common functions, which may differ
+// based on whether the compiler is permitted to use AVX2 instructions.
+using sse4::Load2;
+using sse4::Load2x2;
+using sse4::Load4;
+using sse4::Load4x2;
+using sse4::LoadAligned16;
+using sse4::LoadAligned16Msan;
+using sse4::LoadHi8;
+using sse4::LoadHi8Msan;
+using sse4::LoadLo8;
+using sse4::LoadLo8Msan;
+using sse4::LoadUnaligned16;
+using sse4::LoadUnaligned16Msan;
+using sse4::MaskHighNBytes;
+using sse4::RightShiftWithRounding_S16;
+using sse4::RightShiftWithRounding_S32;
+using sse4::RightShiftWithRounding_U16;
+using sse4::RightShiftWithRounding_U32;
+using sse4::Store2;
+using sse4::Store4;
+using sse4::StoreAligned16;
+using sse4::StoreHi8;
+using sse4::StoreLo8;
+using sse4::StoreUnaligned16;
+// NOLINTEND
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+#endif  // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
diff --git a/src/dsp/x86/common_sse4.inc b/src/dsp/x86/common_sse4.inc
new file mode 100644 (file)
index 0000000..35c56b8
--- /dev/null
@@ -0,0 +1,206 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m128i Load2(const void* src) {
+  int16_t val;
+  memcpy(&val, src, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load2x2(const void* src1, const void* src2) {
+  uint16_t val1;
+  uint16_t val2;
+  memcpy(&val1, src1, sizeof(val1));
+  memcpy(&val2, src2, sizeof(val2));
+  return _mm_cvtsi32_si128(val1 | (val2 << 16));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
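+// For example (an illustrative sketch, not code from this file), four 2-byte
+// rows of a 2xH block can be packed into one register:
+//   __m128i v = Load2(src);
+//   v = Load2<1>(src + stride, v);
+//   v = Load2<2>(src + 2 * stride, v);
+//   v = Load2<3>(src + 3 * stride, v);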
+template <int lane>
+inline __m128i Load2(const void* const buf, __m128i val) {
+  int16_t temp;
+  memcpy(&temp, buf, 2);
+  return _mm_insert_epi16(val, temp, lane);
+}
+
+inline __m128i Load4(const void* src) {
+  // With newer compilers such as clang 8.0.0 we can use the _mm_loadu_si32
+  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+  // movss instruction.
+  //
+  // Until compiler support for _mm_loadu_si32 is widespread, its use is
+  // banned.
+  int val;
+  memcpy(&val, src, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load4x2(const void* src1, const void* src2) {
+  // With newer compilers such as clang 8.0.0 we can use the _mm_loadu_si32
+  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+  // movss instruction.
+  //
+  // Until compiler support for _mm_loadu_si32 is widespread, its use is
+  // banned.
+  int val1;
+  int val2;
+  memcpy(&val1, src1, sizeof(val1));
+  memcpy(&val2, src2, sizeof(val2));
+  return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
+}
+
+inline __m128i LoadLo8(const void* a) {
+  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadHi8(const __m128i v, const void* a) {
+  const __m128 x =
+      _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
+  return _mm_castps_si128(x);
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+  return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadAligned16(const void* a) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+  return _mm_load_si128(static_cast<const __m128i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m128i MaskOverreads(const __m128i source,
+                             const ptrdiff_t over_read_in_bytes) {
+  __m128i dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes > 0) {
+    __m128i mask = _mm_set1_epi8(-1);
+    for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
+      mask = _mm_srli_si128(mask, 1);
+    }
+    dst = _mm_and_si128(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
+
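+// For example (illustration): if the 8-byte LoadLo8() reads 3 bytes past the
+// end of a buffer, the poisoned data sits in byte lanes 5..7 of the 16-byte
+// register, so LoadLo8Msan() masks the top over_read_in_bytes + 8 == 11
+// bytes.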
+inline __m128i LoadLo8Msan(const void* const source,
+                           const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+}
+
+inline __m128i LoadHi8Msan(const __m128i v, const void* source,
+                           const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
+}
+
+inline __m128i LoadAligned16Msan(const void* const source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
+}
+
+inline __m128i LoadUnaligned16Msan(const void* const source,
+                                   const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void Store2(void* dst, const __m128i x) {
+  const int val = _mm_cvtsi128_si32(x);
+  memcpy(dst, &val, 2);
+}
+
+inline void Store4(void* dst, const __m128i x) {
+  const int val = _mm_cvtsi128_si32(x);
+  memcpy(dst, &val, sizeof(val));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+  _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreHi8(void* a, const __m128i v) {
+  _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
+}
+
+inline void StoreAligned16(void* a, const __m128i v) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+  _mm_store_si128(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+  _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
+  assert(bits <= 16);
+  // Shift out all but the last bit.
+  const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+  // Avg with zero will shift by 1 and round.
+  return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
+}
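+
+// A scalar model of the trick above (an illustrative sketch; the helper name
+// is hypothetical and the block is not compiled). _mm_avg_epu16(a, b)
+// computes (a + b + 1) >> 1 per lane, so shifting by bits - 1 and then
+// averaging with zero equals (v + (1 << (bits - 1))) >> bits:
+#if 0
+inline uint16_t RightShiftWithRounding_U16_Scalar(uint16_t v, int bits) {
+  assert(bits > 0 && bits <= 16);
+  // Shift out all but the last bit, then round it up into the result.
+  const uint32_t tmp = static_cast<uint32_t>(v) >> (bits - 1);
+  return static_cast<uint16_t>((tmp + 1) >> 1);
+}
+#endif  // 0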
+
+inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
+  assert(bits < 16);
+  const __m128i v_bias_d =
+      _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+  return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+// Use this when |bits| is not an immediate value.
+inline __m128i VariableRightShiftWithRounding_S32(const __m128i v_val_d,
+                                                  int bits) {
+  const __m128i v_bias_d =
+      _mm_set1_epi32(static_cast<int32_t>((1 << bits) >> 1));
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_sra_epi32(v_tmp_d, _mm_cvtsi32_si128(bits));
+}
+
+//------------------------------------------------------------------------------
+// Masking utilities.
+inline __m128i MaskHighNBytes(int n) {
+  static constexpr uint8_t kMask[32] = {
+      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+      0,   0,   0,   0,   0,   255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  };
+
+  return LoadUnaligned16(kMask + n);
+}
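+
+// Usage sketch (illustration only; the helper is hypothetical and the block
+// is not compiled): MaskHighNBytes(4) loads kMask + 4, i.e. twelve 0x00 bytes
+// followed by four 0xff bytes, so the AND keeps only the high 4 bytes of |v|.
+#if 0
+inline __m128i KeepHigh4Bytes(const __m128i v) {
+  return _mm_and_si128(v, MaskHighNBytes(4));
+}
+#endif  // 0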
diff --git a/src/dsp/x86/common_sse4_test.cc b/src/dsp/x86/common_sse4_test.cc
new file mode 100644 (file)
index 0000000..592630c
--- /dev/null
@@ -0,0 +1,70 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/common_sse4_test.h"
+
+#include "gtest/gtest.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <cstdint>
+
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Show that RightShiftWithRounding_S16() is equal to
+// RightShiftWithRounding() only for values less than or equal to
+// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
+// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
+// negative values.
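+//
+// For example, with bits == 1 (bias == 1) and value == INT16_MAX, the vector
+// code computes int16_t(32767 + 1) >> 1 == INT16_MIN >> 1 == -16384, i.e.
+// -(1 << (15 - bits)), while RightShiftWithRounding() returns 16384.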
+void SSE41RightShiftWithRoundingS16Test() {
+  for (int bits = 0; bits < 16; ++bits) {
+    const int bias = (1 << bits) >> 1;
+    for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
+      const __m128i v_val_d = _mm_set1_epi16(value);
+      const __m128i v_result_d = RightShiftWithRounding_S16(v_val_d, bits);
+      const int16_t result = _mm_extract_epi16(v_result_d, 0);
+      const int32_t expected = RightShiftWithRounding(value, bits);
+      if (value <= INT16_MAX - bias) {
+        EXPECT_EQ(result, expected) << "value: " << value << ", bits: " << bits;
+      } else {
+        EXPECT_EQ(expected, 1 << (15 - bits));
+        EXPECT_EQ(result, -expected)
+            << "value: " << value << ", bits: " << bits;
+      }
+    }
+  }
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void SSE41RightShiftWithRoundingS16Test() {
+  GTEST_SKIP() << "Build this module for x86(-64) with SSE4 enabled to enable "
+                  "the tests.";
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/common_sse4_test.h b/src/dsp/x86/common_sse4_test.h
new file mode 100644 (file)
index 0000000..169439a
--- /dev/null
@@ -0,0 +1,26 @@
+// Copyright 2023 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_TEST_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_TEST_H_
+
+namespace libgav1 {
+namespace dsp {
+
+void SSE41RightShiftWithRoundingS16Test();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_TEST_H_
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
new file mode 100644 (file)
index 0000000..ff51aee
--- /dev/null
@@ -0,0 +1,1496 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/x86/convolve_sse4.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from exceeding the range of int16_t.
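+// (Illustrative bound: the positive taps of an AV1 sub-pixel filter can sum
+// to more than 128, so with unhalved taps a row of 255s could push a partial
+// sum past INT16_MAX; halving keeps every partial sum in range, and the final
+// shift is reduced by 1 to compensate.)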
+template <int num_taps>
+__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
+  __m256i sum;
+  if (num_taps == 6) {
+    // 6 taps.
+    const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]);  // k2k1
+    const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]);  // k4k3
+    const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]);  // k6k5
+    sum = _mm256_add_epi16(v_madd_21, v_madd_43);
+    sum = _mm256_add_epi16(sum, v_madd_65);
+  } else if (num_taps == 8) {
+    // 8 taps.
+    const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]);  // k1k0
+    const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]);  // k3k2
+    const __m256i v_madd_54 = _mm256_maddubs_epi16(src[2], taps[2]);  // k5k4
+    const __m256i v_madd_76 = _mm256_maddubs_epi16(src[3], taps[3]);  // k7k6
+    const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
+    const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
+    sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
+  } else if (num_taps == 2) {
+    // 2 taps.
+    sum = _mm256_maddubs_epi16(src[0], taps[0]);  // k4k3
+  } else {
+    // 4 taps.
+    const __m256i v_madd_32 = _mm256_maddubs_epi16(src[0], taps[0]);  // k3k2
+    const __m256i v_madd_54 = _mm256_maddubs_epi16(src[1], taps[1]);  // k5k4
+    sum = _mm256_add_epi16(v_madd_32, v_madd_54);
+  }
+  return sum;
+}
+
+template <int num_taps>
+__m256i SumHorizontalTaps(const __m256i* const src,
+                          const __m256i* const v_tap) {
+  __m256i v_src[4];
+  const __m256i src_long = *src;
+  const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
+  const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
+
+  if (num_taps == 6) {
+    // 6 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3);   // _21
+    v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);   // _43
+    v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11);  // _65
+  } else if (num_taps == 8) {
+    // 8 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1);   // _10
+    v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);   // _32
+    v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);   // _54
+    v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13);  // _76
+  } else if (num_taps == 2) {
+    // 2 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);  // _43
+  } else {
+    // 4 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);  // _32
+    v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);  // _54
+  }
+  return SumOnePassTaps<num_taps>(v_src, v_tap);
+}
+
+template <int num_taps>
+__m256i SimpleHorizontalTaps(const __m256i* const src,
+                             const __m256i* const v_tap) {
+  __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
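+  //
+  // Worked arithmetic (assuming the 8-bit path values
+  // kInterRoundBitsHorizontal == 3 and kFilterBits == 7): the two-stage form
+  // (((sum + 2) >> 2) + 8) >> 4 equals the single shift (sum + 2 + 32) >> 6,
+  // so adding first_shift_rounding_bit == 1 << 1 before
+  // RightShiftWithRounding_S16(sum, kFilterBits - 1) reproduces both rounding
+  // shifts exactly.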
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = _mm256_add_epi16(sum, _mm256_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm256_packus_epi16(sum, sum);
+}
+
+template <int num_taps>
+__m256i HorizontalTaps8To16(const __m256i* const src,
+                            const __m256i* const v_tap) {
+  const __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+// Filter 2xh sizes.
+template <int num_taps, bool is_2d = false, bool is_compound = false>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int /*width*/,
+                      const int height, const __m128i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  // Horizontal passes only need to account for 2- and 4-tap filters when
+  // |width| <= 4.
+  assert(num_taps <= 4);
+  if (num_taps <= 4) {
+    if (!is_compound) {
+      int y = height;
+      if (is_2d) y -= 1;
+      do {
+        if (is_2d) {
+          const __m128i sum =
+              HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
+          Store4(&dest16[0], sum);
+          dest16 += pred_stride;
+          Store4(&dest16[0], _mm_srli_si128(sum, 8));
+          dest16 += pred_stride;
+        } else {
+          const __m128i sum =
+              SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
+          Store2(dest8, sum);
+          dest8 += pred_stride;
+          Store2(dest8, _mm_srli_si128(sum, 4));
+          dest8 += pred_stride;
+        }
+
+        src += src_stride << 1;
+        y -= 2;
+      } while (y != 0);
+
+      // The 2d filters have an odd |height| because the horizontal pass
+      // generates context for the vertical pass.
+      if (is_2d) {
+        assert(height % 2 == 1);
+        __m128i sum;
+        const __m128i input = LoadLo8(&src[2]);
+        if (num_taps == 2) {
+          // 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_43 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+          sum = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+        } else {
+          // 02 03 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_32 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+          // 04 05 05 06 06 07 07 08 ...
+          const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+          const __m128i v_madd_32 =
+              _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+          const __m128i v_madd_54 =
+              _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+          sum = _mm_add_epi16(v_madd_54, v_madd_32);
+        }
+        sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+        Store4(dest16, sum);
+      }
+    }
+  }
+}
+
+// Filter widths >= 4.
+template <int num_taps, bool is_2d = false, bool is_compound = false>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const __m256i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  if (width >= 32) {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        if (is_2d || is_compound) {
+          // Load into two 128-bit lanes.
+          const __m256i src_long =
+              SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
+          const __m256i result =
+              HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+          const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]),
+                                              LoadUnaligned16(&src[x + 24]));
+          const __m256i result2 =
+              HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
+          if (is_2d) {
+            StoreAligned32(&dest16[x], result);
+            StoreAligned32(&dest16[x + 16], result2);
+          } else {
+            StoreUnaligned32(&dest16[x], result);
+            StoreUnaligned32(&dest16[x + 16], result2);
+          }
+        } else {
+          // Load src used to calculate dest8[7:0] and dest8[23:16].
+          const __m256i src_long = LoadUnaligned32(&src[x]);
+          const __m256i result =
+              SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
+          // Load src used to calculate dest8[15:8] and dest8[31:24].
+          const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
+          const __m256i result2 =
+              SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
+          // Combine results and store.
+          StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
+        }
+        x += 32;
+      } while (x < width);
+      src += src_stride;
+      dest8 += pred_stride;
+      dest16 += pred_stride;
+    } while (--y != 0);
+  } else if (width == 16) {
+    int y = height;
+    if (is_2d) y -= 1;
+    do {
+      if (is_2d || is_compound) {
+        // Load into two 128-bit lanes.
+        const __m256i src_long =
+            SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+        const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+        const __m256i src_long2 =
+            SetrM128i(LoadUnaligned16(&src[src_stride]),
+                      LoadUnaligned16(&src[8 + src_stride]));
+        const __m256i result2 =
+            HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
+        if (is_2d) {
+          StoreAligned32(&dest16[0], result);
+          StoreAligned32(&dest16[pred_stride], result2);
+        } else {
+          StoreUnaligned32(&dest16[0], result);
+          StoreUnaligned32(&dest16[pred_stride], result2);
+        }
+      } else {
+        // Load into two 128-bit lanes.
+        const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
+                                           LoadUnaligned16(&src[src_stride]));
+        const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
+        const __m256i src_long2 = SetrM128i(
+            LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
+        const __m256i result2 =
+            SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
+        const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
+        StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
+        StoreUnaligned16(&dest8[pred_stride],
+                         _mm256_extracti128_si256(packed_result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+
+    // The 2d filters have an odd |height| during the horizontal pass, so
+    // filter the remaining row.
+    if (is_2d) {
+      const __m256i src_long =
+          SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+      const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+      StoreAligned32(&dest16[0], result);
+    }
+
+  } else if (width == 8) {
+    int y = height;
+    if (is_2d) y -= 1;
+    do {
+      // Load into two 128-bit lanes.
+      const __m128i this_row = LoadUnaligned16(&src[0]);
+      const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+      const __m256i src_long = SetrM128i(this_row, next_row);
+      if (is_2d || is_compound) {
+        const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+        if (is_2d) {
+          StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+          StoreAligned16(&dest16[pred_stride],
+                         _mm256_extracti128_si256(result, 1));
+        } else {
+          StoreUnaligned16(&dest16[0], _mm256_castsi256_si128(result));
+          StoreUnaligned16(&dest16[pred_stride],
+                           _mm256_extracti128_si256(result, 1));
+        }
+      } else {
+        const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
+        StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
+        StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+
+    // The 2d filters have an odd |height| during the horizontal pass, so
+    // filter the remaining row.
+    if (is_2d) {
+      const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+      const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+      StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+    }
+
+  } else {  // width == 4
+    int y = height;
+    if (is_2d) y -= 1;
+    do {
+      // Load into two 128-bit lanes.
+      const __m128i this_row = LoadUnaligned16(&src[0]);
+      const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+      const __m256i src_long = SetrM128i(this_row, next_row);
+      if (is_2d || is_compound) {
+        const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+        StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+        StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
+      } else {
+        const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
+        Store4(&dest8[0], _mm256_castsi256_si128(result));
+        Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+
+    // The 2d filters have an odd |height| during the horizontal pass, so
+    // filter the remaining row.
+    if (is_2d) {
+      const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+      const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+      StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+    }
+  }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+                                     __m256i* v_tap) {
+  if (num_taps == 8) {
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(*filter);                      // k1k0
+      v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4));   // k3k2
+      v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8));   // k5k4
+      v_tap[3] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 12));  // k7k6
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(*filter);                     // k1k0
+      v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2));  // k3k2
+      v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4));  // k5k4
+      v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6));  // k7k6
+    }
+  } else if (num_taps == 6) {
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 2));   // k2k1
+      v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6));   // k4k3
+      v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 10));  // k6k5
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1));  // k2k1
+      v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3));  // k4k3
+      v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5));  // k6k5
+    }
+  } else if (num_taps == 4) {
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4));  // k3k2
+      v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8));  // k5k4
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2));  // k3k2
+      v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4));  // k5k4
+    }
+  } else {  // num_taps == 2
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6));  // k4k3
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3));  // k4k3
+    }
+  }
+}
+
+template <int num_taps, bool is_compound>
+__m256i SimpleSum2DVerticalTaps(const __m256i* const src,
+                                const __m256i* const taps) {
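+  // The unpacklo/unpackhi steps interleave two source rows 16 bits at a
+  // time, so each _mm256_madd_epi16() lane computes
+  // row0[j] * tap_even + row1[j] * tap_odd as a single 32-bit sum, i.e. two
+  // filter taps per multiply-add.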
+  __m256i sum_lo =
+      _mm256_madd_epi16(_mm256_unpacklo_epi16(src[0], src[1]), taps[0]);
+  __m256i sum_hi =
+      _mm256_madd_epi16(_mm256_unpackhi_epi16(src[0], src[1]), taps[0]);
+  if (num_taps >= 4) {
+    __m256i madd_lo =
+        _mm256_madd_epi16(_mm256_unpacklo_epi16(src[2], src[3]), taps[1]);
+    __m256i madd_hi =
+        _mm256_madd_epi16(_mm256_unpackhi_epi16(src[2], src[3]), taps[1]);
+    sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+    sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+    if (num_taps >= 6) {
+      madd_lo =
+          _mm256_madd_epi16(_mm256_unpacklo_epi16(src[4], src[5]), taps[2]);
+      madd_hi =
+          _mm256_madd_epi16(_mm256_unpackhi_epi16(src[4], src[5]), taps[2]);
+      sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+      sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+      if (num_taps == 8) {
+        madd_lo =
+            _mm256_madd_epi16(_mm256_unpacklo_epi16(src[6], src[7]), taps[3]);
+        madd_hi =
+            _mm256_madd_epi16(_mm256_unpackhi_epi16(src[6], src[7]), taps[3]);
+        sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+        sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+      }
+    }
+  }
+
+  if (is_compound) {
+    return _mm256_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+
+  return _mm256_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical16xH(const uint16_t* LIBGAV1_RESTRICT src,
+                          void* LIBGAV1_RESTRICT const dst,
+                          const ptrdiff_t dst_stride, const int width,
+                          const int height, const __m256i* const taps) {
+  assert(width >= 8);
+  constexpr int next_row = num_taps - 1;
+  // The Horizontal pass uses |width| as the stride of the intermediate buffer.
+  const ptrdiff_t src_stride = width;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int x = 0;
+  do {
+    __m256i srcs[8];
+    const uint16_t* src_x = src + x;
+    srcs[0] = LoadAligned32(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadAligned32(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadAligned32(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadAligned32(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadAligned32(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadAligned32(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadAligned32(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    auto* dst8_x = dst8 + x;
+    auto* dst16_x = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = LoadAligned32(src_x);
+      src_x += src_stride;
+
+      const __m256i sum =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+      if (is_compound) {
+        StoreUnaligned32(dst16_x, sum);
+        dst16_x += dst_stride;
+      } else {
+        const __m128i packed_sum = _mm_packus_epi16(
+            _mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+        StoreUnaligned16(dst8_x, packed_sum);
+        dst8_x += dst_stride;
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (--y != 0);
+    x += 16;
+  } while (x < width);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
+  assert(filter_id != 0);
+  __m128i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if ((filter_index & 0x4) != 0) {  // 4 tap.
+    // ((filter_index == 4) | (filter_index == 5))
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
+  assert(filter_id != 0);
+  __m256i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if (filter_index == 2) {  // 8 tap.
+    SetupTaps<8>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else if (filter_index == 1 || filter_index == 0) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else if ((filter_index & 0x4) != 0) {  // 4 tap.
+    // ((filter_index == 4) | (filter_index == 5))
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  }
+}
+
+void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
+                     const ptrdiff_t reference_stride,
+                     const int horizontal_filter_index,
+                     const int vertical_filter_index,
+                     const int horizontal_filter_id,
+                     const int vertical_filter_id, const int width,
+                     const int height, void* LIBGAV1_RESTRICT prediction,
+                     const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  alignas(32) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set to an arbitrary non-zero value to aid debugging.
+  memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
+  const int intermediate_height = height + vertical_taps - 1;
+
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+  if (width > 2) {
+    DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result,
+                                     width, width, intermediate_height,
+                                     horizontal_filter_id, horiz_filter_index);
+  } else {
+    // Use the non-AVX2 version for smaller widths.
+    DoHorizontalPass2xH</*is_2d=*/true>(
+        src, src_stride, intermediate_result, width, width, intermediate_height,
+        horizontal_filter_id, horiz_filter_index);
+  }
+
+  // Vertical filter.
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  // Use 256 bits for width > 8.
+  if (width > 8) {
+    __m256i taps_256[4];
+    const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
+
+    if (vertical_taps == 8) {
+      SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<8>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    } else if (vertical_taps == 6) {
+      SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<6>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    } else if (vertical_taps == 4) {
+      SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<4>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    } else {  // |vertical_taps| == 2
+      SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<2>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    }
+  } else {  // width <= 8
+    __m128i taps[4];
+    // Use 128-bit code.
+    if (vertical_taps == 8) {
+      SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<8>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    } else if (vertical_taps == 6) {
+      SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<6>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    } else if (vertical_taps == 4) {
+      SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<4>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    } else {  // |vertical_taps| == 2
+      SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<2>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    }
+  }
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
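+// With the halved filters used throughout this file, the shift applied below
+// is reduced by 1 to compensate for the taps' pre-shift.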
+__m256i Compound1DShift(const __m256i sum) {
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, bool unpack_high = false>
+__m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
+  __m256i v_src[4];
+
+  if (!unpack_high) {
+    if (num_taps == 6) {
+      // 6 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+    } else if (num_taps == 8) {
+      // 8 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+      v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
+    } else if (num_taps == 2) {
+      // 2 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+    } else {
+      // 4 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+    }
+  } else {
+    if (num_taps == 6) {
+      // 6 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+    } else if (num_taps == 8) {
+      // 8 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+      v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
+    } else if (num_taps == 2) {
+      // 2 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+    } else {
+      // 4 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+    }
+  }
+  return SumOnePassTaps<num_taps>(v_src, v_tap);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
+                        const ptrdiff_t src_stride,
+                        void* LIBGAV1_RESTRICT const dst,
+                        const ptrdiff_t dst_stride, const int width,
+                        const int height, const __m256i* const v_tap) {
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 32);
+  int x = 0;
+  do {
+    const uint8_t* src_x = src + x;
+    __m256i srcs[8];
+    srcs[0] = LoadUnaligned32(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadUnaligned32(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadUnaligned32(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadUnaligned32(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadUnaligned32(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadUnaligned32(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadUnaligned32(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    auto* dst8_x = dst8 + x;
+    auto* dst16_x = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = LoadUnaligned32(src_x);
+      src_x += src_stride;
+
+      const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+      const __m256i sums_hi =
+          SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
+      if (is_compound) {
+        const __m256i results =
+            Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+        const __m256i results_hi =
+            Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+        StoreUnaligned32(dst16_x, results);
+        StoreUnaligned32(dst16_x + 16, results_hi);
+        dst16_x += dst_stride;
+      } else {
+        const __m256i results =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m256i results_hi =
+            RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+        const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
+
+        StoreUnaligned32(dst8_x, packed_results);
+        dst8_x += dst_stride;
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (--y != 0);
+    x += 32;
+  } while (x < width);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
+                        const ptrdiff_t src_stride,
+                        void* LIBGAV1_RESTRICT const dst,
+                        const ptrdiff_t dst_stride, const int /*width*/,
+                        const int height, const __m256i* const v_tap) {
+  const int next_row = num_taps;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  const uint8_t* src_x = src;
+  __m256i srcs[8 + 1];
+  // The upper 128 bits hold the filter data for the next row.
+  srcs[0] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+  src_x += src_stride;
+  if (num_taps >= 4) {
+    srcs[1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+    srcs[0] =
+        _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+    srcs[2] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+    srcs[1] =
+        _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+    if (num_taps >= 6) {
+      srcs[3] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+      src_x += src_stride;
+      srcs[2] =
+          _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+      srcs[4] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+      src_x += src_stride;
+      srcs[3] =
+          _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+      if (num_taps == 8) {
+        srcs[5] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+        src_x += src_stride;
+        srcs[4] = _mm256_inserti128_si256(srcs[4],
+                                          _mm256_castsi256_si128(srcs[5]), 1);
+        srcs[6] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+        src_x += src_stride;
+        srcs[5] = _mm256_inserti128_si256(srcs[5],
+                                          _mm256_castsi256_si128(srcs[6]), 1);
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row - 1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 2] = _mm256_inserti128_si256(
+        srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+    srcs[next_row] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 1] = _mm256_inserti128_si256(
+        srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+    const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+    const __m256i sums_hi =
+        SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
+    if (is_compound) {
+      const __m256i results =
+          Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+      const __m256i results_hi =
+          Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+
+      StoreUnaligned32(dst16, results);
+      StoreUnaligned32(dst16 + dst_stride, results_hi);
+      dst16 += dst_stride << 1;
+    } else {
+      const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m256i results_hi =
+          RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+      const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
+      const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+      const __m128i next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+      StoreUnaligned16(dst8, this_dst);
+      StoreUnaligned16(dst8 + dst_stride, next_dst);
+      dst8 += dst_stride << 1;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int /*width*/,
+                       const int height, const __m256i* const v_tap) {
+  const int next_row = num_taps;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  const uint8_t* src_x = src;
+  __m256i srcs[8 + 1];
+  // The upper 128 bits hold the filter data for the next row.
+  srcs[0] = _mm256_castsi128_si256(LoadLo8(src_x));
+  src_x += src_stride;
+  if (num_taps >= 4) {
+    srcs[1] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+    srcs[0] =
+        _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+    srcs[2] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+    srcs[1] =
+        _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+    if (num_taps >= 6) {
+      srcs[3] = _mm256_castsi128_si256(LoadLo8(src_x));
+      src_x += src_stride;
+      srcs[2] =
+          _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+      srcs[4] = _mm256_castsi128_si256(LoadLo8(src_x));
+      src_x += src_stride;
+      srcs[3] =
+          _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+      if (num_taps == 8) {
+        srcs[5] = _mm256_castsi128_si256(LoadLo8(src_x));
+        src_x += src_stride;
+        srcs[4] = _mm256_inserti128_si256(srcs[4],
+                                          _mm256_castsi256_si128(srcs[5]), 1);
+        srcs[6] = _mm256_castsi128_si256(LoadLo8(src_x));
+        src_x += src_stride;
+        srcs[5] = _mm256_inserti128_si256(srcs[5],
+                                          _mm256_castsi256_si128(srcs[6]), 1);
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row - 1] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 2] = _mm256_inserti128_si256(
+        srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+    srcs[next_row] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 1] = _mm256_inserti128_si256(
+        srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+    const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+    if (is_compound) {
+      const __m256i results = Compound1DShift(sums);
+      const __m128i this_dst = _mm256_castsi256_si128(results);
+      const __m128i next_dst = _mm256_extracti128_si256(results, 1);
+
+      StoreUnaligned16(dst16, this_dst);
+      StoreUnaligned16(dst16 + dst_stride, next_dst);
+      dst16 += dst_stride << 1;
+    } else {
+      const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m256i packed_results = _mm256_packus_epi16(results, results);
+      const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+      const __m128i next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+      StoreLo8(dst8, this_dst);
+      StoreLo8(dst8 + dst_stride, next_dst);
+      dst8 += dst_stride << 1;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int /*width*/,
+                       const int height, const __m128i* const v_tap) {
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  const uint8_t* src_x = src;
+  __m128i srcs[8];
+  srcs[0] = LoadLo8(src_x);
+  src_x += src_stride;
+  if (num_taps >= 4) {
+    srcs[1] = LoadLo8(src_x);
+    src_x += src_stride;
+    srcs[2] = LoadLo8(src_x);
+    src_x += src_stride;
+    if (num_taps >= 6) {
+      srcs[3] = LoadLo8(src_x);
+      src_x += src_stride;
+      srcs[4] = LoadLo8(src_x);
+      src_x += src_stride;
+      if (num_taps == 8) {
+        srcs[5] = LoadLo8(src_x);
+        src_x += src_stride;
+        srcs[6] = LoadLo8(src_x);
+        src_x += src_stride;
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row] = LoadLo8(src_x);
+    src_x += src_stride;
+
+    const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+    if (is_compound) {
+      const __m128i results = Compound1DShift(sums);
+      StoreUnaligned16(dst16, results);
+      dst16 += dst_stride;
+    } else {
+      const __m128i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      StoreLo8(dst8, _mm_packus_epi16(results, results));
+      dst8 += dst_stride;
+    }
+
+    srcs[0] = srcs[1];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[2];
+      srcs[2] = srcs[3];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[4];
+        srcs[4] = srcs[5];
+        if (num_taps == 8) {
+          srcs[5] = srcs[6];
+          srcs[6] = srcs[7];
+        }
+      }
+    }
+  } while (--y != 0);
+}
+
+void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
+                           const ptrdiff_t reference_stride,
+                           const int /*horizontal_filter_index*/,
+                           const int vertical_filter_index,
+                           const int /*horizontal_filter_id*/,
+                           const int vertical_filter_id, const int width,
+                           const int height, void* LIBGAV1_RESTRICT prediction,
+                           const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
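+  // Position |src| at the top row of the filter window: the window spans
+  // (vertical_taps / 2 - 1) rows above the output row and vertical_taps / 2
+  // rows below it.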
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+  // Use 256 bits for width > 4.
+  if (width > 4) {
+    __m256i taps_256[4];
+    if (vertical_taps == 6) {  // 6 tap.
+      SetupTaps<6>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<6>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<6>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      } else {
+        FilterVertical32xH<6>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      }
+    } else if (vertical_taps == 8) {  // 8 tap.
+      SetupTaps<8>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<8>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<8>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      } else {
+        FilterVertical32xH<8>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      }
+    } else if (vertical_taps == 2) {  // 2 tap.
+      SetupTaps<2>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      } else {
+        FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      }
+    } else {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<4>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      } else {
+        FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      }
+    }
+  } else {  // width <= 4
+    // Use 128 bit code.
+    __m128i taps[4];
+
+    if (vertical_taps == 6) {  // 6 tap.
+      SetupTaps<6>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
+      } else {
+        FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
+      }
+    } else if (vertical_taps == 8) {  // 8 tap.
+      SetupTaps<8>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
+      } else {
+        FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
+      }
+    } else if (vertical_taps == 2) {  // 2 tap.
+      SetupTaps<2>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+      } else {
+        FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+      }
+    } else {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+      } else {
+        FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+      }
+    }
+  }
+}
+
+void ConvolveCompoundVertical_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = width;
+  assert(vertical_filter_id != 0);
+
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+  // Use 256 bits for width > 4.
+  if (width > 4) {
+    __m256i taps_256[4];
+    if (vertical_taps == 6) {  // 6 tap.
+      SetupTaps<6>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<6, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<6, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else {
+        FilterVertical32xH<6, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      }
+    } else if (vertical_taps == 8) {  // 8 tap.
+      SetupTaps<8>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<8, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<8, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else {
+        FilterVertical32xH<8, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      }
+    } else if (vertical_taps == 2) {  // 2 tap.
+      SetupTaps<2>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<2, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<2, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else {
+        FilterVertical32xH<2, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      }
+    } else {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<4, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<4, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else {
+        FilterVertical32xH<4, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      }
+    }
+  } else {  // width <= 4
+    // Use 128 bit code.
+    __m128i taps[4];
+
+    if (vertical_taps == 6) {  // 6 tap.
+      SetupTaps<6>(&v_filter, taps);
+      FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
+    } else if (vertical_taps == 8) {  // 8 tap.
+      SetupTaps<8>(&v_filter, taps);
+      FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
+    } else if (vertical_taps == 2) {  // 2 tap.
+      SetupTaps<2>(&v_filter, taps);
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
+    } else {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps);
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
+    }
+  }
+}
+
+void ConvolveHorizontal_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width > 2) {
+    DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                     horizontal_filter_id, filter_index);
+  } else {
+    // Use the non-AVX2 version for smaller widths.
+    DoHorizontalPass2xH(src, reference_stride, dest, pred_stride, width, height,
+                        horizontal_filter_id, filter_index);
+  }
+}
+
+void ConvolveCompoundHorizontal_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint8_t*>(prediction);
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+
+#ifdef NDEBUG
+  // Quiet the unused variable warning when asserts are compiled out.
+  (void)pred_stride;
+#endif
+
+  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+      src, reference_stride, dest, width, width, height, horizontal_filter_id,
+      filter_index);
+}
+
+void ConvolveCompound2D_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  alignas(32) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+  memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
+  const int intermediate_height = height + vertical_taps - 1;
+
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  // Use 256 bits for width > 8.
+  if (width > 8) {
+    __m256i taps_256[4];
+    const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
+
+    if (vertical_taps == 8) {
+      SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<8, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps_256);
+    } else if (vertical_taps == 6) {
+      SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<6, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps_256);
+    } else if (vertical_taps == 4) {
+      SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<4, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps_256);
+    } else {  // |vertical_taps| == 2
+      SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<2, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps_256);
+    }
+  } else {  // width <= 8
+    __m128i taps[4];
+    // Use 128 bit code.
+    if (vertical_taps == 8) {
+      SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 4) {
+        Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+                                                     dest_stride, height, taps);
+      } else {
+        Filter2DVertical<8, /*is_compound=*/true>(
+            intermediate_result, dest, dest_stride, width, height, taps);
+      }
+    } else if (vertical_taps == 6) {
+      SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 4) {
+        Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+                                                     dest_stride, height, taps);
+      } else {
+        Filter2DVertical<6, /*is_compound=*/true>(
+            intermediate_result, dest, dest_stride, width, height, taps);
+      }
+    } else if (vertical_taps == 4) {
+      SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 4) {
+        Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+                                                     dest_stride, height, taps);
+      } else {
+        Filter2DVertical<4, /*is_compound=*/true>(
+            intermediate_result, dest, dest_stride, width, height, taps);
+      }
+    } else {  // |vertical_taps| == 2
+      SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 4) {
+        Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+                                                     dest_stride, height, taps);
+      } else {
+        Filter2DVertical<2, /*is_compound=*/true>(
+            intermediate_result, dest, dest_stride, width, height, taps);
+      }
+    }
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
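+  // The convolve table is indexed as convolve[is_intra_block_copy]
+  // [is_compound][has_vertical_filter][has_horizontal_filter].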
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_AVX2;
+  dsp->convolve[0][0][1][1] = Convolve2D_AVX2;
+
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_AVX2;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_AVX2;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_AVX2;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_AVX2() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/convolve_avx2.h b/src/dsp/x86/convolve_avx2.h
new file mode 100644 (file)
index 0000000..e509bc9
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve, see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_AVX2();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the avx2 implementation should be
+// used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_AVX2
+#endif
+
+#endif  // LIBGAV1_TARGETING_AVX2
+
+#endif  // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
new file mode 100644 (file)
index 0000000..99b87d6
--- /dev/null
@@ -0,0 +1,1905 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/x86/convolve_sse4.inc"
+
+template <int num_taps>
+__m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
+                          const __m128i* const v_tap) {
+  __m128i v_src[4];
+  const __m128i src_long = LoadUnaligned16(src);
+  const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
+  const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
+
+  if (num_taps == 6) {
+    // 6 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3);   // _21
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);   // _43
+    v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11);  // _65
+  } else if (num_taps == 8) {
+    // 8 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1);   // _10
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);   // _32
+    v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);   // _54
+    v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13);  // _76
+  } else if (num_taps == 2) {
+    // 2 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);  // _43
+  } else {
+    // 4 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);  // _32
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);  // _54
+  }
+  const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+  return sum;
+}
+
+template <int num_taps>
+__m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
+                             const __m128i* const v_tap) {
+  __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
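+  // With the 8 bpp constants (kInterRoundBitsHorizontal == 3 and kFilterBits
+  // == 7) and halved taps, the two passes would be (sum + 2) >> 2 followed by
+  // (x + 8) >> 4; the combined form below computes (sum + 2 + 32) >> 6, where
+  // the extra 2 is the rounding offset of the skipped first shift.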
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm_packus_epi16(sum, sum);
+}
+
+template <int num_taps>
+__m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
+                            const __m128i* const v_tap) {
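+  // Keep 16 bit intermediate precision for the 2d and compound paths; only
+  // the first stage rounding shift is applied here (one bit less than
+  // kInterRoundBitsHorizontal because the filter taps are halved).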
+  const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, bool is_2d = false, bool is_compound = false>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const __m128i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  // 4 tap filters are never used when width > 4.
+  if (num_taps != 4 && width > 4) {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        if (is_2d || is_compound) {
+          const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap);
+          if (is_2d) {
+            StoreAligned16(&dest16[x], v_sum);
+          } else {
+            StoreUnaligned16(&dest16[x], v_sum);
+          }
+        } else {
+          const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap);
+          StoreLo8(&dest8[x], result);
+        }
+        x += 8;
+      } while (x < width);
+      src += src_stride;
+      dest8 += pred_stride;
+      dest16 += pred_stride;
+    } while (--y != 0);
+    return;
+  }
+
+  // The horizontal pass only needs to account for |num_taps| values of 2 and
+  // 4 when |width| <= 4.
+  assert(width <= 4);
+  assert(num_taps <= 4);
+  if (num_taps <= 4) {
+    if (width == 4) {
+      int y = height;
+      do {
+        if (is_2d || is_compound) {
+          const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap);
+          StoreLo8(dest16, v_sum);
+        } else {
+          const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap);
+          Store4(&dest8[0], result);
+        }
+        src += src_stride;
+        dest8 += pred_stride;
+        dest16 += pred_stride;
+      } while (--y != 0);
+      return;
+    }
+
+    if (!is_compound) {
+      int y = height;
+      if (is_2d) y -= 1;
+      do {
+        if (is_2d) {
+          const __m128i sum =
+              HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
+          Store4(&dest16[0], sum);
+          dest16 += pred_stride;
+          Store4(&dest16[0], _mm_srli_si128(sum, 8));
+          dest16 += pred_stride;
+        } else {
+          const __m128i sum =
+              SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
+          Store2(dest8, sum);
+          dest8 += pred_stride;
+          Store2(dest8, _mm_srli_si128(sum, 4));
+          dest8 += pred_stride;
+        }
+
+        src += src_stride << 1;
+        y -= 2;
+      } while (y != 0);
+
+      // The 2d filters have an odd |height| because the horizontal pass
+      // generates context for the vertical pass.
+      if (is_2d) {
+        assert(height % 2 == 1);
+        __m128i sum;
+        const __m128i input = LoadLo8(&src[2]);
+        if (num_taps == 2) {
+          // 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_43 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+          sum = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+        } else {
+          // 02 03 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_32 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+          // 04 05 05 06 06 07 07 08 ...
+          const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+          const __m128i v_madd_32 =
+              _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+          const __m128i v_madd_54 =
+              _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+          sum = _mm_add_epi16(v_madd_54, v_madd_32);
+        }
+        sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+        Store4(dest16, sum);
+      }
+    }
+  }
+}
+
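+// |filter_index| 2 selects the 8 tap filter, 0 and 1 select 6 tap filters,
+// 4 and 5 select 4 tap filters, and 3 selects the 2 tap filter.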
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
+  assert(filter_id != 0);
+  __m128i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if (filter_index == 2) {  // 8 tap.
+    SetupTaps<8>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else if (filter_index == 0) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else if ((filter_index & 0x4) != 0) {  // 4 tap.
+    // ((filter_index == 4) | (filter_index == 5))
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  }
+}
+
+void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
+                       const ptrdiff_t reference_stride,
+                       const int horizontal_filter_index,
+                       const int vertical_filter_index,
+                       const int horizontal_filter_id,
+                       const int vertical_filter_id, const int width,
+                       const int height, void* LIBGAV1_RESTRICT prediction,
+                       const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  alignas(16) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+  memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
+  const int intermediate_height = height + vertical_taps - 1;
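+  // The horizontal pass produces |vertical_taps - 1| rows of context beyond
+  // |height| so the vertical pass has a full filter window for every output
+  // row.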
+
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+                                   width, intermediate_height,
+                                   horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  if (vertical_taps == 8) {
+    SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
+                          taps);
+    }
+  } else if (vertical_taps == 6) {
+    SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
+                          taps);
+    }
+  } else if (vertical_taps == 4) {
+    SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
+                          taps);
+    }
+  } else {  // |vertical_taps| == 2
+    SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
+                          taps);
+    }
+  }
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
+                    const ptrdiff_t src_stride,
+                    void* LIBGAV1_RESTRICT const dst,
+                    const ptrdiff_t dst_stride, const int width,
+                    const int height, const __m128i* const v_tap) {
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 8);
+
+  int x = 0;
+  do {
+    const uint8_t* src_x = src + x;
+    __m128i srcs[8];
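+    // |srcs| is a sliding window over the filter's input rows; each loop
+    // iteration below loads one new row and rotates the rest toward index 0.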
+    srcs[0] = LoadLo8(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadLo8(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadLo8(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadLo8(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadLo8(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadLo8(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadLo8(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    auto* dst8_x = dst8 + x;
+    auto* dst16_x = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = LoadLo8(src_x);
+      src_x += src_stride;
+
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16_x, results);
+        dst16_x += dst_stride;
+      } else {
+        const __m128i results =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        StoreLo8(dst8_x, _mm_packus_epi16(results, results));
+        dst8_x += dst_stride;
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (--y != 0);
+    x += 8;
+  } while (x < width);
+}
+
+void ConvolveVertical_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+  if (vertical_taps == 6) {  // 6 tap.
+    SetupTaps<6>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (vertical_taps == 8) {  // 8 tap.
+    SetupTaps<8>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (vertical_taps == 2) {  // 2 tap.
+    SetupTaps<2>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else {  // 4 tap.
+    SetupTaps<4>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  }
+}
+
+void ConvolveCompoundCopy_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const ptrdiff_t src_stride = reference_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  constexpr int kRoundBitsVertical =
+      kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
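+  // CompoundCopy only scales the source into the compound prediction range:
+  // a left shift of 11 - 7 == 4 bits with the 8 bpp constants.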
+  if (width >= 16) {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        const __m128i v_src = LoadUnaligned16(&src[x]);
+        const __m128i v_src_ext_lo = _mm_cvtepu8_epi16(v_src);
+        const __m128i v_src_ext_hi =
+            _mm_cvtepu8_epi16(_mm_srli_si128(v_src, 8));
+        const __m128i v_dest_lo =
+            _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
+        const __m128i v_dest_hi =
+            _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
+        StoreUnaligned16(&dest[x], v_dest_lo);
+        StoreUnaligned16(&dest[x + 8], v_dest_hi);
+        x += 16;
+      } while (x < width);
+      src += src_stride;
+      dest += pred_stride;
+    } while (--y != 0);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const __m128i v_src = LoadLo8(&src[0]);
+      const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+      const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+      StoreUnaligned16(&dest[0], v_dest);
+      src += src_stride;
+      dest += pred_stride;
+    } while (--y != 0);
+  } else {  // width == 4
+    int y = height;
+    do {
+      const __m128i v_src0 = Load4(&src[0]);
+      const __m128i v_src1 = Load4(&src[src_stride]);
+      const __m128i v_src = _mm_unpacklo_epi32(v_src0, v_src1);
+      const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+      const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+      StoreLo8(&dest[0], v_dest);
+      StoreHi8(&dest[pred_stride], v_dest);
+      src += src_stride * 2;
+      dest += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+void ConvolveCompoundVertical_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+  if (vertical_taps == 6) {  // 6 tap.
+    SetupTaps<6>(&v_filter, taps);
+    if (width == 4) {
+      FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else if (vertical_taps == 8) {  // 8 tap.
+    SetupTaps<8>(&v_filter, taps);
+    if (width == 4) {
+      FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else if (vertical_taps == 2) {  // 2 tap.
+    SetupTaps<2>(&v_filter, taps);
+    if (width == 4) {
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else {  // 4 tap.
+    SetupTaps<4>(&v_filter, taps);
+    if (width == 4) {
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  }
+}
+
+void ConvolveHorizontal_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                   horizontal_filter_id, filter_index);
+}
+
+void ConvolveCompoundHorizontal_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint16_t*>(prediction);
+
+  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+      src, reference_stride, dest, width, width, height, horizontal_filter_id,
+      filter_index);
+}
+
+void ConvolveCompound2D_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
+  // The output of the horizontal filter, i.e. |intermediate_result|, is
+  // guaranteed to fit in int16_t.
+  alignas(16) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+  memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  auto* dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  const ptrdiff_t dest_stride = width;
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  if (vertical_taps == 8) {
+    SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<8, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else if (vertical_taps == 6) {
+    SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<6, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else if (vertical_taps == 4) {
+    SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<4, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else {  // |vertical_taps| == 2
+    SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<2, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  }
+}
+
+// Pre-transposed filters.
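+// Each table row below holds one tap position across all 16 sub-pixel phases
+// (the columns), so per-pixel taps can be gathered with a single shuffle of
+// the phase indices.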
+template <int filter_index>
+inline void GetHalfSubPixelFilter(__m128i* output) {
+  // Filter 0
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] =
+      {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+       {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+       {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+       {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+       {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+       {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+  // Filter 1
+  alignas(16) static constexpr int8_t
+      kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = {
+          {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+          {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+          {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+          {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+          {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+          {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+  // Filter 2
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] =
+      {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0},
+       {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+       {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1},
+       {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+       {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+       {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3},
+       {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+       {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+  // Filter 3
+  alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] = {
+      {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+      {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+  // Filter 4
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel4TapSignedFilterColumns[4][16] =
+      {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
+       {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+       {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+       {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
+  // Filter 5
+  alignas(
+      16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+      {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+      {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+      {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+      {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+  switch (filter_index) {
+    case 0:
+      output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]);
+      output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]);
+      output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]);
+      break;
+    case 1:
+      // The term "mixed" refers to the fact that the outer taps have a mix of
+      // negative and positive values.
+      output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]);
+      output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]);
+      output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]);
+      break;
+    case 2:
+      output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]);
+      output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]);
+      output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]);
+      output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]);
+      output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]);
+      break;
+    case 3:
+      output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]);
+      break;
+    case 4:
+      output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]);
+      break;
+    default:
+      assert(filter_index == 5);
+      output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]);
+      output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]);
+      output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]);
+      output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]);
+      break;
+  }
+}
+
+// There are many opportunities for overreading in scaled convolve, because
+// the range of starting points for filter windows is anywhere from 0 to 16
+// for 8 destination pixels, and the window sizes range from 2 to 8. To
+// accommodate this range concisely, we use |grade_x| to mean the most steps
+// in src that can be traversed in a single |step_x| increment, i.e. 1 or 2.
+// More importantly, |grade_x| answers the question "how many vector loads are
+// needed to cover the source values?"
+// When |grade_x| == 1, the maximum number of source values needed is 8 separate
+// starting positions plus 7 more to cover taps, all fitting into 16 bytes.
+// When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for
+// every 8 |step_x| increments, on top of 8 possible taps. The first load covers
+// the starting sources for each kernel, while the final load covers the taps.
+// Since the offset value of src_x cannot exceed 8 and |num_taps| does not
+// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
+// |step_x|.
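+// For example, assuming kScaleSubPixelBits == 10, a |step_x| of at most
+// 1 << 10 advances |src| by no more than one pixel per destination pixel and
+// is handled with |grade_x| == 1; larger steps, up to 2 << 10, require
+// |grade_x| == 2.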
+template <int num_taps, int grade_x>
+inline void PrepareSourceVectors(const uint8_t* LIBGAV1_RESTRICT src,
+                                 const __m128i src_indices,
+                                 __m128i* const source /*[num_taps >> 1]*/) {
+  // |used_bytes| is only computed in msan builds. Mask away unused bytes for
+  // msan because it incorrectly models the outcome of the shuffles in some
+  // cases. This has not been reproduced out of context.
+  const int used_bytes = _mm_extract_epi8(src_indices, 15) + 1 + num_taps - 2;
+  const __m128i src_vals = LoadUnaligned16Msan(src, 16 - used_bytes);
+  source[0] = _mm_shuffle_epi8(src_vals, src_indices);
+  if (grade_x == 1) {
+    if (num_taps > 2) {
+      source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices);
+    }
+    if (num_taps > 4) {
+      source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices);
+    }
+    if (num_taps > 6) {
+      source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices);
+    }
+  } else {
+    assert(grade_x > 1);
+    assert(num_taps != 4);
+    // grade_x > 1 also means width >= 8 && num_taps != 4
+    const __m128i src_vals_ext = LoadLo8Msan(src + 16, 24 - used_bytes);
+    if (num_taps > 2) {
+      source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
+                                   src_indices);
+      source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4),
+                                   src_indices);
+    }
+    if (num_taps > 6) {
+      source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6),
+                                   src_indices);
+    }
+  }
+}
+
+template <int num_taps>
+inline void PrepareHorizontalTaps(const __m128i subpel_indices,
+                                  const __m128i* filter_taps,
+                                  __m128i* out_taps) {
+  const __m128i scale_index_offsets =
+      _mm_srli_epi16(subpel_indices, kFilterIndexShift);
+  const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask);
+  const __m128i filter_indices =
+      _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets),
+                    filter_index_mask);
+  // Line up taps for maddubs_epi16.
+  // The unpack is also assumed to be lighter than shift+alignr.
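+  // After the unpack, out_taps[k] holds taps 2k and 2k + 1 interleaved per
+  // pixel, matching the byte pairs consumed by _mm_maddubs_epi16.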
+  for (int k = 0; k < (num_taps >> 1); ++k) {
+    const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices);
+    const __m128i taps1 =
+        _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices);
+    out_taps[k] = _mm_unpacklo_epi8(taps0, taps1);
+  }
+}
+
+inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
+  const __m128i src_indices16 =
+      _mm_srli_epi16(subpel_indices, kScaleSubPixelBits);
+  const __m128i src_indices = _mm_packus_epi16(src_indices16, src_indices16);
+  return _mm_unpacklo_epi8(src_indices,
+                           _mm_add_epi8(src_indices, _mm_set1_epi8(1)));
+}
+
+template <int grade_x, int filter_index, int num_taps>
+inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
+                                    ptrdiff_t src_stride, int width,
+                                    int subpixel_x, int step_x,
+                                    int intermediate_height,
+                                    int16_t* LIBGAV1_RESTRICT intermediate) {
+  // Account for the zero taps that precede the |num_taps| nonzero taps.
+  const int kernel_offset = (8 - num_taps) >> 1;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  __m128i filter_taps[num_taps];
+  GetHalfSubPixelFilter<filter_index>(filter_taps);
+  const __m128i index_steps =
+      _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
+                      _mm_set1_epi16(static_cast<int16_t>(step_x)));
+
+  __m128i taps[num_taps >> 1];
+  __m128i source[num_taps >> 1];
+  int p = subpixel_x;
+  // Case when width <= 4 is possible.
+  if (filter_index >= 3) {
+    if (filter_index > 3 || width <= 4) {
+      const uint8_t* src_x =
+          &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+      // Only add steps to the 10-bit truncated p to avoid overflow.
+      const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+      const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+      PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+      const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+      int y = intermediate_height;
+      do {
+        // Load and line up source values with the taps. Width 4 means no need
+        // to load extended source.
+        PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices,
+                                                      source);
+
+        StoreLo8(intermediate, RightShiftWithRounding_S16(
+                                   SumOnePassTaps<num_taps>(source, taps),
+                                   kInterRoundBitsHorizontal - 1));
+        src_x += src_stride;
+        intermediate += kIntermediateStride;
+      } while (--y != 0);
+      return;
+    }
+  }
+
+  // |width| >= 8
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+    const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+    PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+    const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+    int y = intermediate_height;
+    do {
+      // For each x, a lane of src_k[k] contains src_x[k].
+      PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
+
+      // Shift by one less because the taps are halved.
+      StoreAligned16(intermediate_x, RightShiftWithRounding_S16(
+                                         SumOnePassTaps<num_taps>(source, taps),
+                                         kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+template <int num_taps>
+inline void PrepareVerticalTaps(const int8_t* LIBGAV1_RESTRICT taps,
+                                __m128i* output) {
+  // Avoid overreading the filter due to starting at kernel_offset.
+  // The only danger of overread is in the final filter, which has 4 taps.
+  const __m128i filter =
+      _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps));
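+  // Each shuffle below broadcasts one 32-bit pair of 16-bit taps to every
+  // lane, ready to be fed to madd_epi16 against interleaved source rows.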
+  output[0] = _mm_shuffle_epi32(filter, 0);
+  if (num_taps > 2) {
+    output[1] = _mm_shuffle_epi32(filter, 0x55);
+  }
+  if (num_taps > 4) {
+    output[2] = _mm_shuffle_epi32(filter, 0xAA);
+  }
+  if (num_taps > 6) {
+    output[3] = _mm_shuffle_epi32(filter, 0xFF);
+  }
+}
+
+// Processes eight 16-bit inputs and outputs eight 16-bit values.
+template <int num_taps, bool is_compound>
+inline __m128i Sum2DVerticalTaps(const __m128i* const src,
+                                 const __m128i* taps) {
+  const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+  __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]);
+  const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+  __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]);
+  if (num_taps > 2) {
+    const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1]));
+    const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1]));
+  }
+  if (num_taps > 4) {
+    const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2]));
+    const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2]));
+  }
+  if (num_taps > 6) {
+    const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3]));
+    const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3]));
+  }
+  if (is_compound) {
+    return _mm_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+  return _mm_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// Bottom half of each src[k] is the source for one filter, and the top half
+// is the source for the other filter, for the next destination row.
+template <int num_taps, bool is_compound>
+__m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
+                             const __m128i* taps_hi) {
+  const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+  __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]);
+  const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+  __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]);
+  if (num_taps > 2) {
+    const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1]));
+    const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1]));
+  }
+  if (num_taps > 4) {
+    const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2]));
+    const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2]));
+  }
+  if (num_taps > 6) {
+    const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3]));
+    const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3]));
+  }
+
+  if (is_compound) {
+    return _mm_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+  return _mm_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// |width_class| is 2, 4, or 8, according to the Store function that should be
+// used.
+template <int num_taps, int width_class, bool is_compound>
+inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT src,
+                                  const int intermediate_height,
+                                  const int width, const int subpixel_y,
+                                  const int filter_index, const int step_y,
+                                  const int height, void* LIBGAV1_RESTRICT dest,
+                                  const ptrdiff_t dest_stride) {
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  constexpr int kernel_offset = (8 - num_taps) / 2;
+  const int16_t* src_y = src;
+  // |dest| is 16-bit in compound mode, Pixel otherwise.
+  auto* dest16_y = static_cast<uint16_t*>(dest);
+  auto* dest_y = static_cast<uint8_t*>(dest);
+  __m128i s[num_taps];
+
+  int p = subpixel_y & 1023;
+  int y = height;
+  if (width_class <= 4) {
+    __m128i filter_taps_lo[num_taps >> 1];
+    __m128i filter_taps_hi[num_taps >> 1];
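+    // Two output rows are produced per iteration: the low halves of s[] are
+    // filtered with |filter_taps_lo| and the high halves with
+    // |filter_taps_hi|, each using its own filter phase.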
+    do {  // y > 0
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = LoadLo8(src_y + i * src_stride);
+      }
+      int filter_id = (p >> 6) & kSubPixelMask;
+      const int8_t* filter0 =
+          kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+      PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo);
+      p += step_y;
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = LoadHi8(s[i], src_y + i * src_stride);
+      }
+      filter_id = (p >> 6) & kSubPixelMask;
+      const int8_t* filter1 =
+          kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+      PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi);
+      p += step_y;
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+      const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>(
+          s, filter_taps_lo, filter_taps_hi);
+      if (is_compound) {
+        assert(width_class > 2);
+        StoreLo8(dest16_y, sums);
+        dest16_y += dest_stride;
+        StoreHi8(dest16_y, sums);
+        dest16_y += dest_stride;
+      } else {
+        const __m128i result = _mm_packus_epi16(sums, sums);
+        if (width_class == 2) {
+          Store2(dest_y, result);
+          dest_y += dest_stride;
+          Store2(dest_y, _mm_srli_si128(result, 4));
+        } else {
+          Store4(dest_y, result);
+          dest_y += dest_stride;
+          Store4(dest_y, _mm_srli_si128(result, 4));
+        }
+        dest_y += dest_stride;
+      }
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  // |width_class| >= 8
+  __m128i filter_taps[num_taps >> 1];
+  int x = 0;
+  do {  // x < width
+    auto* dest_y = static_cast<uint8_t*>(dest) + x;
+    auto* dest16_y = static_cast<uint16_t*>(dest) + x;
+    int p = subpixel_y & 1023;
+    int y = height;
+    do {  // y > 0
+      const int filter_id = (p >> 6) & kSubPixelMask;
+      const int8_t* filter =
+          kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+      PrepareVerticalTaps<num_taps>(filter, filter_taps);
+
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = LoadUnaligned16(src_y + i * src_stride);
+      }
+
+      const __m128i sums =
+          Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
+      if (is_compound) {
+        StoreUnaligned16(dest16_y, sums);
+      } else {
+        StoreLo8(dest_y, _mm_packus_epi16(sums, sums));
+      }
+      p += step_y;
+      dest_y += dest_stride;
+      dest16_y += dest_stride;
+    } while (--y != 0);
+    src += kIntermediateStride * intermediate_height;
+    x += 8;
+  } while (x < width);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
+                            const ptrdiff_t reference_stride,
+                            const int horizontal_filter_index,
+                            const int vertical_filter_index,
+                            const int subpixel_x, const int subpixel_y,
+                            const int step_x, const int step_y, const int width,
+                            const int height, void* LIBGAV1_RESTRICT prediction,
+                            const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  assert(step_x <= 2048);
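+  // A |step_x| of 2048 is two source pixels per output pixel
+  // (kScaleSubPixelBits == 10), i.e. a 2x ratio, the maximum allowed for
+  // scaled references.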
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  alignas(16) int16_t
+      intermediate_result[kIntermediateAllocWidth *
+                          (2 * kIntermediateAllocWidth + kSubPixelTaps)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+  memset(intermediate_result, 0x44, sizeof(intermediate_result));
+#endif
+  const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      num_vert_taps;
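+  // E.g. for height == 8, step_y == 1536, and an 8-tap vertical filter:
+  // ((7 * 1536 + 1023) >> 10) + 8 = 19 intermediate rows.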
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+  // Similarly for height.
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+  src += vert_kernel_offset * src_stride;
+
+  // Derive the maximum value of |step_x| at which all source values fit in one
+  // 16-byte load: the final index, src_x + |num_taps| - 1, must be < 16.
+  // step_x * 7 is the final base sub-pixel index for the shuffle mask for
+  // filter inputs in each iteration on large blocks. When step_x is large, a
+  // second register and alignr are needed to gather all filter inputs.
+  // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
+  const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index);
+  const int kernel_start_ceiling = 16 - num_horiz_taps;
+  // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+  // (step_x * 7) >> kScaleSubPixelBits < single load limit
+  const int grade_x_threshold =
+      (kernel_start_ceiling << kScaleSubPixelBits) / 7;
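+  // E.g. for an 8-tap filter, kernel_start_ceiling is 8 and the threshold is
+  // (8 << 10) / 7 = 1170; any larger |step_x| takes the grade_x == 2 paths.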
+  switch (horiz_filter_index) {
+    case 0:
+      if (step_x > grade_x_threshold) {
+        ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      } else {
+        ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      }
+      break;
+    case 1:
+      if (step_x > grade_x_threshold) {
+        ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+
+      } else {
+        ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      }
+      break;
+    case 2:
+      if (step_x > grade_x_threshold) {
+        ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      } else {
+        ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      }
+      break;
+    case 3:
+      if (step_x > grade_x_threshold) {
+        ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      } else {
+        ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      }
+      break;
+    case 4:
+      assert(width <= 4);
+      ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x,
+                                       step_x, intermediate_height,
+                                       intermediate);
+      break;
+    default:
+      assert(horiz_filter_index == 5);
+      assert(width <= 4);
+      ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x,
+                                       step_x, intermediate_height,
+                                       intermediate);
+  }
+
+  // Vertical filter.
+  intermediate = intermediate_result;
+  switch (vert_filter_index) {
+    case 0:
+    case 1:
+      if (!is_compound && width == 2) {
+        ConvolveVerticalScale<6, 2, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      } else if (width == 4) {
+        ConvolveVerticalScale<6, 4, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      } else {
+        ConvolveVerticalScale<6, 8, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      }
+      break;
+    case 2:
+      if (!is_compound && width == 2) {
+        ConvolveVerticalScale<8, 2, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      } else if (width == 4) {
+        ConvolveVerticalScale<8, 4, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      } else {
+        ConvolveVerticalScale<8, 8, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      }
+      break;
+    case 3:
+      if (!is_compound && width == 2) {
+        ConvolveVerticalScale<2, 2, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      } else if (width == 4) {
+        ConvolveVerticalScale<2, 4, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      } else {
+        ConvolveVerticalScale<2, 8, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      }
+      break;
+    default:
+      assert(vert_filter_index == 4 || vert_filter_index == 5);
+      if (!is_compound && width == 2) {
+        ConvolveVerticalScale<4, 2, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      } else if (width == 4) {
+        ConvolveVerticalScale<4, 4, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      } else {
+        ConvolveVerticalScale<4, 8, is_compound>(
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
+      }
+  }
+}
+
+inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                              uint8_t* LIBGAV1_RESTRICT dst) {
+  const __m128i left = LoadUnaligned16(src);
+  const __m128i right = LoadUnaligned16(src + 1);
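+  // _mm_avg_epu8 computes (left + right + 1) >> 1, the rounded half-pel
+  // average.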
+  StoreUnaligned16(dst, _mm_avg_epu8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                                     const ptrdiff_t src_stride,
+                                     const int height,
+                                     uint8_t* LIBGAV1_RESTRICT dst,
+                                     const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+  int y = height;
+  do {
+    HalfAddHorizontal(src, dst);
+    if (width >= 32) {
+      src += 16;
+      dst += 16;
+      HalfAddHorizontal(src, dst);
+      if (width >= 64) {
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        if (width == 128) {
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyHorizontal_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*subpixel_x*/,
+    const int /*subpixel_y*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+                                  pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const __m128i left = LoadLo8(src);
+      const __m128i right = LoadLo8(src + 1);
+      StoreLo8(dest, _mm_avg_epu8(left, right));
+
+      src += reference_stride;
+      dest += pred_stride;
+    } while (--y != 0);
+  } else if (width == 4) {
+    int y = height;
+    do {
+      __m128i left = Load4(src);
+      __m128i right = Load4(src + 1);
+      src += reference_stride;
+      left = _mm_unpacklo_epi32(left, Load4(src));
+      right = _mm_unpacklo_epi32(right, Load4(src + 1));
+      src += reference_stride;
+
+      const __m128i result = _mm_avg_epu8(left, right);
+
+      Store4(dest, result);
+      dest += pred_stride;
+      Store4(dest, _mm_srli_si128(result, 4));
+      dest += pred_stride;
+      y -= 2;
+    } while (y != 0);
+  } else {
+    assert(width == 2);
+    __m128i left = _mm_setzero_si128();
+    __m128i right = _mm_setzero_si128();
+    int y = height;
+    do {
+      left = Load2<0>(src, left);
+      right = Load2<0>(src + 1, right);
+      src += reference_stride;
+      left = Load2<1>(src, left);
+      right = Load2<1>(src + 1, right);
+      src += reference_stride;
+
+      const __m128i result = _mm_avg_epu8(left, right);
+
+      Store2(dest, result);
+      dest += pred_stride;
+      Store2(dest, _mm_srli_si128(result, 2));
+      dest += pred_stride;
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src,
+                                   const ptrdiff_t src_stride, const int height,
+                                   uint8_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+  __m128i row[8], below[8];
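+  // row[] carries the previous source row across iterations so that each
+  // source row is loaded only once.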
+
+  row[0] = LoadUnaligned16(src);
+  if (width >= 32) {
+    src += 16;
+    row[1] = LoadUnaligned16(src);
+    if (width >= 64) {
+      src += 16;
+      row[2] = LoadUnaligned16(src);
+      src += 16;
+      row[3] = LoadUnaligned16(src);
+      if (width == 128) {
+        src += 16;
+        row[4] = LoadUnaligned16(src);
+        src += 16;
+        row[5] = LoadUnaligned16(src);
+        src += 16;
+        row[6] = LoadUnaligned16(src);
+        src += 16;
+        row[7] = LoadUnaligned16(src);
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = height;
+  do {
+    below[0] = LoadUnaligned16(src);
+    if (width >= 32) {
+      src += 16;
+      below[1] = LoadUnaligned16(src);
+      if (width >= 64) {
+        src += 16;
+        below[2] = LoadUnaligned16(src);
+        src += 16;
+        below[3] = LoadUnaligned16(src);
+        if (width == 128) {
+          src += 16;
+          below[4] = LoadUnaligned16(src);
+          src += 16;
+          below[5] = LoadUnaligned16(src);
+          src += 16;
+          below[6] = LoadUnaligned16(src);
+          src += 16;
+          below[7] = LoadUnaligned16(src);
+        }
+      }
+    }
+    src += src_remainder_stride;
+
+    StoreUnaligned16(dst, _mm_avg_epu8(row[0], below[0]));
+    row[0] = below[0];
+    if (width >= 32) {
+      dst += 16;
+      StoreUnaligned16(dst, _mm_avg_epu8(row[1], below[1]));
+      row[1] = below[1];
+      if (width >= 64) {
+        dst += 16;
+        StoreUnaligned16(dst, _mm_avg_epu8(row[2], below[2]));
+        row[2] = below[2];
+        dst += 16;
+        StoreUnaligned16(dst, _mm_avg_epu8(row[3], below[3]));
+        row[3] = below[3];
+        if (width == 128) {
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[4], below[4]));
+          row[4] = below[4];
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[5], below[5]));
+          row[5] = below[5];
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[6], below[6]));
+          row[6] = below[6];
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[7], below[7]));
+          row[7] = below[7];
+        }
+      }
+    }
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+                                pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 8) {
+    __m128i row, below;
+    row = LoadLo8(src);
+    src += reference_stride;
+
+    int y = height;
+    do {
+      below = LoadLo8(src);
+      src += reference_stride;
+
+      StoreLo8(dest, _mm_avg_epu8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (--y != 0);
+  } else if (width == 4) {
+    __m128i row = Load4(src);
+    src += reference_stride;
+
+    int y = height;
+    do {
+      __m128i below = Load4(src);
+      src += reference_stride;
+
+      Store4(dest, _mm_avg_epu8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (--y != 0);
+  } else {
+    assert(width == 2);
+    __m128i row = Load2(src);
+    __m128i below = _mm_setzero_si128();
+    src += reference_stride;
+
+    int y = height;
+    do {
+      below = Load2<0>(src, below);
+      src += reference_stride;
+
+      Store2(dest, _mm_avg_epu8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (--y != 0);
+  }
+}
+
+// Load two uint8_t vectors, widen them to uint16_t, and return their sum.
+inline __m128i LoadU8AndAddLong(const uint8_t* LIBGAV1_RESTRICT src,
+                                const uint8_t* LIBGAV1_RESTRICT src1) {
+  const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
+  const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
+  return _mm_add_epi16(a, b);
+}
+
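+// Computes ((v0 + v1 + 2) >> 2) per lane and packs the result to uint8_t.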
+inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
+  const __m128i a = _mm_add_epi16(v0, v1);
+  const __m128i b = _mm_srli_epi16(a, 1);
+  // Use avg here to shift right by 1 with round.
+  const __m128i c = _mm_avg_epu16(b, _mm_setzero_si128());
+  return _mm_packus_epi16(c, c);
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src,
+                             const ptrdiff_t src_stride, const int height,
+                             uint8_t* LIBGAV1_RESTRICT dst,
+                             const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+  __m128i row[16];
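+  // row[] caches the horizontal pixel sums of the previous source row so that
+  // each source row is loaded and summed only once.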
+  row[0] = LoadU8AndAddLong(src, src + 1);
+  if (width >= 16) {
+    src += 8;
+    row[1] = LoadU8AndAddLong(src, src + 1);
+    if (width >= 32) {
+      src += 8;
+      row[2] = LoadU8AndAddLong(src, src + 1);
+      src += 8;
+      row[3] = LoadU8AndAddLong(src, src + 1);
+      if (width >= 64) {
+        src += 8;
+        row[4] = LoadU8AndAddLong(src, src + 1);
+        src += 8;
+        row[5] = LoadU8AndAddLong(src, src + 1);
+        src += 8;
+        row[6] = LoadU8AndAddLong(src, src + 1);
+        src += 8;
+        row[7] = LoadU8AndAddLong(src, src + 1);
+        if (width == 128) {
+          src += 8;
+          row[8] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[9] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[10] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[11] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[12] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[13] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[14] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[15] = LoadU8AndAddLong(src, src + 1);
+        }
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = height;
+  do {
+    const __m128i below_0 = LoadU8AndAddLong(src, src + 1);
+    StoreLo8(dst, AddU16RightShift2AndPack(row[0], below_0));
+    row[0] = below_0;
+    if (width >= 16) {
+      src += 8;
+      dst += 8;
+
+      const __m128i below_1 = LoadU8AndAddLong(src, src + 1);
+      StoreLo8(dst, AddU16RightShift2AndPack(row[1], below_1));
+      row[1] = below_1;
+      if (width >= 32) {
+        src += 8;
+        dst += 8;
+
+        const __m128i below_2 = LoadU8AndAddLong(src, src + 1);
+        StoreLo8(dst, AddU16RightShift2AndPack(row[2], below_2));
+        row[2] = below_2;
+        src += 8;
+        dst += 8;
+
+        const __m128i below_3 = LoadU8AndAddLong(src, src + 1);
+        StoreLo8(dst, AddU16RightShift2AndPack(row[3], below_3));
+        row[3] = below_3;
+        if (width >= 64) {
+          src += 8;
+          dst += 8;
+
+          const __m128i below_4 = LoadU8AndAddLong(src, src + 1);
+          StoreLo8(dst, AddU16RightShift2AndPack(row[4], below_4));
+          row[4] = below_4;
+          src += 8;
+          dst += 8;
+
+          const __m128i below_5 = LoadU8AndAddLong(src, src + 1);
+          StoreLo8(dst, AddU16RightShift2AndPack(row[5], below_5));
+          row[5] = below_5;
+          src += 8;
+          dst += 8;
+
+          const __m128i below_6 = LoadU8AndAddLong(src, src + 1);
+          StoreLo8(dst, AddU16RightShift2AndPack(row[6], below_6));
+          row[6] = below_6;
+          src += 8;
+          dst += 8;
+
+          const __m128i below_7 = LoadU8AndAddLong(src, src + 1);
+          StoreLo8(dst, AddU16RightShift2AndPack(row[7], below_7));
+          row[7] = below_7;
+          if (width == 128) {
+            src += 8;
+            dst += 8;
+
+            const __m128i below_8 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[8], below_8));
+            row[8] = below_8;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_9 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[9], below_9));
+            row[9] = below_9;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_10 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[10], below_10));
+            row[10] = below_10;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_11 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[11], below_11));
+            row[11] = below_11;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_12 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[12], below_12));
+            row[12] = below_12;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_13 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[13], below_13));
+            row[13] = below_13;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_14 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[14], below_14));
+            row[14] = below_14;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_15 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[15], below_15));
+            row[15] = below_15;
+          }
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopy2D_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: vertical access of height + 1 rows is allowed. Because this
+  // function is only used for the u/v planes of intra block copy, such access
+  // is guaranteed to stay within the prediction block.
+
+  if (width == 128) {
+    IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 8) {
+    IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 4) {
+    __m128i left = _mm_cvtepu8_epi16(Load4(src));
+    __m128i right = _mm_cvtepu8_epi16(Load4(src + 1));
+    src += reference_stride;
+
+    __m128i row = _mm_add_epi16(left, right);
+
+    int y = height;
+    do {
+      left = Load4(src);
+      right = Load4(src + 1);
+      src += reference_stride;
+      left = _mm_unpacklo_epi32(left, Load4(src));
+      right = _mm_unpacklo_epi32(right, Load4(src + 1));
+      src += reference_stride;
+
+      const __m128i below =
+          _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+      const __m128i result =
+          AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+      Store4(dest, result);
+      dest += pred_stride;
+      Store4(dest, _mm_srli_si128(result, 4));
+      dest += pred_stride;
+
+      row = _mm_srli_si128(below, 8);
+      y -= 2;
+    } while (y != 0);
+  } else {
+    __m128i left = Load2(src);
+    __m128i right = Load2(src + 1);
+    src += reference_stride;
+
+    __m128i row =
+        _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+
+    int y = height;
+    do {
+      left = Load2<0>(src, left);
+      right = Load2<0>(src + 1, right);
+      src += reference_stride;
+      left = Load2<2>(src, left);
+      right = Load2<2>(src + 1, right);
+      src += reference_stride;
+
+      const __m128i below =
+          _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+      const __m128i result =
+          AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+      Store2(dest, result);
+      dest += pred_stride;
+      Store2(dest, _mm_srli_si128(result, 4));
+      dest += pred_stride;
+
+      row = _mm_srli_si128(below, 8);
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
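+  // The convolve table is indexed by
+  // [is_intra_block_copy][is_compound][has_vertical_filter]
+  // [has_horizontal_filter].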
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
+  dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
+
+  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_SSE4_1;
+  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_SSE4_1;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_SSE4_1;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
+  dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/convolve_sse4.h b/src/dsp/x86/convolve_sse4.h
new file mode 100644 (file)
index 0000000..d6c3155
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve; see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
diff --git a/src/dsp/x86/convolve_sse4.inc b/src/dsp/x86/convolve_sse4.inc
new file mode 100644 (file)
index 0000000..5548c5b
--- /dev/null
@@ -0,0 +1,974 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Common 128-bit functions used by the sse4/avx2 convolve implementations.
+// This file is included inside an anonymous namespace in the files where
+// these functions are needed.
+
+#include "src/dsp/convolve.inc"
+
+// This version checks for the special cases when filter_index == 1.
+int GetNumTapsInFilter(const int filter_index, const int filter_id) {
+  if (filter_index == 0) {
+    // Despite the names these only use 6 taps.
+    // kInterpolationFilterEightTap
+    // kInterpolationFilterEightTapSmooth
+    return 6;
+  }
+
+  if (filter_index == 1) {
+    // Despite the names these only use 6 taps.
+    // kInterpolationFilterEightTap
+    // kInterpolationFilterEightTapSmooth
+    if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) |
+         (filter_id == 8) | (filter_id == 9)) != 0) {
+      return 6;
+    }
+    // When |filter_index| == 1, the |filter_id| values not listed above map to
+    // 4 tap filters.
+    return 4;
+  }
+
+  if (filter_index == 2) {
+    // kInterpolationFilterEightTapSharp
+    return 8;
+  }
+
+  if (filter_index == 3) {
+    // kInterpolationFilterBilinear
+    return 2;
+  }
+
+  assert(filter_index > 3);
+  // For small sizes (width/height <= 4) the large filters are replaced with 4
+  // tap options.
+  // If the original filters were |kInterpolationFilterEightTap| or
+  // |kInterpolationFilterEightTapSharp| then it becomes
+  // |kInterpolationFilterSwitchable|.
+  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+  // tap filter.
+  return 4;
+}
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1, which prevents the final
+// sum from overflowing int16_t.
+template <int num_taps>
+__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
+  __m128i sum;
+  if (num_taps == 6) {
+    // 6 taps.
+    const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]);  // k2k1
+    const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]);  // k4k3
+    const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]);  // k6k5
+    sum = _mm_add_epi16(v_madd_21, v_madd_43);
+    sum = _mm_add_epi16(sum, v_madd_65);
+  } else if (num_taps == 8) {
+    // 8 taps.
+    const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]);  // k1k0
+    const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]);  // k3k2
+    const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]);  // k5k4
+    const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]);  // k7k6
+    const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
+    const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
+    sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
+  } else if (num_taps == 2) {
+    // 2 taps.
+    sum = _mm_maddubs_epi16(src[0], taps[0]);  // k4k3
+  } else {
+    // 4 taps.
+    const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]);  // k3k2
+    const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]);  // k5k4
+    sum = _mm_add_epi16(v_madd_32, v_madd_54);
+  }
+  return sum;
+}
+
+template <int num_taps>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                             const __m128i* const v_tap) {
+  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+  const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
+
+  if (num_taps == 2) {
+    // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
+    const __m128i v_src_43 = _mm_shuffle_epi8(
+        v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
+    const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+    return v_sum_43;
+  }
+
+  // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+  const __m128i v_src_32 = _mm_shuffle_epi8(
+      v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
+  // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
+  const __m128i v_src_54 = _mm_shuffle_epi8(
+      v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c,
+                           static_cast<int>(0x80070706), 0x06050504));
+  const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+  const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+  const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+  return v_sum_5432;
+}
+
+template <int num_taps>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                                const __m128i* const v_tap) {
+  __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
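+  // With kInterRoundBitsHorizontal == 3 and kFilterBits == 7, this is
+  // (sum + 2 + 32) >> 6, identical to applying the two rounded shifts in
+  // sequence.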
+  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm_packus_epi16(sum, sum);
+}
+
+template <int num_taps>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                                const __m128i* const v_tap) {
+  const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+                                     __m128i* v_tap) {
+  if (num_taps == 8) {
+    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0);   // k1k0
+    v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
+    v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
+    v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff);  // k7k6
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+      v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+      v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+    }
+  } else if (num_taps == 6) {
+    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0);   // k2k1
+    v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
+    v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa);  // k6k5
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+    }
+  } else if (num_taps == 4) {
+    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
+    v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+    }
+  } else {  // num_taps == 2
+    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+    }
+  }
+}
+
+template <int num_taps, bool is_compound>
+__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
+                                const __m128i* const taps) {
+  __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
+  __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
+  if (num_taps >= 4) {
+    __m128i madd_lo =
+        _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
+    __m128i madd_hi =
+        _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
+    sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+    sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+    if (num_taps >= 6) {
+      madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
+      madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
+      sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+      sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+      if (num_taps == 8) {
+        madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
+        madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
+        sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+        sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+      }
+    }
+  }
+
+  if (is_compound) {
+    return _mm_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+
+  return _mm_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+                      const ptrdiff_t dst_stride, const int width,
+                      const int height, const __m128i* const taps) {
+  assert(width >= 8);
+  constexpr int next_row = num_taps - 1;
+  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+  const ptrdiff_t src_stride = width;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int x = 0;
+  do {
+    __m128i srcs[8];
+    const uint16_t* src_x = src + x;
+    srcs[0] = LoadAligned16(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadAligned16(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadAligned16(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadAligned16(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadAligned16(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadAligned16(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadAligned16(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    auto* dst8_x = dst8 + x;
+    auto* dst16_x = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = LoadAligned16(src_x);
+      src_x += src_stride;
+
+      const __m128i sum =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+      if (is_compound) {
+        StoreUnaligned16(dst16_x, sum);
+        dst16_x += dst_stride;
+      } else {
+        StoreLo8(dst8_x, _mm_packus_epi16(sum, sum));
+        dst8_x += dst_stride;
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (--y != 0);
+    x += 8;
+  } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const __m128i* const taps) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  __m128i srcs[9];
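+  // Rows are packed two per register: srcs[k] holds row k in its low half and
+  // row k + 1 in its high half.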
+  srcs[0] = LoadAligned16(src);
+  src += 8;
+  if (num_taps >= 4) {
+    srcs[2] = LoadAligned16(src);
+    src += 8;
+    srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
+    if (num_taps >= 6) {
+      srcs[4] = LoadAligned16(src);
+      src += 8;
+      srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
+      if (num_taps == 8) {
+        srcs[6] = LoadAligned16(src);
+        src += 8;
+        srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[num_taps] = LoadAligned16(src);
+    src += 8;
+    srcs[num_taps - 1] = _mm_unpacklo_epi64(
+        _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
+
+    const __m128i sum =
+        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+    if (is_compound) {
+      StoreUnaligned16(dst16, sum);
+      dst16 += 4 << 1;
+    } else {
+      const __m128i results = _mm_packus_epi16(sum, sum);
+      Store4(dst8, results);
+      dst8 += dst_stride;
+      Store4(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const __m128i* const taps) {
+  constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  __m128i srcs[9];
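+  // Each aligned 16-byte load covers four 2-pixel rows; _mm_alignr_epi8 by
+  // 4 * k bytes yields the window starting k rows later.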
+  srcs[0] = LoadAligned16(src);
+  src += 8;
+  if (num_taps >= 6) {
+    srcs[4] = LoadAligned16(src);
+    src += 8;
+    srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+    if (num_taps == 8) {
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row] = LoadAligned16(src);
+    src += 8;
+    if (num_taps == 2) {
+      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+    } else if (num_taps == 4) {
+      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+    } else if (num_taps == 6) {
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+    } else if (num_taps == 8) {
+      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+      srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
+      srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
+    }
+
+    const __m128i sum =
+        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+    const __m128i results = _mm_packus_epi16(sum, sum);
+
+    Store2(dst8, results);
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 2));
+    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+    // Therefore we don't need to check this condition when |height| > 4.
+    if (num_taps <= 4 && height == 2) return;
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 4));
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 6));
+    dst8 += dst_stride;
+
+    srcs[0] = srcs[4];
+    if (num_taps == 6) {
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+    } else if (num_taps == 8) {
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+    }
+
+    y -= 4;
+  } while (y != 0);
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// vertical calculations.
+__m128i Compound1DShift(const __m128i sum) {
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
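+// i.e. each 16-bit lane becomes (sum + (1 << (kInterRoundBitsHorizontal - 2)))
+// >> (kInterRoundBitsHorizontal - 1).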
+
+template <int num_taps>
+__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
+  __m128i v_src[4];
+
+  if (num_taps == 6) {
+    // 6 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+  } else if (num_taps == 8) {
+    // 8 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+    v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
+  } else if (num_taps == 2) {
+    // 2 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+  } else {
+    // 4 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+  }
+  const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+  return sum;
+}
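+// Interleaving adjacent rows byte-wise lets SumOnePassTaps consume the filter
+// taps in pairs, one multiply-add per v_src register.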
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int height, const __m128i* const v_tap) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  __m128i srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+
+    int y = height;
+    do {
+      // 10 11 12 13
+      const __m128i a = Load4(src);
+      // 00 01 02 03 10 11 12 13
+      srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+      src += src_stride;
+      // 20 21 22 23
+      srcs[2] = Load4(src);
+      src += src_stride;
+      // 10 11 12 13 20 21 22 23
+      srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 4) {
+    srcs[4] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13
+    const __m128i a = Load4(src);
+    // 00 01 02 03 10 11 12 13
+    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+    src += src_stride;
+    // 20 21 22 23
+    srcs[2] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13 20 21 22 23
+    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+    int y = height;
+    do {
+      // 30 31 32 33
+      const __m128i b = Load4(src);
+      // 20 21 22 23 30 31 32 33
+      srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+      src += src_stride;
+      // 40 41 42 43
+      srcs[4] = Load4(src);
+      src += src_stride;
+      // 30 31 32 33 40 41 42 43
+      srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 6) {
+    srcs[6] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13
+    const __m128i a = Load4(src);
+    // 00 01 02 03 10 11 12 13
+    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+    src += src_stride;
+    // 20 21 22 23
+    srcs[2] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13 20 21 22 23
+    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+    // 30 31 32 33
+    const __m128i b = Load4(src);
+    // 20 21 22 23 30 31 32 33
+    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+    src += src_stride;
+    // 40 41 42 43
+    srcs[4] = Load4(src);
+    src += src_stride;
+    // 30 31 32 33 40 41 42 43
+    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+    int y = height;
+    do {
+      // 50 51 52 53
+      const __m128i c = Load4(src);
+      // 40 41 42 43 50 51 52 53
+      srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+      src += src_stride;
+      // 60 61 62 63
+      srcs[6] = Load4(src);
+      src += src_stride;
+      // 50 51 52 53 60 61 62 63
+      srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 8) {
+    srcs[8] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13
+    const __m128i a = Load4(src);
+    // 00 01 02 03 10 11 12 13
+    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+    src += src_stride;
+    // 20 21 22 23
+    srcs[2] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13 20 21 22 23
+    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+    // 30 31 32 33
+    const __m128i b = Load4(src);
+    // 20 21 22 23 30 31 32 33
+    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+    src += src_stride;
+    // 40 41 42 43
+    srcs[4] = Load4(src);
+    src += src_stride;
+    // 30 31 32 33 40 41 42 43
+    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+    // 50 51 52 53
+    const __m128i c = Load4(src);
+    // 40 41 42 43 50 51 52 53
+    srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+    src += src_stride;
+    // 60 61 62 63
+    srcs[6] = Load4(src);
+    src += src_stride;
+    // 50 51 52 53 60 61 62 63
+    srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+    int y = height;
+    do {
+      // 70 71 72 73
+      const __m128i d = Load4(src);
+      // 60 61 62 63 70 71 72 73
+      srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
+      src += src_stride;
+      // 80 81 82 83
+      srcs[8] = Load4(src);
+      src += src_stride;
+      // 70 71 72 73 80 81 82 83
+      srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
+
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      srcs[5] = srcs[7];
+      srcs[6] = srcs[8];
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+template <int num_taps, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int height, const __m128i* const v_tap) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  __m128i srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = _mm_setzero_si128();
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+
+    int y = height;
+    do {
+      // 00 01 10 11
+      srcs[0] = Load2<1>(src, srcs[0]);
+      src += src_stride;
+      // 00 01 10 11 20 21
+      srcs[0] = Load2<2>(src, srcs[0]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      // 40 41
+      srcs[2] = Load2<0>(src, srcs[2]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31 40 41
+      const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
+      // 10 11 20 21 30 31 40 41
+      srcs[1] = _mm_srli_si128(srcs_0_2, 2);
+      // This uses srcs[0]..srcs[1].
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[2];
+      y -= 4;
+    } while (y != 0);
+  } else if (num_taps == 4) {
+    srcs[4] = _mm_setzero_si128();
+
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+
+    int y = height;
+    do {
+      // 00 01 10 11 20 21 30 31
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      // 40 41
+      srcs[4] = Load2<0>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51 60 61
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+      // 10 11 20 21 30 31 40 41
+      srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+      // 20 21 30 31 40 41 50 51
+      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+      // 30 31 40 41 50 51 60 61
+      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+      // This uses srcs[0]..srcs[3].
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      y -= 4;
+    } while (y != 0);
+  } else if (num_taps == 6) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = _mm_setzero_si128();
+
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21 30 31
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    // 40 41
+    srcs[4] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11 20 21 30 31 40 41
+    const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+    // 10 11 20 21 30 31 40 41
+    srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
+
+    int y = height;
+    do {
+      // 40 41 50 51
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51 60 61
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51 60 61 70 71
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      // 80 81
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+      // 20 21 30 31 40 41 50 51
+      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+      // 30 31 40 41 50 51 60 61
+      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
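+      // 40 41 50 51 60 61 70 71 80 81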
+      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+      // 50 51 60 61 70 71 80 81
+      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+
+      // This uses srcs[0]..srcs[5].
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+      y -= 4;
+    } while (y != 0);
+  } else if (num_taps == 8) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = _mm_setzero_si128();
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21 30 31
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    // 40 41
+    srcs[4] = Load2(src);
+    src += src_stride;
+    // 40 41 50 51
+    srcs[4] = Load2<1>(src, srcs[4]);
+    src += src_stride;
+    // 40 41 50 51 60 61
+    srcs[4] = Load2<2>(src, srcs[4]);
+    src += src_stride;
+
+    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+    const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+    // 10 11 20 21 30 31 40 41
+    srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+    // 20 21 30 31 40 41 50 51
+    srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+    // 30 31 40 41 50 51 60 61
+    srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+    int y = height;
+    do {
+      // 40 41 50 51 60 61 70 71
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      // 80 81
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      // 80 81 90 91
+      srcs[8] = Load2<1>(src, srcs[8]);
+      src += src_stride;
+      // 80 81 90 91 a0 a1
+      srcs[8] = Load2<2>(src, srcs[8]);
+      src += src_stride;
+
+      // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
+      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+      // 50 51 60 61 70 71 80 81
+      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+      // 60 61 70 71 80 81 90 91
+      srcs[6] = _mm_srli_si128(srcs_4_8, 4);
+      // 70 71 80 81 90 91 a0 a1
+      srcs[7] = _mm_srli_si128(srcs_4_8, 6);
+
+      // This uses srcs[0]..srcs[7].
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+      y -= 4;
+    } while (y != 0);
+  }
+}
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
new file mode 100644 (file)
index 0000000..8c32117
--- /dev/null
@@ -0,0 +1,451 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
+constexpr int kInterPostRhsAdjust = 1 << (16 - kInterPostRoundBit - 1);
+
+inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
+                                       const __m128i& pred1,
+                                       const __m128i& weight) {
+  // Given: p0,p1 in range [-5132,9212] and w0 + w1 = 16 (so w1 = 16 - w0).
+  // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >>
+  //    8(=kInterPostRoundBit + 4)
+  // The formula is manipulated to avoid lengthening to 32 bits.
+  // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1
+  // = (p0 - p1) * w0 + 16 * p1
+  // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808.
+  const __m128i diff = _mm_slli_epi16(_mm_sub_epi16(pred0, pred1), 1);
+  // (((p0 - p1) << 1) * (w0 << 11)) >> 16 == (p0 - p1) * w0 >> 4
+  const __m128i weighted_diff = _mm_mulhi_epi16(diff, weight);
+  // ((p0 - p1) * w0 >> 4) + p1
+  const __m128i upscaled_average = _mm_add_epi16(weighted_diff, pred1);
+  // (x << 11) >> 15 == x >> 4
+  const __m128i right_shift_prep = _mm_set1_epi16(kInterPostRhsAdjust);
+  // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4
+  return _mm_mulhrs_epi16(upscaled_average, right_shift_prep);
+}
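+// For example, with w0 = 10 (so w1 = 6), p0 = 512 and p1 = 256, the reference
+// result is (512 * 10 + 256 * 6 + 128) >> 8 = 26. Here |diff| =
+// (512 - 256) << 1 = 512 and |weight| = 10 << 11 = 20480, so _mm_mulhi_epi16
+// produces (512 * 20480) >> 16 = 160 = (p0 - p1) * w0 >> 4; adding p1 gives
+// 416, and _mm_mulhrs_epi16(416, 1 << 11) rounds (416 << 11) >> 15 to 26,
+// matching the reference.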
+
+template <int height>
+inline void DistanceWeightedBlend4xH_SSE4_1(
+    const int16_t* LIBGAV1_RESTRICT pred_0,
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  // Upscale the weight for mulhi.
+  const __m128i weights = _mm_set1_epi16(weight << 11);
+
+  for (int y = 0; y < height; y += 4) {
+    const __m128i src_00 = LoadAligned16(pred_0);
+    const __m128i src_10 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+    const __m128i src_01 = LoadAligned16(pred_0);
+    const __m128i src_11 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+
+    const __m128i result_pixels = _mm_packus_epi16(res0, res1);
+    Store4(dst, result_pixels);
+    dst += dest_stride;
+    const int result_1 = _mm_extract_epi32(result_pixels, 1);
+    memcpy(dst, &result_1, sizeof(result_1));
+    dst += dest_stride;
+    const int result_2 = _mm_extract_epi32(result_pixels, 2);
+    memcpy(dst, &result_2, sizeof(result_2));
+    dst += dest_stride;
+    const int result_3 = _mm_extract_epi32(result_pixels, 3);
+    memcpy(dst, &result_3, sizeof(result_3));
+    dst += dest_stride;
+  }
+}
+
+template <int height>
+inline void DistanceWeightedBlend8xH_SSE4_1(
+    const int16_t* LIBGAV1_RESTRICT pred_0,
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  // Upscale the weight for mulhi.
+  const __m128i weights = _mm_set1_epi16(weight << 11);
+
+  for (int y = 0; y < height; y += 2) {
+    const __m128i src_00 = LoadAligned16(pred_0);
+    const __m128i src_10 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+    const __m128i src_01 = LoadAligned16(pred_0);
+    const __m128i src_11 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+
+    const __m128i result_pixels = _mm_packus_epi16(res0, res1);
+    StoreLo8(dst, result_pixels);
+    dst += dest_stride;
+    StoreHi8(dst, result_pixels);
+    dst += dest_stride;
+  }
+}
+
+inline void DistanceWeightedBlendLarge_SSE4_1(
+    const int16_t* LIBGAV1_RESTRICT pred_0,
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+    const int width, const int height, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  // Upscale the weight for mulhi.
+  const __m128i weights = _mm_set1_epi16(weight << 11);
+
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const __m128i src_0_lo = LoadAligned16(pred_0 + x);
+      const __m128i src_1_lo = LoadAligned16(pred_1 + x);
+      const __m128i res_lo =
+          ComputeWeightedAverage8(src_0_lo, src_1_lo, weights);
+
+      const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8);
+      const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8);
+      const __m128i res_hi =
+          ComputeWeightedAverage8(src_0_hi, src_1_hi, weights);
+
+      StoreUnaligned16(dst + x, _mm_packus_epi16(res_lo, res_hi));
+      x += 16;
+    } while (x < width);
+    dst += dest_stride;
+    pred_0 += width;
+    pred_1 += width;
+  } while (--y != 0);
+}
+
+void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  const uint8_t weight_0,
+                                  const uint8_t /*weight_1*/, const int width,
+                                  const int height,
+                                  void* LIBGAV1_RESTRICT const dest,
+                                  const ptrdiff_t dest_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  const uint8_t weight = weight_0;
+  if (width == 4) {
+    if (height == 4) {
+      DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+                                         dest_stride);
+    } else if (height == 8) {
+      DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+                                         dest_stride);
+    } else {
+      assert(height == 16);
+      DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+                                          dest_stride);
+    }
+    return;
+  }
+
+  if (width == 8) {
+    switch (height) {
+      case 4:
+        DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+                                           dest_stride);
+        return;
+      case 8:
+        DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+                                           dest_stride);
+        return;
+      case 16:
+        DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+                                            dest_stride);
+        return;
+      default:
+        assert(height == 32);
+        DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight, dest,
+                                            dest_stride);
+        return;
+    }
+  }
+
+  DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight, width, height, dest,
+                                    dest_stride);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(DistanceWeightedBlend)
+  dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kMax10bppSample = (1 << 10) - 1;
+constexpr int kInterPostRoundBit = 4;
+
+inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
+                                       const __m128i& pred1,
+                                       const __m128i& weight0,
+                                       const __m128i& weight1) {
+  // This offset is a combination of round_factor and round_offset
+  // which are to be added and subtracted respectively.
+  // Here kInterPostRoundBit + 4 is considering bitdepth=10.
+  constexpr int offset =
+      (1 << ((kInterPostRoundBit + 4) - 1)) - (kCompoundOffset << 4);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bias = _mm_set1_epi32(offset);
+  const __m128i clip_high = _mm_set1_epi16(kMax10bppSample);
+
+  __m128i prediction0 = _mm_cvtepu16_epi32(pred0);
+  __m128i mult0 = _mm_mullo_epi32(prediction0, weight0);
+  __m128i prediction1 = _mm_cvtepu16_epi32(pred1);
+  __m128i mult1 = _mm_mullo_epi32(prediction1, weight1);
+  __m128i sum = _mm_add_epi32(mult0, mult1);
+  sum = _mm_add_epi32(sum, bias);
+  const __m128i result0 = _mm_srai_epi32(sum, kInterPostRoundBit + 4);
+
+  prediction0 = _mm_unpackhi_epi16(pred0, zero);
+  mult0 = _mm_mullo_epi32(prediction0, weight0);
+  prediction1 = _mm_unpackhi_epi16(pred1, zero);
+  mult1 = _mm_mullo_epi32(prediction1, weight1);
+  sum = _mm_add_epi32(mult0, mult1);
+  sum = _mm_add_epi32(sum, bias);
+  const __m128i result1 = _mm_srai_epi32(sum, kInterPostRoundBit + 4);
+  const __m128i pack = _mm_packus_epi32(result0, result1);
+
+  return _mm_min_epi16(pack, clip_high);
+}
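+// Unlike the 8bpp path, the 10bpp blend widens to 32 bits so both weights can
+// be applied directly; the combined rounding/offset |bias| is added, the sums
+// are shifted by kInterPostRoundBit + 4, and the results are clamped to the
+// 10-bit maximum.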
+
+template <int height>
+inline void DistanceWeightedBlend4xH_SSE4_1(
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const __m128i weight0 = _mm_set1_epi32(weight_0);
+  const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+  int y = height;
+  do {
+    const __m128i src_00 = LoadAligned16(pred_0);
+    const __m128i src_10 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res0 =
+        ComputeWeightedAverage8(src_00, src_10, weight0, weight1);
+
+    const __m128i src_01 = LoadAligned16(pred_0);
+    const __m128i src_11 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res1 =
+        ComputeWeightedAverage8(src_01, src_11, weight0, weight1);
+
+    StoreLo8(dst, res0);
+    dst += dest_stride;
+    StoreHi8(dst, res0);
+    dst += dest_stride;
+    StoreLo8(dst, res1);
+    dst += dest_stride;
+    StoreHi8(dst, res1);
+    dst += dest_stride;
+    y -= 4;
+  } while (y != 0);
+}
+
+template <int height>
+inline void DistanceWeightedBlend8xH_SSE4_1(
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const __m128i weight0 = _mm_set1_epi32(weight_0);
+  const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+  int y = height;
+  do {
+    const __m128i src_00 = LoadAligned16(pred_0);
+    const __m128i src_10 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res0 =
+        ComputeWeightedAverage8(src_00, src_10, weight0, weight1);
+
+    const __m128i src_01 = LoadAligned16(pred_0);
+    const __m128i src_11 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res1 =
+        ComputeWeightedAverage8(src_01, src_11, weight0, weight1);
+
+    StoreUnaligned16(dst, res0);
+    dst += dest_stride;
+    StoreUnaligned16(dst, res1);
+    dst += dest_stride;
+    y -= 2;
+  } while (y != 0);
+}
+
+inline void DistanceWeightedBlendLarge_SSE4_1(
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, const int width, const int height,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const __m128i weight0 = _mm_set1_epi32(weight_0);
+  const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const __m128i src_0_lo = LoadAligned16(pred_0 + x);
+      const __m128i src_1_lo = LoadAligned16(pred_1 + x);
+      const __m128i res_lo =
+          ComputeWeightedAverage8(src_0_lo, src_1_lo, weight0, weight1);
+
+      const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8);
+      const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8);
+      const __m128i res_hi =
+          ComputeWeightedAverage8(src_0_hi, src_1_hi, weight0, weight1);
+
+      StoreUnaligned16(dst + x, res_lo);
+      x += 8;
+      StoreUnaligned16(dst + x, res_hi);
+      x += 8;
+    } while (x < width);
+    dst += dest_stride;
+    pred_0 += width;
+    pred_1 += width;
+  } while (--y != 0);
+}
+
+void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  const uint8_t weight_0,
+                                  const uint8_t weight_1, const int width,
+                                  const int height,
+                                  void* LIBGAV1_RESTRICT const dest,
+                                  const ptrdiff_t dest_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(*pred_0);
+  if (width == 4) {
+    if (height == 4) {
+      DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+                                         dest, dst_stride);
+    } else if (height == 8) {
+      DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+                                         dest, dst_stride);
+    } else {
+      assert(height == 16);
+      DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+                                          dest, dst_stride);
+    }
+    return;
+  }
+
+  if (width == 8) {
+    switch (height) {
+      case 4:
+        DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+                                           dest, dst_stride);
+        return;
+      case 8:
+        DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+                                           dest, dst_stride);
+        return;
+      case 16:
+        DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+                                            dest, dst_stride);
+        return;
+      default:
+        assert(height == 32);
+        DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
+                                            dest, dst_stride);
+        return;
+    }
+  }
+
+  DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
+                                    height, dest, dst_stride);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(DistanceWeightedBlend)
+  dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void DistanceWeightedBlendInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.h b/src/dsp/x86/distance_weighted_blend_sse4.h
new file mode 100644 (file)
index 0000000..dbb9f88
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
+#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc
new file mode 100644 (file)
index 0000000..59d18a6
--- /dev/null
@@ -0,0 +1,494 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// Load 8 values from source, widening to int16_t intermediate value size.
+// The function is overloaded for each type and bitdepth for simplicity.
+inline __m128i LoadSource(const int8_t* src) {
+  return _mm_cvtepi8_epi16(LoadLo8(src));
+}
+
+// Load 8 values from source, widening to int16_t intermediate value size.
+inline __m128i LoadSource(const uint8_t* src) {
+  return _mm_cvtepu8_epi16(LoadLo8(src));
+}
+
+inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) {
+  return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range));
+}
+
+// Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value.
+inline void StoreUnsigned(uint8_t* dest, const __m128i data) {
+  StoreLo8(dest, _mm_packus_epi16(data, data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Load 8 values from source.
+inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); }
+
+// Load 8 values from source.
+inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); }
+
+// Store 8 values to dest.
+inline void StoreUnsigned(uint16_t* dest, const __m128i data) {
+  StoreUnaligned16(dest, data);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+  if (subsampling_x != 0) {
+    const __m128i src = LoadUnaligned16(luma);
+
+    return RightShiftWithRounding_U16(
+        _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
+                       _mm_unpackhi_epi8(src, _mm_setzero_si128())),
+        1);
+  }
+  return _mm_cvtepu8_epi16(LoadLo8(luma));
+}
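+// When |subsampling_x| is set, each chroma position covers two luma columns;
+// the hadd/shift pair above computes (luma[2 * i] + luma[2 * i + 1] + 1) >> 1
+// for 8 positions per call.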
+
+inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x,
+                                  int valid_range) {
+  if (subsampling_x != 0) {
+    const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range);
+
+    return RightShiftWithRounding_U16(
+        _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
+                       _mm_unpackhi_epi8(src, _mm_setzero_si128())),
+        1);
+  }
+  return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) {
+  if (subsampling_x != 0) {
+    return RightShiftWithRounding_U16(
+        _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1);
+  }
+  return LoadUnaligned16(luma);
+}
+
+inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x,
+                                  int valid_range) {
+  if (subsampling_x != 0) {
+    return RightShiftWithRounding_U16(
+        _mm_hadd_epi16(
+            LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)),
+            LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))),
+        1);
+  }
+  return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma));
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline __m128i Clip3(const __m128i value, const __m128i low,
+                     const __m128i high) {
+  const __m128i clipped_to_ceiling = _mm_min_epi16(high, value);
+  return _mm_max_epi16(low, clipped_to_ceiling);
+}
+
+template <int bitdepth, typename Pixel>
+inline __m128i GetScalingFactors(const int16_t* scaling_lut,
+                                 const Pixel* source) {
+  alignas(16) int16_t start_vals[8];
+  static_assert(bitdepth <= kBitdepth10,
+                "SSE4 Film Grain is not yet implemented for 12bpp.");
+  for (int i = 0; i < 8; ++i) {
+    assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
+    start_vals[i] = scaling_lut[source[i]];
+  }
+  return LoadAligned16(start_vals);
+}
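+// SSE4.1 has no gather instruction, so the eight lookups above are done with
+// scalar loads into an aligned buffer that is then reloaded as one vector.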
+
+// |scaling_shift| is in range [8,11].
+template <int bitdepth>
+inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
+                          const __m128i scaling_shift) {
+  const __m128i shifted_scale_factors = _mm_sll_epi16(scaling, scaling_shift);
+  return _mm_mulhrs_epi16(noise, shifted_scale_factors);
+}
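+// _mm_mulhrs_epi16 computes (x * y + (1 << 14)) >> 15 per lane. Callers pass
+// |scaling_shift| as 15 - shift, so pre-shifting the scaling factors left by
+// that amount turns the fixed >> 15 into
+// RightShiftWithRounding(noise * scaling, shift).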
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_SSE4_1(
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+    int scaling_shift, int width, int height, int start_height,
+    const int16_t* scaling_lut_y, const void* source_plane_y,
+    ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+  dest_stride_y /= sizeof(Pixel);
+  const __m128i floor = _mm_set1_epi16(min_value);
+  const __m128i ceiling = _mm_set1_epi16(max_luma);
+  const int safe_width = width & ~7;
+  const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x + 8 <= safe_width; x += 8) {
+      const __m128i orig = LoadSource(&in_y_row[x]);
+      const __m128i scaling =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+      __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+      noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+      const __m128i combined = _mm_add_epi16(orig, noise);
+      StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+    }
+
+    if (x < width) {
+      Pixel luma_buffer[8];
+      // Prevent arbitrary indices from entering GetScalingFactors.
+      memset(luma_buffer, 0, sizeof(luma_buffer));
+      const int valid_range = width - x;
+      assert(valid_range < 8);
+      memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
+      luma_buffer[valid_range] = in_y_row[width - 1];
+      const __m128i orig = LoadSource(&in_y_row[x]);
+      const __m128i scaling =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer);
+      __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+      noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+      const __m128i combined = _mm_add_epi16(orig, noise);
+      StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+    }
+    in_y_row += source_stride_y;
+    out_y_row += dest_stride_y;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+inline __m128i BlendChromaValsWithCfl(
+    const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
+    const int16_t* scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor,
+    const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
+    const __m128i scaling_shift) {
+  const __m128i scaling =
+      GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+  const __m128i orig = LoadSource(chroma_cursor);
+  __m128i noise = LoadSource(noise_image_cursor);
+  noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift);
+  return _mm_add_epi16(orig, noise);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
+    const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, int scaling_shift, const int16_t* scaling_lut,
+    const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+    const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
+    Pixel* out_chroma_row, ptrdiff_t dest_stride) {
+  const __m128i floor = _mm_set1_epi16(min_value);
+  const __m128i ceiling = _mm_set1_epi16(max_chroma);
+  alignas(16) Pixel luma_buffer[16];
+
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  // |chroma_width| is rounded up. If |width| is odd, then the final pixel will
+  // need to be guarded from overread, even if |chroma_width| is divisible by 8.
+  const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
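+  // For example, |width| == 15 with |subsampling_x| == 1 gives |chroma_width|
+  // == 8; the final average would pair luma column 14 with the nonexistent
+  // column 15, so |safe_chroma_width| becomes 0 and the whole row takes the
+  // edge path below.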
+
+  // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
+  // in GetScalingFactors.
+  Pixel average_luma_buffer[8];
+  assert(start_height % 2 == 0);
+  start_height >>= subsampling_y;
+  const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x + 8 <= safe_chroma_width; x += 8) {
+      const int luma_x = x << subsampling_x;
+      const __m128i average_luma =
+          GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+      StoreUnsigned(average_luma_buffer, average_luma);
+
+      const __m128i blended =
+          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+              average_luma_buffer, scaling_lut, &in_chroma_row[x],
+              &(noise_image[y + start_height][x]), derived_scaling_shift);
+      StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+    }
+
+    if (x < chroma_width) {
+      // Prevent huge indices from entering GetScalingFactors due to
+      // uninitialized values. This is not a problem in 8bpp because the table
+      // is made larger than 255 values.
+      if (bitdepth > kBitdepth8) {
+        memset(luma_buffer, 0, sizeof(luma_buffer));
+      }
+      const int luma_x = x << subsampling_x;
+      const int valid_range = width - luma_x;
+      assert(valid_range < 16);
+      memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+      luma_buffer[valid_range] = in_y_row[width - 1];
+      const __m128i average_luma =
+          GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
+      StoreUnsigned(average_luma_buffer, average_luma);
+
+      const __m128i blended =
+          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+              average_luma_buffer, scaling_lut, &in_chroma_row[x],
+              &(noise_image[y + start_height][x]), derived_scaling_shift);
+      StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+    }
+
+    in_y_row += source_stride_y << subsampling_y;
+    in_chroma_row += source_stride_chroma;
+    out_chroma_row += dest_stride;
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_SSE4_1(
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const int16_t* scaling_lut,
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+
+  const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+  source_stride_uv /= sizeof(Pixel);
+  auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+  dest_stride_uv /= sizeof(Pixel);
+  BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>(
+      noise_image[plane], min_value, max_chroma, width, height, start_height,
+      subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
+      source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+// |offset| is packed into 16-bit lanes and is added after the madd result has
+// been shifted and narrowed back to 16 bits.
+inline __m128i BlendChromaValsNoCfl8bpp(
+    const int16_t* scaling_lut, const __m128i& orig,
+    const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
+    const __m128i& average_luma, const __m128i& scaling_shift,
+    const __m128i& offset, const __m128i& weights) {
+  uint8_t merged_buffer[8];
+  const __m128i combined_lo =
+      _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
+  const __m128i combined_hi =
+      _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights);
+  const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32(combined_lo, 6),
+                                              _mm_srai_epi32(combined_hi, 6));
+
+  const __m128i merged = _mm_add_epi16(merged_base, offset);
+
+  StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged));
+  const __m128i scaling =
+      GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
+  __m128i noise = LoadSource(noise_image_cursor);
+  noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift);
+  return _mm_add_epi16(orig, noise);
+}
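+// |weights| carries luma_multiplier in the low half and chroma_multiplier in
+// the high half of each 32-bit lane, so the unpack + _mm_madd_epi16 pairs
+// compute luma * luma_multiplier + chroma * chroma_multiplier per pixel; the
+// >> 6, |offset| add, and saturating pack form the [0, 255] index used to
+// look up each scaling factor.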
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
+    const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, int scaling_shift, int chroma_offset,
+    int chroma_multiplier, int luma_multiplier, const int16_t* scaling_lut,
+    const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+    const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
+    uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
+  const __m128i floor = _mm_set1_epi16(min_value);
+  const __m128i ceiling = _mm_set1_epi16(max_chroma);
+
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel
+  // will need to be guarded from overread, even if |chroma_width| is a
+  // multiple of 8.
+  const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
+  alignas(16) uint8_t luma_buffer[16];
+  const __m128i offset = _mm_set1_epi16(chroma_offset);
+  const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) |
+                                             (luma_multiplier & 0xFFFF));
+  const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x + 8 <= safe_chroma_width; x += 8) {
+      const int luma_x = x << subsampling_x;
+      const __m128i average_luma =
+          GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+      const __m128i orig_chroma = LoadSource(&in_chroma_row[x]);
+      const __m128i blended = BlendChromaValsNoCfl8bpp(
+          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+          average_luma, derived_scaling_shift, offset, multipliers);
+      StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+    }
+
+    if (x < chroma_width) {
+      // Begin right edge iteration. Same as the normal iterations, but the
+      // |average_luma| computation requires a duplicated luma value at the
+      // end.
+      const int luma_x = x << subsampling_x;
+      const int valid_range = width - luma_x;
+      assert(valid_range < 16);
+      // There is no need to pre-initialize this buffer, because merged values
+      // used as indices are saturated in the 8bpp case, and lanes derived
+      // from uninitialized bytes are only written outside the visible frame.
+      memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+      luma_buffer[valid_range] = in_y_row[width - 1];
+      const int valid_range_chroma = chroma_width - x;
+      uint8_t chroma_buffer[8];
+      memcpy(chroma_buffer, &in_chroma_row[x],
+             valid_range_chroma * sizeof(in_chroma_row[0]));
+
+      const __m128i average_luma =
+          GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
+      const __m128i orig_chroma =
+          LoadSourceMsan(chroma_buffer, valid_range_chroma);
+      const __m128i blended = BlendChromaValsNoCfl8bpp(
+          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+          average_luma, derived_scaling_shift, offset, multipliers);
+      StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+      // End of right edge iteration.
+    }
+
+    in_y_row += source_stride_y << subsampling_y;
+    in_chroma_row += source_stride_chroma;
+    out_chroma_row += dest_stride;
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma8bpp_SSE4_1(
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const int16_t* scaling_lut,
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  assert(plane == kPlaneU || plane == kPlaneV);
+  const auto* noise_image =
+      static_cast<const Array2D<int8_t>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
+  const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
+  auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
+
+  const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+  const int luma_multiplier =
+      (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+  const int multiplier =
+      (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+  BlendChromaPlane8bpp_SSE4_1(
+      noise_image[plane], min_value, max_chroma, width, height, start_height,
+      subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
+      luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
+      source_stride_uv, out_uv, dest_stride_uv);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_SSE4_1<kBitdepth8, int8_t, uint8_t>;
+  dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth8, int8_t, uint8_t>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_SSE4_1<kBitdepth10, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth10, int16_t, uint16_t>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace film_grain
+
+void FilmGrainInit_SSE4_1() {
+  film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  film_grain::high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/film_grain_sse4.h b/src/dsp/x86/film_grain_sse4.h
new file mode 100644 (file)
index 0000000..1cacbac
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initialize members of Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
diff --git a/src/dsp/x86/intra_edge_sse4.cc b/src/dsp/x86/intra_edge_sse4.cc
new file mode 100644 (file)
index 0000000..967be06
--- /dev/null
@@ -0,0 +1,273 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
+constexpr int kKernels[3][kKernelTaps] = {
+    {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxEdgeBufferSize = 129;
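+// Note that each kernel's taps sum to 16, so filtered outputs are normalized
+// with a rounded right shift by 4.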
+
+// This function applies the kernel [0, 4, 8, 4, 0] to 12 values.
+// Assumes |edge| has 16 packed byte values. Produces 12 filter outputs to
+// write as overlapping sets of 8-bytes.
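+// In scalar terms, each of the 12 outputs is
+//   dest[i] = (4 * source[i] + 8 * source[i + 1] + 4 * source[i + 2] + 8) >> 4.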
+inline void ComputeKernel1Store12(uint8_t* LIBGAV1_RESTRICT dest,
+                                  const uint8_t* LIBGAV1_RESTRICT source) {
+  const __m128i edge_lo = LoadUnaligned16(source);
+  const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+  // Samples matched with the '4' tap, expanded to 16-bit.
+  const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+  const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+  // Samples matched with the '8' tap, expanded to 16-bit.
+  const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+  const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+
+  // Apply the taps by shifting.
+  const __m128i outers4_lo = _mm_slli_epi16(outers_lo, 2);
+  const __m128i outers4_hi = _mm_slli_epi16(outers_hi, 2);
+  const __m128i centers8_lo = _mm_slli_epi16(centers_lo, 3);
+  const __m128i centers8_hi = _mm_slli_epi16(centers_hi, 3);
+  // Move latter 4x values down to add with first 4x values for each output.
+  const __m128i partial_sums_lo =
+      _mm_add_epi16(outers4_lo, _mm_srli_si128(outers4_lo, 4));
+  const __m128i partial_sums_hi =
+      _mm_add_epi16(outers4_hi, _mm_srli_si128(outers4_hi, 4));
+  // Move 6x values down to add for the final kernel sum for each output.
+  const __m128i sums_lo = RightShiftWithRounding_U16(
+      _mm_add_epi16(partial_sums_lo, centers8_lo), 4);
+  const __m128i sums_hi = RightShiftWithRounding_U16(
+      _mm_add_epi16(partial_sums_hi, centers8_hi), 4);
+
+  const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
+  const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
+  const __m128i result =
+      _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
+  StoreUnaligned16(dest, result);
+}
+
+// This function applies the kernel [0, 5, 6, 5, 0] to 12 values.
+// Assumes |edge| has 16 packed byte values. Produces 12 filter outputs to
+// write as overlapping sets of 8-bytes.
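+// In scalar terms, each of the 12 outputs is
+//   dest[i] = (5 * source[i] + 6 * source[i + 1] + 5 * source[i + 2] + 8) >> 4.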
+inline void ComputeKernel2Store12(uint8_t* LIBGAV1_RESTRICT dest,
+                                  const uint8_t* LIBGAV1_RESTRICT source) {
+  const __m128i edge_lo = LoadUnaligned16(source);
+  const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+  const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+  const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+  const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+  const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+  // Samples matched with the '5' tap, expanded to 16-bit. Add x + 4x.
+  const __m128i outers5_lo =
+      _mm_add_epi16(outers_lo, _mm_slli_epi16(outers_lo, 2));
+  const __m128i outers5_hi =
+      _mm_add_epi16(outers_hi, _mm_slli_epi16(outers_hi, 2));
+  // Samples matched with the '6' tap, expanded to 16-bit. Add 2x + 4x.
+  const __m128i centers6_lo = _mm_add_epi16(_mm_slli_epi16(centers_lo, 1),
+                                            _mm_slli_epi16(centers_lo, 2));
+  const __m128i centers6_hi = _mm_add_epi16(_mm_slli_epi16(centers_hi, 1),
+                                            _mm_slli_epi16(centers_hi, 2));
+  // Move latter 5x values down to add with first 5x values for each output.
+  const __m128i partial_sums_lo =
+      _mm_add_epi16(outers5_lo, _mm_srli_si128(outers5_lo, 4));
+  // Move 6x values down to add for the final kernel sum for each output.
+  const __m128i sums_lo = RightShiftWithRounding_U16(
+      _mm_add_epi16(centers6_lo, partial_sums_lo), 4);
+  // Move latter 5x values down to add with first 5x values for each output.
+  const __m128i partial_sums_hi =
+      _mm_add_epi16(outers5_hi, _mm_srli_si128(outers5_hi, 4));
+  // Move 6x values down to add for the final kernel sum for each output.
+  const __m128i sums_hi = RightShiftWithRounding_U16(
+      _mm_add_epi16(centers6_hi, partial_sums_hi), 4);
+  // First 6 values are valid outputs.
+  const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
+  const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
+  const __m128i result =
+      _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
+  StoreUnaligned16(dest, result);
+}
+
+// This function applies the kernel [2, 4, 4, 4, 2] to 8 values.
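+// In scalar terms, each of the 8 outputs is
+//   dest[i] = (2 * source[i] + 4 * source[i + 1] + 4 * source[i + 2] +
+//              4 * source[i + 3] + 2 * source[i + 4] + 8) >> 4.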
+inline void ComputeKernel3Store8(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const uint8_t* LIBGAV1_RESTRICT source) {
+  const __m128i edge_lo = LoadUnaligned16(source);
+  const __m128i edge_hi = _mm_srli_si128(edge_lo, 4);
+  // Finish |edge_lo| life cycle quickly.
+  // Multiply for 2x.
+  const __m128i source2_lo = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_lo), 1);
+  // Multiply 2x by 2 and align.
+  const __m128i source4_lo = _mm_srli_si128(_mm_slli_epi16(source2_lo, 1), 2);
+  // Finish |source2| life cycle quickly.
+  // Move latter 2x values down to add with first 2x values for each output.
+  __m128i sum = _mm_add_epi16(source2_lo, _mm_srli_si128(source2_lo, 8));
+  // First 4x values already aligned to add with running total.
+  sum = _mm_add_epi16(sum, source4_lo);
+  // Move second 4x values down to add with running total.
+  sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 2));
+  // Move third 4x values down to add with running total.
+  sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 4));
+  // Multiply for 2x.
+  const __m128i source2_hi = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_hi), 1);
+  // Multiply 2x by 2 and align.
+  const __m128i source4_hi = _mm_srli_si128(_mm_slli_epi16(source2_hi, 1), 2);
+  // Move latter 2x values down to add with first 2x values for each output.
+  __m128i sum_hi = _mm_add_epi16(source2_hi, _mm_srli_si128(source2_hi, 8));
+  // First 4x values already aligned to add with running total.
+  sum_hi = _mm_add_epi16(sum_hi, source4_hi);
+  // Move second 4x values down to add with running total.
+  sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 2));
+  // Move third 4x values down to add with running total.
+  sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 4));
+
+  // Because we have only 8 values here, it is safe to align before packing down
+  // to 8-bit without losing data.
+  sum = _mm_alignr_epi8(sum_hi, _mm_slli_si128(sum, 8), 8);
+  sum = RightShiftWithRounding_U16(sum, 4);
+  StoreLo8(dest, _mm_packus_epi16(sum, sum));
+}
+
+void IntraEdgeFilter_SSE4_1(void* buffer, int size, int strength) {
+  uint8_t edge[kMaxEdgeBufferSize + 4];
+  memcpy(edge, buffer, size);
+  auto* dst_buffer = static_cast<uint8_t*>(buffer);
+
+  // The filter leaves element 0 unchanged and only produces |size| - 1
+  // outputs, so there is nothing to do when |size| == 1.
+  if (size == 1) return;
+
+  int i = 0;
+  switch (strength) {
+    case 1:
+      // To avoid overwriting, we stop short of the total write size plus the
+      // initial offset. In this case 12 valid values are written in two blocks
+      // of 8 bytes each.
+      for (; i < size - 17; i += 12) {
+        ComputeKernel1Store12(dst_buffer + i + 1, edge + i);
+      }
+      break;
+    case 2:
+      // See the comment for case 1.
+      for (; i < size - 17; i += 12) {
+        ComputeKernel2Store12(dst_buffer + i + 1, edge + i);
+      }
+      break;
+    default:
+      assert(strength == 3);
+      // The first filter input is repeated for taps of value 2 and 4.
+      dst_buffer[1] = RightShiftWithRounding(
+          (6 * edge[0] + 4 * edge[1] + 4 * edge[2] + 2 * edge[3]), 4);
+      // In this case, one block of 8 bytes is written in each iteration, with
+      // an offset of 2.
+      for (; i < size - 10; i += 8) {
+        ComputeKernel3Store8(dst_buffer + i + 2, edge + i);
+      }
+  }
+  const int kernel_index = strength - 1;
+  for (int final_index = Clip3(i, 1, size - 2); final_index < size;
+       ++final_index) {
+    int sum = 0;
+    for (int j = 0; j < kKernelTaps; ++j) {
+      const int k = Clip3(final_index + j - 2, 0, size - 1);
+      sum += kKernels[kernel_index][j] * edge[k];
+    }
+    dst_buffer[final_index] = RightShiftWithRounding(sum, 4);
+  }
+}
+
+constexpr int kMaxUpsampleSize = 16;
+
+// Applies the upsampling kernel [-1, 9, 9, -1] to alternating pixels, and
+// interleaves the results with the original values. This implementation assumes
+// that it is safe to write the maximum number of upsampled pixels (32) to the
+// edge buffer, even when |size| is small.
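+// In scalar terms, each new sample inserted between p[i] and p[i + 1] is
+//   (-p[i - 1] + 9 * p[i] + 9 * p[i + 1] - p[i + 2] + 8) >> 4,
+// clamped to [0, 255] by the packus instructions.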
+void IntraEdgeUpsampler_SSE4_1(void* buffer, int size) {
+  assert(size % 4 == 0 && size <= kMaxUpsampleSize);
+  auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
+  uint8_t temp[kMaxUpsampleSize + 8];
+  temp[0] = temp[1] = pixel_buffer[-1];
+  memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
+  temp[size + 2] = pixel_buffer[size - 1];
+
+  pixel_buffer[-2] = temp[0];
+  const __m128i data = LoadUnaligned16(temp);
+  const __m128i src_lo = _mm_cvtepu8_epi16(data);
+  const __m128i src_hi = _mm_unpackhi_epi8(data, _mm_setzero_si128());
+  const __m128i src9_hi = _mm_add_epi16(src_hi, _mm_slli_epi16(src_hi, 3));
+  const __m128i src9_lo = _mm_add_epi16(src_lo, _mm_slli_epi16(src_lo, 3));
+  __m128i sum_lo = _mm_sub_epi16(_mm_alignr_epi8(src9_hi, src9_lo, 2), src_lo);
+  sum_lo = _mm_add_epi16(sum_lo, _mm_alignr_epi8(src9_hi, src9_lo, 4));
+  sum_lo = _mm_sub_epi16(sum_lo, _mm_alignr_epi8(src_hi, src_lo, 6));
+  sum_lo = RightShiftWithRounding_S16(sum_lo, 4);
+  const __m128i result_lo = _mm_unpacklo_epi8(_mm_packus_epi16(sum_lo, sum_lo),
+                                              _mm_srli_si128(data, 2));
+  StoreUnaligned16(pixel_buffer - 1, result_lo);
+  if (size > 8) {
+    const __m128i src_hi_extra = _mm_cvtepu8_epi16(LoadLo8(temp + 16));
+    const __m128i src9_hi_extra =
+        _mm_add_epi16(src_hi_extra, _mm_slli_epi16(src_hi_extra, 3));
+    __m128i sum_hi =
+        _mm_sub_epi16(_mm_alignr_epi8(src9_hi_extra, src9_hi, 2), src_hi);
+    sum_hi = _mm_add_epi16(sum_hi, _mm_alignr_epi8(src9_hi_extra, src9_hi, 4));
+    sum_hi = _mm_sub_epi16(sum_hi, _mm_alignr_epi8(src_hi_extra, src_hi, 6));
+    sum_hi = RightShiftWithRounding_S16(sum_hi, 4);
+    const __m128i result_hi =
+        _mm_unpacklo_epi8(_mm_packus_epi16(sum_hi, sum_hi), LoadLo8(temp + 10));
+    StoreUnaligned16(pixel_buffer + 15, result_hi);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeFilter)
+  dsp->intra_edge_filter = IntraEdgeFilter_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeUpsampler)
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_SSE4_1;
+#endif
+}
+
+}  // namespace
+
+void IntraEdgeInit_SSE4_1() { Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraEdgeInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intra_edge_sse4.h b/src/dsp/x86/intra_edge_sse4.h
new file mode 100644 (file)
index 0000000..6ed4d40
--- /dev/null
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc
new file mode 100644 (file)
index 0000000..eb7e466
--- /dev/null
@@ -0,0 +1,1844 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// This duplicates the last two 16-bit values in |row|.
+inline __m128i LastRowSamples(const __m128i row) {
+  return _mm_shuffle_epi32(row, 0xFF);
+}
+
+// This duplicates the last 16-bit value in |row|.
+inline __m128i LastRowResult(const __m128i row) {
+  const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
+  return _mm_shuffle_epi32(dup_row, 0xFF);
+}
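+// For example, with 16-bit lanes |row| = [a b c d e f g h], LastRowSamples
+// returns [g h g h g h g h] and LastRowResult returns [h h h h h h h h].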
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
+                                     const __m128i vertical_sum1,
+                                     int16_t* luma_ptr) {
+  __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+  result = _mm_slli_epi16(result, 1);
+  StoreLo8(luma_ptr, result);
+  StoreHi8(luma_ptr + kCflLumaBufferStride, result);
+  return result;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
+                                     const __m128i vertical_sum1,
+                                     int16_t* luma_ptr) {
+  __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+  result = _mm_slli_epi16(result, 1);
+  StoreUnaligned16(luma_ptr, result);
+  return result;
+}
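+// In both helpers above each stored value is (p00 + p01 + p10 + p11) << 1,
+// i.e. 8x the average of one 2x2 luma block, which matches the q3 (x8)
+// precision used for the CfL luma buffer.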
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_SSE4_1
+
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+                                   __m128i alpha_sign, __m128i dc_q0) {
+  const __m128i ac_q3 = LoadUnaligned16(input);
+  const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+  __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+  scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+  return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
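+// A note on the fixed-point math above: _mm_mulhrs_epi16 computes
+// ((x * y) + (1 << 14)) >> 15 per lane, so with |alpha_q12| = |alpha| << 9 the
+// result is (|ac_q3| * |alpha| + (1 << 5)) >> 6, i.e. the q3 luma AC value
+// scaled by alpha / 64 with rounding.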
+
+template <int width, int height>
+void CflIntraPredictor_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i alpha_sign = _mm_set1_epi16(alpha);
+  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+  auto* row = reinterpret_cast<const __m128i*>(luma);
+  const int kCflLumaBufferStrideLog2_16i = 5;
+  const int kCflLumaBufferStrideLog2_128i = kCflLumaBufferStrideLog2_16i - 3;
+  const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
+  const __m128i dc_val = _mm_set1_epi16(dst[0]);
+  do {
+    __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+    if (width < 16) {
+      res = _mm_packus_epi16(res, res);
+      if (width == 4) {
+        Store4(dst, res);
+      } else {
+        StoreLo8(dst, res);
+      }
+    } else {
+      __m128i next =
+          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+      res = _mm_packus_epi16(res, next);
+      StoreUnaligned16(dst, res);
+      if (width == 32) {
+        res = CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+        next = CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+        res = _mm_packus_epi16(res, next);
+        StoreUnaligned16(dst + 16, res);
+      }
+    }
+    dst += stride;
+  } while ((row += (1 << kCflLumaBufferStrideLog2_128i)) < row_end);
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
+  static_assert(block_height_log2 <= 4, "");
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const auto* src = static_cast<const uint8_t*>(source);
+  __m128i sum = _mm_setzero_si128();
+  int16_t* luma_ptr = luma[0];
+  const __m128i zero = _mm_setzero_si128();
+  __m128i samples;
+  int y = 0;
+  do {
+    samples = Load4(src);
+    src += stride;
+    int src_bytes;
+    memcpy(&src_bytes, src, 4);
+    samples = _mm_insert_epi32(samples, src_bytes, 1);
+    src += stride;
+    samples = _mm_slli_epi16(_mm_cvtepu8_epi16(samples), 3);
+    StoreLo8(luma_ptr, samples);
+    luma_ptr += kCflLumaBufferStride;
+    StoreHi8(luma_ptr, samples);
+    luma_ptr += kCflLumaBufferStride;
+
+    // The maximum value here is 2**bd * H * 2**shift. Since the maximum H for
+    // 4XH is 16 = 2**4, we have 2**(8 + 4 + 3) = 2**15, which fits in 16 bits.
+    sum = _mm_add_epi16(sum, samples);
+    y += 2;
+  } while (y < visible_height);
+
+  if (!is_inside) {
+    // Replicate the last visible row (the high 8 bytes) into both halves of
+    // the vector.
+    samples = _mm_shuffle_epi32(samples, 0xee);
+    do {
+      StoreLo8(luma_ptr, samples);
+      luma_ptr += kCflLumaBufferStride;
+      StoreHi8(luma_ptr, samples);
+      luma_ptr += kCflLumaBufferStride;
+      sum = _mm_add_epi16(sum, samples);
+      y += 2;
+    } while (y < block_height);
+  }
+
+  __m128i sum_tmp = _mm_unpackhi_epi16(sum, zero);
+  sum = _mm_cvtepu16_epi32(sum);
+  sum = _mm_add_epi32(sum, sum_tmp);
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  __m128i averages = RightShiftWithRounding_U32(
+      sum, block_height_log2 + 2 /* log2 of width 4 */);
+  averages = _mm_shufflelo_epi16(averages, 0);
+  luma_ptr = luma[0];
+  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+    const __m128i samples = LoadLo8(luma_ptr);
+    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  static_assert(block_height_log2 <= 4, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  static_cast<void>(max_luma_width);
+  constexpr int block_height = 1 << block_height_log2;
+
+  if (block_height <= max_luma_height) {
+    CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+                                                         source, stride);
+  } else {
+    CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+                                                          source, stride);
+  }
+}
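+// The visible-region template split (used again by the wider subsamplers
+// below) compiles the boundary handling out of the hot loop: the fully
+// visible case never evaluates the replication branches.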
+
+template <int block_height_log2, bool inside>
+void CflSubsampler444_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  static_assert(block_height_log2 <= 5, "");
+  const int block_height = 1 << block_height_log2, block_width = 8;
+  const int visible_height = max_luma_height;
+  const int invisible_width = inside ? 0 : block_width - max_luma_width;
+  const int visible_width = max_luma_width;
+  const __m128i blend_mask =
+      inside ? _mm_setzero_si128() : MaskHighNBytes(8 + invisible_width);
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const auto* src = static_cast<const uint8_t*>(source);
+  int16_t* luma_ptr = luma[0];
+  const __m128i zero = _mm_setzero_si128();
+  // Since the maximum height is 32, splitting the rows by parity means each
+  // accumulator only sums 16 rows. Just like the 4xH calculation above, the
+  // sums can be kept in 16 bits without widening to 32 bits.
+  __m128i sum_even = _mm_setzero_si128(), sum_odd = _mm_setzero_si128();
+  __m128i sum;
+  __m128i samples1;
+
+  int y = 0;
+  do {
+    __m128i samples0 = LoadLo8(src);
+    if (!inside) {
+      const __m128i border0 =
+          _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
+      samples0 = _mm_blendv_epi8(samples0, border0, blend_mask);
+    }
+    src += stride;
+    samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples0), 3);
+    StoreUnaligned16(luma_ptr, samples0);
+    luma_ptr += kCflLumaBufferStride;
+
+    sum_even = _mm_add_epi16(sum_even, samples0);
+
+    samples1 = LoadLo8(src);
+    if (!inside) {
+      const __m128i border1 =
+          _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
+      samples1 = _mm_blendv_epi8(samples1, border1, blend_mask);
+    }
+    src += stride;
+    samples1 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples1), 3);
+    StoreUnaligned16(luma_ptr, samples1);
+    luma_ptr += kCflLumaBufferStride;
+
+    sum_odd = _mm_add_epi16(sum_odd, samples1);
+    y += 2;
+  } while (y < visible_height);
+
+  if (!inside) {
+    for (int y = visible_height; y < block_height; y += 2) {
+      sum_even = _mm_add_epi16(sum_even, samples1);
+      StoreUnaligned16(luma_ptr, samples1);
+      luma_ptr += kCflLumaBufferStride;
+
+      sum_odd = _mm_add_epi16(sum_odd, samples1);
+      StoreUnaligned16(luma_ptr, samples1);
+      luma_ptr += kCflLumaBufferStride;
+    }
+  }
+
+  sum = _mm_add_epi32(_mm_unpackhi_epi16(sum_even, zero),
+                      _mm_cvtepu16_epi32(sum_even));
+  sum = _mm_add_epi32(sum, _mm_unpackhi_epi16(sum_odd, zero));
+  sum = _mm_add_epi32(sum, _mm_cvtepu16_epi32(sum_odd));
+
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  __m128i averages = RightShiftWithRounding_U32(
+      sum, block_height_log2 + 3 /* log2 of width 8 */);
+  averages = _mm_shuffle_epi8(averages, dup16);
+  luma_ptr = luma[0];
+  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+    const __m128i samples = LoadUnaligned16(luma_ptr);
+    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  static_assert(block_height_log2 <= 5, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const int block_height = 1 << block_height_log2;
+  const int block_width = 8;
+
+  const int horz_inside = block_width <= max_luma_width;
+  const int vert_inside = block_height <= max_luma_height;
+  if (horz_inside && vert_inside) {
+    CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  } else {
+    CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  }
+}
+
+// This function will only work for block_width 16 and 32.
+template <int block_width_log2, int block_height_log2, bool inside>
+void CflSubsampler444_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
+  static_assert(block_height_log2 <= 5, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const int block_height = 1 << block_height_log2;
+  const int block_width = 1 << block_width_log2;
+
+  const int visible_height = max_luma_height;
+  const int visible_width_16 = inside ? 16 : std::min(16, max_luma_width);
+  const int invisible_width_16 = 16 - visible_width_16;
+  const __m128i blend_mask_16 = MaskHighNBytes(invisible_width_16);
+  const int visible_width_32 = inside ? 32 : max_luma_width;
+  const int invisible_width_32 = 32 - visible_width_32;
+  const __m128i blend_mask_32 =
+      MaskHighNBytes(std::min(16, invisible_width_32));
+
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const __m128i zero = _mm_setzero_si128();
+  const auto* src = static_cast<const uint8_t*>(source);
+  int16_t* luma_ptr = luma[0];
+  __m128i sum = _mm_setzero_si128();
+
+  __m128i samples0, samples1;
+  __m128i samples2, samples3;
+  __m128i inner_sum_lo, inner_sum_hi;
+  int y = 0;
+  do {
+    // We can load uninitialized values here. Even though they are then masked
+    // off by blendv, MSAN doesn't model that behavior.
+    __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16);
+
+    if (!inside) {
+      const __m128i border16 =
+          _mm_set1_epi8(static_cast<int8_t>(src[visible_width_16 - 1]));
+      samples01 = _mm_blendv_epi8(samples01, border16, blend_mask_16);
+    }
+    samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3);
+    samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3);
+
+    StoreUnaligned16(luma_ptr, samples0);
+    StoreUnaligned16(luma_ptr + 8, samples1);
+    __m128i inner_sum = _mm_add_epi16(samples0, samples1);
+
+    if (block_width == 32) {
+      // We can load uninitialized values here. Even though they are then masked
+      // off by blendv, MSAN doesn't model that behavior.
+      __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32);
+      if (!inside) {
+        const __m128i border32 =
+            _mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1]));
+        samples23 = _mm_blendv_epi8(samples23, border32, blend_mask_32);
+      }
+      samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3);
+      samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3);
+
+      StoreUnaligned16(luma_ptr + 16, samples2);
+      StoreUnaligned16(luma_ptr + 24, samples3);
+      inner_sum = _mm_add_epi16(samples2, inner_sum);
+      inner_sum = _mm_add_epi16(samples3, inner_sum);
+    }
+
+    inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+    inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+    sum = _mm_add_epi32(sum, inner_sum_lo);
+    sum = _mm_add_epi32(sum, inner_sum_hi);
+    luma_ptr += kCflLumaBufferStride;
+    src += stride;
+  } while (++y < visible_height);
+
+  if (!inside) {
+    for (int y = visible_height; y < block_height;
+         luma_ptr += kCflLumaBufferStride, ++y) {
+      sum = _mm_add_epi32(sum, inner_sum_lo);
+      StoreUnaligned16(luma_ptr, samples0);
+      sum = _mm_add_epi32(sum, inner_sum_hi);
+      StoreUnaligned16(luma_ptr + 8, samples1);
+      if (block_width == 32) {
+        StoreUnaligned16(luma_ptr + 16, samples2);
+        StoreUnaligned16(luma_ptr + 24, samples3);
+      }
+    }
+  }
+
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  __m128i averages =
+      RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2);
+  averages = _mm_shuffle_epi8(averages, dup16);
+  luma_ptr = luma[0];
+  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+    for (int x = 0; x < block_width; x += 8) {
+      __m128i samples = LoadUnaligned16(&luma_ptr[x]);
+      StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples, averages));
+    }
+  }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
+  static_assert(block_height_log2 <= 5, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+
+  const int block_height = 1 << block_height_log2;
+  const int block_width = 1 << block_width_log2;
+  const int horz_inside = block_width <= max_luma_width;
+  const int vert_inside = block_height <= max_luma_height;
+  if (horz_inside && vert_inside) {
+    CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, true>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  } else {
+    CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, false>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int /*max_luma_width*/, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const auto* src = static_cast<const uint8_t*>(source);
+  int16_t* luma_ptr = luma[0];
+  const __m128i zero = _mm_setzero_si128();
+  __m128i final_sum = zero;
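+  // With 4:2:0 subsampling each output row consumes two source luma rows, so
+  // only max_luma_height >> 1 output rows are backed by real samples.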
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int y = 0;
+  do {
+    // Note that loading 8 luma samples (double the chroma width) and widening
+    // them to 16 bits makes each row fill the vector.
+    const __m128i samples_row0 = _mm_cvtepu8_epi16(LoadLo8(src));
+    src += stride;
+    const __m128i samples_row1 = _mm_cvtepu8_epi16(LoadLo8(src));
+    src += stride;
+    const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);
+
+    const __m128i samples_row2 = _mm_cvtepu8_epi16(LoadLo8(src));
+    src += stride;
+    const __m128i samples_row3 = _mm_cvtepu8_epi16(LoadLo8(src));
+    src += stride;
+    const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
+    __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+    luma_ptr += kCflLumaBufferStride << 1;
+
+    const __m128i samples_row4 = _mm_cvtepu8_epi16(LoadLo8(src));
+    src += stride;
+    const __m128i samples_row5 = _mm_cvtepu8_epi16(LoadLo8(src));
+    src += stride;
+    const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);
+
+    const __m128i samples_row6 = _mm_cvtepu8_epi16(LoadLo8(src));
+    src += stride;
+    const __m128i samples_row7 = _mm_cvtepu8_epi16(LoadLo8(src));
+    src += stride;
+    const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+    luma_ptr += kCflLumaBufferStride << 1;
+
+    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+    y += 4;
+  } while (y < luma_height);
+  const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
+  const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
+  for (; y < block_height; ++y) {
+    StoreLo8(luma_ptr, final_fill);
+    luma_ptr += kCflLumaBufferStride;
+
+    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+  }
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+  __m128i averages = RightShiftWithRounding_U32(
+      final_sum, block_height_log2 + 2 /*log2 of width 4*/);
+
+  averages = _mm_shufflelo_epi16(averages, 0);
+  luma_ptr = luma[0];
+  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+    const __m128i samples = LoadLo8(luma_ptr);
+    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+  }
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int /*max_luma_width*/, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const auto* src = static_cast<const uint8_t*>(source);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i final_sum = zero;
+  int16_t* luma_ptr = luma[0];
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int y = 0;
+
+  do {
+    const __m128i samples_row00 = _mm_cvtepu8_epi16(LoadLo8(src));
+    const __m128i samples_row01 = (max_luma_width == 16)
+                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+                                      : LastRowSamples(samples_row00);
+    src += stride;
+    const __m128i samples_row10 = _mm_cvtepu8_epi16(LoadLo8(src));
+    const __m128i samples_row11 = (max_luma_width == 16)
+                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+                                      : LastRowSamples(samples_row10);
+    src += stride;
+    const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
+    const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
+    __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+    luma_ptr += kCflLumaBufferStride;
+
+    const __m128i samples_row20 = _mm_cvtepu8_epi16(LoadLo8(src));
+    const __m128i samples_row21 = (max_luma_width == 16)
+                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+                                      : LastRowSamples(samples_row20);
+    src += stride;
+    const __m128i samples_row30 = _mm_cvtepu8_epi16(LoadLo8(src));
+    const __m128i samples_row31 = (max_luma_width == 16)
+                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+                                      : LastRowSamples(samples_row30);
+    src += stride;
+    const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
+    const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    const __m128i samples_row40 = _mm_cvtepu8_epi16(LoadLo8(src));
+    const __m128i samples_row41 = (max_luma_width == 16)
+                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+                                      : LastRowSamples(samples_row40);
+    src += stride;
+    const __m128i samples_row50 = _mm_cvtepu8_epi16(LoadLo8(src));
+    const __m128i samples_row51 = (max_luma_width == 16)
+                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+                                      : LastRowSamples(samples_row50);
+    src += stride;
+    const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
+    const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    const __m128i samples_row60 = _mm_cvtepu8_epi16(LoadLo8(src));
+    const __m128i samples_row61 = (max_luma_width == 16)
+                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+                                      : LastRowSamples(samples_row60);
+    src += stride;
+    const __m128i samples_row70 = _mm_cvtepu8_epi16(LoadLo8(src));
+    const __m128i samples_row71 = (max_luma_width == 16)
+                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+                                      : LastRowSamples(samples_row70);
+    src += stride;
+    const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
+    const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+    y += 4;
+  } while (y < luma_height);
+  // Duplicate the final row downward to the end after max_luma_height.
+  const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+  const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
+  const __m128i final_fill_to_sum1 =
+      _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
+  const __m128i final_fill_to_sum =
+      _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
+  for (; y < block_height; ++y) {
+    StoreUnaligned16(luma_ptr, final_fill);
+    luma_ptr += kCflLumaBufferStride;
+
+    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+  }
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+  __m128i averages = RightShiftWithRounding_S32(
+      final_sum, block_height_log2 + 3 /*log2 of width 8*/);
+
+  averages = _mm_shufflelo_epi16(averages, 0);
+  averages = _mm_shuffle_epi32(averages, 0);
+  luma_ptr = luma[0];
+  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+    const __m128i samples = LoadUnaligned16(luma_ptr);
+    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  if (max_luma_width == 8) {
+    CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  } else {
+    CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int /*max_luma_width*/, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  const auto* src = static_cast<const uint8_t*>(source);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i final_sum = zero;
+  const int block_height = 1 << block_height_log2;
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  static_assert(max_luma_width <= 32, "");
+
+  int16_t* luma_ptr = luma[0];
+  __m128i final_row_result;
+  // Begin first y section, covering width up to 32.
+  int y = 0;
+  do {
+    const uint8_t* src_next = src + stride;
+    const __m128i samples_row0_lo = LoadUnaligned16(src);
+    const __m128i samples_row00 = _mm_cvtepu8_epi16(samples_row0_lo);
+    const __m128i samples_row01 = (max_luma_width >= 16)
+                                      ? _mm_unpackhi_epi8(samples_row0_lo, zero)
+                                      : LastRowSamples(samples_row00);
+    const __m128i samples_row0_hi = LoadUnaligned16(src + 16);
+    const __m128i samples_row02 = (max_luma_width >= 24)
+                                      ? _mm_cvtepu8_epi16(samples_row0_hi)
+                                      : LastRowSamples(samples_row01);
+    const __m128i samples_row03 = (max_luma_width == 32)
+                                      ? _mm_unpackhi_epi8(samples_row0_hi, zero)
+                                      : LastRowSamples(samples_row02);
+    const __m128i samples_row1_lo = LoadUnaligned16(src_next);
+    const __m128i samples_row10 = _mm_cvtepu8_epi16(samples_row1_lo);
+    const __m128i samples_row11 = (max_luma_width >= 16)
+                                      ? _mm_unpackhi_epi8(samples_row1_lo, zero)
+                                      : LastRowSamples(samples_row10);
+    const __m128i samples_row1_hi = LoadUnaligned16(src_next + 16);
+    const __m128i samples_row12 = (max_luma_width >= 24)
+                                      ? _mm_cvtepu8_epi16(samples_row1_hi)
+                                      : LastRowSamples(samples_row11);
+    const __m128i samples_row13 = (max_luma_width == 32)
+                                      ? _mm_unpackhi_epi8(samples_row1_hi, zero)
+                                      : LastRowSamples(samples_row12);
+    const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
+    const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
+    const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
+    const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
+    __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+    final_row_result =
+        StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+    sum = _mm_add_epi16(sum, final_row_result);
+    if (block_width_log2 == 5) {
+      const __m128i wide_fill = LastRowResult(final_row_result);
+      sum = _mm_add_epi16(sum, wide_fill);
+      sum = _mm_add_epi16(sum, wide_fill);
+    }
+    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+    src += stride << 1;
+    luma_ptr += kCflLumaBufferStride;
+  } while (++y < luma_height);
+
+  // Begin second y section.
+  if (y < block_height) {
+    const __m128i final_fill0 =
+        LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+    const __m128i final_fill1 =
+        LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+    __m128i wide_fill;
+
+    if (block_width_log2 == 5) {
+      // There are 16 16-bit fill values per row. After the values widen to
+      // four 32-bit lanes, each lane must count 4 of them, hence the shift
+      // by 2.
+      wide_fill =
+          _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+    }
+
+    const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
+    const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
+    const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
+    const __m128i final_fill_to_sum =
+        _mm_add_epi32(final_inner_sum0, final_inner_sum1);
+
+    do {
+      StoreUnaligned16(luma_ptr, final_fill0);
+      StoreUnaligned16(luma_ptr + 8, final_fill1);
+      if (block_width_log2 == 5) {
+        final_sum = _mm_add_epi32(final_sum, wide_fill);
+      }
+      luma_ptr += kCflLumaBufferStride;
+
+      final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+    } while (++y < block_height);
+  }  // End second y section.
+
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+  __m128i averages = RightShiftWithRounding_S32(
+      final_sum, block_width_log2 + block_height_log2);
+  averages = _mm_shufflelo_epi16(averages, 0);
+  averages = _mm_shuffle_epi32(averages, 0);
+
+  luma_ptr = luma[0];
+  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+    const __m128i samples0 = LoadUnaligned16(luma_ptr);
+    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
+    const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
+    final_row_result = _mm_sub_epi16(samples1, averages);
+    StoreUnaligned16(luma_ptr + 8, final_row_result);
+    if (block_width_log2 == 5) {
+      const __m128i wide_fill = LastRowResult(final_row_result);
+      StoreUnaligned16(luma_ptr + 16, wide_fill);
+      StoreUnaligned16(luma_ptr + 24, wide_fill);
+    }
+  }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  switch (max_luma_width) {
+    case 8:
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
+          luma, max_luma_width, max_luma_height, source, stride);
+      return;
+    case 16:
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
+          luma, max_luma_width, max_luma_height, source, stride);
+      return;
+    case 24:
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
+          luma, max_luma_width, max_luma_height, source, stride);
+      return;
+    default:
+      assert(max_luma_width == 32);
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
+          luma, max_luma_width, max_luma_height, source, stride);
+      return;
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 5>;
+#endif
+
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler444_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler444_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler444_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler444_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler444_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler444_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler444_SSE4_1<5, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor_SSE4_1<4, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor_SSE4_1<4, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor_SSE4_1<8, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor_SSE4_1<8, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor_SSE4_1<8, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor_SSE4_1<8, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor_SSE4_1<16, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor_SSE4_1<16, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor_SSE4_1<16, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor_SSE4_1<16, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor_SSE4_1<32, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor_SSE4_1<32, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor_SSE4_1<32, 32>;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_10bpp_SSE4_1
+
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+                                   __m128i alpha_sign, __m128i dc_q0) {
+  const __m128i ac_q3 = LoadUnaligned16(input);
+  const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+  __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+  scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+  return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) {
+  return _mm_max_epi16(_mm_min_epi16(x, max), min);
+}
+
+template <int width, int height>
+void CflIntraPredictor_10bpp_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  constexpr int kCflLumaBufferStrideLog2_16i = 5;
+  constexpr int kCflLumaBufferStrideLog2_128i =
+      kCflLumaBufferStrideLog2_16i - 3;
+  constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i;
+  auto* dst = static_cast<uint16_t*>(dest);
+  const __m128i alpha_sign = _mm_set1_epi16(alpha);
+  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+  auto* row = reinterpret_cast<const __m128i*>(luma);
+  const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
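+  // The DC prediction has already been written to |dest| before this function
+  // runs, so dst[0] holds the DC value that every output sample builds on.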
+  const __m128i dc_val = _mm_set1_epi16(dst[0]);
+  const __m128i min = _mm_setzero_si128();
+  const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+
+  stride >>= 1;
+
+  do {
+    __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+    res = ClipEpi16(res, min, max);
+    if (width == 4) {
+      StoreLo8(dst, res);
+    } else if (width == 8) {
+      StoreUnaligned16(dst, res);
+    } else if (width == 16) {
+      StoreUnaligned16(dst, res);
+      const __m128i res_1 =
+          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+      StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+    } else {
+      StoreUnaligned16(dst, res);
+      const __m128i res_1 =
+          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+      StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+      const __m128i res_2 =
+          CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+      StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max));
+      const __m128i res_3 =
+          CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+      StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max));
+    }
+
+    dst += stride;
+  } while ((row += kRowIncr) < row_end);
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
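+  // When the block extends past the visible luma rows (!is_inside), the last
+  // visible row stands in for the missing ones: it is folded into the running
+  // sum below and replicated into the output rows at the end.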
+  static_assert(block_height_log2 <= 4, "");
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  __m128i zero = _mm_setzero_si128();
+  __m128i sum = zero;
+  __m128i samples;
+  int y = visible_height;
+
+  do {
+    samples = LoadHi8(LoadLo8(src), src + src_stride);
+    src += src_stride << 1;
+    sum = _mm_add_epi16(sum, samples);
+    y -= 2;
+  } while (y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    samples = _mm_unpackhi_epi64(samples, samples);
+    do {
+      sum = _mm_add_epi16(sum, samples);
+      y += 2;
+    } while (y < block_height);
+  }
+
+  sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // The samples are left-shifted by 3 below for precision, so the averaging
+  // shift of ((log2 of width 4) + block_height_log2) is reduced by 3, leaving
+  // block_height_log2 - 1.
+  __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1);
+  averages = _mm_shufflelo_epi16(averages, 0);
+  src = static_cast<const uint16_t*>(source);
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    samples = LoadLo8(src);
+    samples = _mm_slli_epi16(samples, 3);
+    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+    src += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  static_cast<void>(max_luma_width);
+  static_cast<void>(max_luma_height);
+  static_assert(block_height_log2 <= 4, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const int block_height = 1 << block_height_log2;
+
+  if (block_height <= max_luma_height) {
+    CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+                                                         source, stride);
+  } else {
+    CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+                                                          source, stride);
+  }
+}
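+
+// The wrappers here and below fold the runtime "does the block fit inside the
+// visible luma?" test into the compile-time |is_inside| parameter, keeping the
+// hot loops free of per-row bounds branches.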
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
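+  // 0x01000100 selects byte indices {0, 1} for every 16-bit lane, so the
+  // _mm_shuffle_epi8 below broadcasts the low word (the computed average)
+  // across the whole register.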
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum = zero;
+  __m128i samples;
+  int y = visible_height;
+
+  do {
+    samples = LoadUnaligned16(src);
+    src += src_stride;
+    sum = _mm_add_epi16(sum, samples);
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    do {
+      sum = _mm_add_epi16(sum, samples);
+    } while (++y < block_height);
+  }
+
+  sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // The samples are left-shifted by 3 below for precision, which exactly
+  // cancels the (log2 of width 8) term of the averaging shift, leaving
+  // block_height_log2.
+  __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2);
+  averages = _mm_shuffle_epi8(averages, dup16);
+
+  src = static_cast<const uint16_t*>(source);
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    samples = LoadUnaligned16(src);
+    samples = _mm_slli_epi16(samples, 3);
+    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+    src += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  static_cast<void>(max_luma_width);
+  static_cast<void>(max_luma_height);
+  static_assert(block_height_log2 <= 5, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const int block_height = 1 << block_height_log2;
+  const int block_width = 8;
+
+  const int horz_inside = block_width <= max_luma_width;
+  const int vert_inside = block_height <= max_luma_height;
+  if (horz_inside && vert_inside) {
+    CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+                                                         source, stride);
+  } else {
+    CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+                                                          source, stride);
+  }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const int block_width = 1 << block_width_log2;
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const __m128i zero = _mm_setzero_si128();
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  __m128i sum = zero;
+  __m128i inner_sum_lo, inner_sum_hi;
+  __m128i samples[4];
+  int y = visible_height;
+
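+  // Each row is read as up to four vectors covering 32 pixels; lanes past
+  // |max_luma_width| are synthesized by LastRowResult(), which stands in for
+  // the missing lanes by repeating the last visible sample, so the replicated
+  // pixels still feed the average.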
+  do {
+    samples[0] = LoadUnaligned16(src);
+    samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8)
+                                        : LastRowResult(samples[0]);
+    __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+    if (block_width == 32) {
+      samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16)
+                                          : LastRowResult(samples[1]);
+      samples[3] = (max_luma_width == 32) ? LoadUnaligned16(src + 24)
+                                          : LastRowResult(samples[2]);
+
+      inner_sum = _mm_add_epi16(samples[2], inner_sum);
+      inner_sum = _mm_add_epi16(samples[3], inner_sum);
+    }
+    inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+    inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+    sum = _mm_add_epi32(sum, inner_sum_lo);
+    sum = _mm_add_epi32(sum, inner_sum_hi);
+    src += src_stride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+    if (block_width == 32) {
+      inner_sum = _mm_add_epi16(samples[2], inner_sum);
+      inner_sum = _mm_add_epi16(samples[3], inner_sum);
+    }
+    inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+    inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+    do {
+      sum = _mm_add_epi32(sum, inner_sum_lo);
+      sum = _mm_add_epi32(sum, inner_sum_hi);
+    } while (++y < block_height);
+  }
+
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // The samples are left-shifted by 3 below for precision, so 3 is subtracted
+  // from the averaging shift: block_width_log2 + block_height_log2 - 3.
+  __m128i averages =
+      RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3);
+  averages = _mm_shuffle_epi8(averages, dup16);
+
+  src = static_cast<const uint16_t*>(source);
+  __m128i samples_ext = zero;
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    int idx = 0;
+    for (int x = 0; x < block_width; x += 8) {
+      if (max_luma_width > x) {
+        samples[idx] = LoadUnaligned16(&src[x]);
+        samples[idx] = _mm_slli_epi16(samples[idx], 3);
+        samples_ext = samples[idx];
+      } else {
+        samples[idx] = LastRowResult(samples_ext);
+      }
+      StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+    }
+    src += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      int idx = 0;
+      for (int x = 0; x < block_width; x += 8) {
+        StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+      }
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+                "This function will only work for block_width 16 and 32.");
+  static_assert(block_height_log2 <= 5, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+
+  const int block_height = 1 << block_height_log2;
+  const int vert_inside = block_height <= max_luma_height;
+  if (vert_inside) {
+    CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  } else {
+    CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int /*max_luma_width*/, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  const __m128i zero = _mm_setzero_si128();
+  __m128i final_sum = zero;
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int y = luma_height;
+
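+  // 4:2:0 subsampling: each output value is the sum of a 2x2 luma
+  // neighborhood. StoreLumaResults4_420() doubles the sums as it stores them
+  // (four samples summed, times two), so the stored values land in the Q3
+  // domain the predictor expects.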
+  do {
+    const __m128i samples_row0 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i samples_row1 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);
+
+    const __m128i samples_row2 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i samples_row3 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
+    __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+    luma_ptr += kCflLumaBufferStride << 1;
+
+    const __m128i samples_row4 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i samples_row5 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);
+
+    const __m128i samples_row6 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i samples_row7 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+    luma_ptr += kCflLumaBufferStride << 1;
+
+    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+    y -= 4;
+  } while (y != 0);
+
+  const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
+  const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
+  for (y = luma_height; y < block_height; ++y) {
+    StoreLo8(luma_ptr, final_fill);
+    luma_ptr += kCflLumaBufferStride;
+    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+  }
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+  __m128i averages = RightShiftWithRounding_U32(
+      final_sum, block_height_log2 + 2 /*log2 of width 4*/);
+
+  averages = _mm_shufflelo_epi16(averages, 0);
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const __m128i samples = LoadLo8(luma_ptr);
+    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i final_sum = zero;
+  int16_t* luma_ptr = luma[0];
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int y = luma_height;
+
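+  // With max_luma_width == 8 the second vector of each row has no source
+  // pixels; LastRowSamples() fills it by repeating the rightmost visible
+  // samples so the horizontal 2x1 sums stay well-defined.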
+  do {
+    const __m128i samples_row00 = LoadUnaligned16(src);
+    const __m128i samples_row01 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row00);
+    src += src_stride;
+    const __m128i samples_row10 = LoadUnaligned16(src);
+    const __m128i samples_row11 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row10);
+    src += src_stride;
+    const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
+    const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
+    __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+    luma_ptr += kCflLumaBufferStride;
+
+    const __m128i samples_row20 = LoadUnaligned16(src);
+    const __m128i samples_row21 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row20);
+    src += src_stride;
+    const __m128i samples_row30 = LoadUnaligned16(src);
+    const __m128i samples_row31 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row30);
+    src += src_stride;
+    const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
+    const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    const __m128i samples_row40 = LoadUnaligned16(src);
+    const __m128i samples_row41 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row40);
+    src += src_stride;
+    const __m128i samples_row50 = LoadUnaligned16(src);
+    const __m128i samples_row51 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row50);
+    src += src_stride;
+    const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
+    const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    const __m128i samples_row60 = LoadUnaligned16(src);
+    const __m128i samples_row61 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row60);
+    src += src_stride;
+    const __m128i samples_row70 = LoadUnaligned16(src);
+    const __m128i samples_row71 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row70);
+    src += src_stride;
+    const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
+    const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+    y -= 4;
+  } while (y != 0);
+
+  // Duplicate the final visible row downward to fill the remaining block
+  // rows.
+  const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+  const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
+  const __m128i final_fill_to_sum1 =
+      _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
+  const __m128i final_fill_to_sum =
+      _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
+  for (y = luma_height; y < block_height; ++y) {
+    StoreUnaligned16(luma_ptr, final_fill);
+    luma_ptr += kCflLumaBufferStride;
+    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+  }
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+  __m128i averages = RightShiftWithRounding_S32(
+      final_sum, block_height_log2 + 3 /*log2 of width 8*/);
+
+  averages = _mm_shufflelo_epi16(averages, 0);
+  averages = _mm_shuffle_epi32(averages, 0);
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const __m128i samples = LoadUnaligned16(luma_ptr);
+    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  if (max_luma_width == 8) {
+    CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height,
+                                                          source, stride);
+  } else {
+    CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
+        luma, max_luma_height, source, stride);
+  }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i final_sum = zero;
+  const int block_height = 1 << block_height_log2;
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int16_t* luma_ptr = luma[0];
+  __m128i final_row_result;
+  // Begin first y section, covering width up to 32.
+  int y = luma_height;
+
+  do {
+    const uint16_t* src_next = src + src_stride;
+    const __m128i samples_row00 = LoadUnaligned16(src);
+    const __m128i samples_row01 = (max_luma_width >= 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row00);
+    const __m128i samples_row02 = (max_luma_width >= 24)
+                                      ? LoadUnaligned16(src + 16)
+                                      : LastRowSamples(samples_row01);
+    const __m128i samples_row03 = (max_luma_width == 32)
+                                      ? LoadUnaligned16(src + 24)
+                                      : LastRowSamples(samples_row02);
+    const __m128i samples_row10 = LoadUnaligned16(src_next);
+    const __m128i samples_row11 = (max_luma_width >= 16)
+                                      ? LoadUnaligned16(src_next + 8)
+                                      : LastRowSamples(samples_row10);
+    const __m128i samples_row12 = (max_luma_width >= 24)
+                                      ? LoadUnaligned16(src_next + 16)
+                                      : LastRowSamples(samples_row11);
+    const __m128i samples_row13 = (max_luma_width == 32)
+                                      ? LoadUnaligned16(src_next + 24)
+                                      : LastRowSamples(samples_row12);
+    const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
+    const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
+    const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
+    const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
+    __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+    final_row_result =
+        StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+    sum = _mm_add_epi16(sum, final_row_result);
+    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+
+    // Because max_luma_width is at most 32, any values beyond x=16 will
+    // necessarily be duplicated.
+    if (block_width_log2 == 5) {
+      const __m128i wide_fill = LastRowResult(final_row_result);
+      // There are 16 16-bit fill values per row, but only 4 lanes survive the
+      // widening to 32-bit, so the left shift by 2 makes each lane stand in
+      // for 4 of them.
+      final_sum = _mm_add_epi32(
+          final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2));
+    }
+    src += src_stride << 1;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  // Begin second y section.
+  y = luma_height;
+  if (y < block_height) {
+    const __m128i final_fill0 =
+        LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+    const __m128i final_fill1 =
+        LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+    __m128i wide_fill;
+    if (block_width_log2 == 5) {
+      // There are 16 16-bit fill values per row, but only 4 lanes survive the
+      // widening to 32-bit, so the left shift by 2 makes each lane stand in
+      // for 4 of them.
+      wide_fill =
+          _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+    }
+    const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
+    const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
+    const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
+    const __m128i final_fill_to_sum =
+        _mm_add_epi32(final_inner_sum0, final_inner_sum1);
+
+    do {
+      StoreUnaligned16(luma_ptr, final_fill0);
+      StoreUnaligned16(luma_ptr + 8, final_fill1);
+      if (block_width_log2 == 5) {
+        final_sum = _mm_add_epi32(final_sum, wide_fill);
+      }
+      luma_ptr += kCflLumaBufferStride;
+      final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+    } while (++y < block_height);
+  }  // End second y section.
+
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+  __m128i averages = RightShiftWithRounding_S32(
+      final_sum, block_width_log2 + block_height_log2);
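+  // The stored values are already Q3, so dividing by the block area
+  // (1 << (block_width_log2 + block_height_log2)) gives the Q3 average
+  // directly.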
+  averages = _mm_shufflelo_epi16(averages, 0);
+  averages = _mm_shuffle_epi32(averages, 0);
+
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const __m128i samples0 = LoadUnaligned16(luma_ptr);
+    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
+    const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
+    final_row_result = _mm_sub_epi16(samples1, averages);
+    StoreUnaligned16(luma_ptr + 8, final_row_result);
+
+    if (block_width_log2 == 5) {
+      const __m128i wide_fill = LastRowResult(final_row_result);
+      StoreUnaligned16(luma_ptr + 16, wide_fill);
+      StoreUnaligned16(luma_ptr + 24, wide_fill);
+    }
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+  switch (max_luma_width) {
+    case 8:
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
+          luma, max_luma_height, source, stride);
+      return;
+    case 16:
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
+          luma, max_luma_height, source, stride);
+      return;
+    case 24:
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
+          luma, max_luma_height, source, stride);
+      return;
+    default:
+      assert(max_luma_width == 32);
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
+          luma, max_luma_height, source, stride);
+      return;
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x4] =
+      CflIntraPredictor_10bpp_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x8] =
+      CflIntraPredictor_10bpp_SSE4_1<4, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor_10bpp_SSE4_1<4, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x4] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x8] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor_10bpp_SSE4_1<32, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor_10bpp_SSE4_1<32, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor_10bpp_SSE4_1<32, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 5>;
+#endif
+
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<5, 5>;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredCflInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_cfl_sse4.h b/src/dsp/x86/intrapred_cfl_sse4.h
new file mode 100644 (file)
index 0000000..5d1a425
--- /dev/null
@@ -0,0 +1,376 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers; see the
+// defines below for specifics. This function is not thread-safe.
+void IntraPredCflInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+//------------------------------------------------------------------------------
+// 10bpp
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
diff --git a/src/dsp/x86/intrapred_directional_sse4.cc b/src/dsp/x86/intrapred_directional_sse4.cc
new file mode 100644 (file)
index 0000000..2e64d21
--- /dev/null
@@ -0,0 +1,1487 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
+// val = top[top_base_x] << 5, with top_base_x equal to y + 1 for output row
+// y. Each row is therefore a plain copy, and |top| is offset by 1.
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
+                                    const uint8_t* const top, const int width,
+                                    const int height) {
+  ptrdiff_t offset = 1;
+  if (height == 4) {
+    memcpy(dst, top + offset, width);
+    dst += stride;
+    memcpy(dst, top + offset + 1, width);
+    dst += stride;
+    memcpy(dst, top + offset + 2, width);
+    dst += stride;
+    memcpy(dst, top + offset + 3, width);
+    return;
+  }
+  int y = 0;
+  do {
+    memcpy(dst, top + offset, width);
+    dst += stride;
+    memcpy(dst, top + offset + 1, width);
+    dst += stride;
+    memcpy(dst, top + offset + 2, width);
+    dst += stride;
+    memcpy(dst, top + offset + 3, width);
+    dst += stride;
+    memcpy(dst, top + offset + 4, width);
+    dst += stride;
+    memcpy(dst, top + offset + 5, width);
+    dst += stride;
+    memcpy(dst, top + offset + 6, width);
+    dst += stride;
+    memcpy(dst, top + offset + 7, width);
+    dst += stride;
+
+    offset += 8;
+    y += 8;
+  } while (y < height);
+}
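+
+// A worked instance of the special case above: with |xstep| == 64 the source
+// window advances exactly one pixel per row, so output row y is simply
+// top[1 + y], ..., top[y + width], and the whole zone is a run of memcpys.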
+
+inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
+                                 const uint8_t* const top, const int height,
+                                 const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+  const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+  const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+  const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
+                                    : _mm_set_epi64x(0, 0x0403030202010100);
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  // All rows from |min_corner_only_y| down are filled with memset. |max_base_x|
+  // is always greater than |height|, so clamping |xstep_units| to at least 1
+  // is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows below this y-value still mask each pixel against |max_base_x|; rows
+  // at or beyond it are pure corner fills.
+  int y = 0;
+  int top_x = xstep;
+
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    const int top_base_x = top_x >> scale_bits;
+
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+    const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+
+    // Load 8 values because we will select the sampled values based on
+    // |upsampled|.
+    const __m128i values = LoadLo8(top + top_base_x);
+    const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
+    const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+    __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
+    prod = RightShiftWithRounding_U16(prod, rounding_bits);
+    // Replace pixels from invalid range with top-right corner.
+    prod = _mm_blendv_epi8(prod, final_top_val, past_max);
+    Store4(dst, _mm_packus_epi16(prod, prod));
+  }
+
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    memset(dst, top[max_base_x], /* width */ 4);
+    dst += stride;
+  }
+}
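+
+// A worked instance of the filtering above, assuming xstep == 48 and no
+// upsampling: the first row has top_x = 48, so top_base_x = 48 >> 6 = 0 and
+// shift = (48 & 0x3F) >> 1 = 24, and each pixel is computed as
+//   (top[x] * (32 - 24) + top[x + 1] * 24 + 16) >> 5.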
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
+                                   const uint8_t* const top_row,
+                                   const int width, const int height,
+                                   const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const __m128i sampler =
+      upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+                : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  const int scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // All rows from |min_corner_only_y| down are filled with memset. |max_base_x|
+  // is always greater than |height|, so clamping |xstep_units| to at least 1
+  // is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  const int max_no_corner_y = std::min(
+      LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
+      height);
+  // No need to check for exceeding |max_base_x| in the first loop.
+  int y = 0;
+  int top_x = xstep;
+  for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> scale_bits;
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    int x = 0;
+    do {
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+  for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> scale_bits;
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    int x = 0;
+    const int min_corner_only_x =
+        std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+    for (; x < min_corner_only_x;
+         x += 8, top_base_x += base_step8,
+         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+      // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+      // reading out of bounds. If all indices are past max and we don't need to
+      // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+      // reset for the next |y|.
+      top_base_x &= ~_mm_cvtsi128_si32(past_max);
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    }
+    // Corner-only section of the row.
+    memset(dest + x, top_row[max_base_x], width - x);
+  }
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    memset(dest, top_row[max_base_x], width);
+    dest += stride;
+  }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
+                                    const uint8_t* const top_row,
+                                    const int width, const int height,
+                                    const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  if (xstep == 64) {
+    DirectionalZone1_Step64(dest, stride, top_row, width, height);
+    return;
+  }
+  if (width == 4) {
+    DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
+    return;
+  }
+  if (width >= 32) {
+    DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+                           upsampled);
+    return;
+  }
+  const __m128i sampler =
+      upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+                : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  const int scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // No need to check for exceeding |max_base_x| in the loops.
+  if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
+    int top_x = xstep;
+    int y = 0;
+    do {
+      int top_base_x = top_x >> scale_bits;
+      // Permit negative values of |top_x|.
+      const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+      const __m128i shift = _mm_set1_epi8(shift_val);
+      const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+      const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+      int x = 0;
+      do {
+        const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+        __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+        vals = _mm_maddubs_epi16(vals, shifts);
+        vals = RightShiftWithRounding_U16(vals, rounding_bits);
+        StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+        top_base_x += base_step8;
+        x += 8;
+      } while (x < width);
+      dest += stride;
+      top_x += xstep;
+    } while (++y < height);
+    return;
+  }
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+  int top_x = xstep;
+  int y = 0;
+  do {
+    int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        memset(dest, top_row[max_base_x], width);
+        dest += stride;
+      }
+      return;
+    }
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    int x = 0;
+    for (; x < width - 8;
+         x += 8, top_base_x += base_step8,
+         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+      // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+      // reading out of bounds. If all indices are past max and we don't need to
+      // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+      // reset for the next |y|.
+      top_base_x &= ~_mm_cvtsi128_si32(past_max);
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    }
+    const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadUnaligned16(top_row + top_base_x);
+    } else {
+      const __m128i top_vals = LoadLo8(top_row + top_base_x);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    vals = RightShiftWithRounding_U16(vals, rounding_bits);
+    vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+    StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    dest += stride;
+    top_x += xstep;
+  } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
+                                           const void* const top_row,
+                                           const int width, const int height,
+                                           const int xstep,
+                                           const bool upsampled_top) {
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
+                          upsampled_top);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
+                                 const uint8_t* const left_column,
+                                 const int base_left_y, const int ystep) {
+  // For use in the non-upsampled case.
+  const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+
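+  // Each iteration below fills one output column from |left_column|; the 4x4
+  // transpose afterwards converts the column-major results to the row-major
+  // layout of |dest|.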
+  __m128i result_block[4];
+  for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
+    const int left_base_y = left_y >> scale_bits;
+    const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadLo8(left_column + left_base_y);
+    } else {
+      const __m128i top_vals = LoadLo8(left_column + left_base_y);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    vals = RightShiftWithRounding_U16(vals, rounding_bits);
+    result_block[x] = _mm_packus_epi16(vals, vals);
+  }
+  const __m128i result = Transpose4x4_U8(result_block);
+  // This is result_row0.
+  Store4(dest, result);
+  dest += stride;
+  const int result_row1 = _mm_extract_epi32(result, 1);
+  memcpy(dest, &result_row1, sizeof(result_row1));
+  dest += stride;
+  const int result_row2 = _mm_extract_epi32(result, 2);
+  memcpy(dest, &result_row2, sizeof(result_row2));
+  dest += stride;
+  const int result_row3 = _mm_extract_epi32(result, 3);
+  memcpy(dest, &result_row3, sizeof(result_row3));
+}
+
+template <bool upsampled, int height>
+inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
+                                 const uint8_t* const left_column,
+                                 const int base_left_y, const int ystep) {
+  // For use in the non-upsampled case.
+  const __m128i sampler =
+      _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+
+  __m128i result_block[8];
+  for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
+    const int left_base_y = left_y >> scale_bits;
+    const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadUnaligned16(left_column + left_base_y);
+    } else {
+      const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
+  }
+  Transpose8x8_U16(result_block, result_block);
+  for (int y = 0; y < height; ++y) {
+    StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
+    dest += stride;
+  }
+}
+
+// 7.11.2.4 (9) angle > 180
+void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
+                                           const void* const left_column,
+                                           const int width, const int height,
+                                           const int ystep,
+                                           const bool upsampled) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_shift = static_cast<int>(upsampled);
+  if (width == 4 || height == 4) {
+    const ptrdiff_t stride4 = stride << 2;
+    if (upsampled) {
+      int left_y = ystep;
+      int x = 0;
+      do {
+        uint8_t* dst_x = dst + x;
+        int y = 0;
+        do {
+          DirectionalZone3_4x4<true>(
+              dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+          dst_x += stride4;
+          y += 4;
+        } while (y < height);
+        left_y += ystep << 2;
+        x += 4;
+      } while (x < width);
+    } else {
+      int left_y = ystep;
+      int x = 0;
+      do {
+        uint8_t* dst_x = dst + x;
+        int y = 0;
+        do {
+          DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
+                                      ystep);
+          dst_x += stride4;
+          y += 4;
+        } while (y < height);
+        left_y += ystep << 2;
+        x += 4;
+      } while (x < width);
+    }
+    return;
+  }
+
+  const ptrdiff_t stride8 = stride << 3;
+  if (upsampled) {
+    int left_y = ystep;
+    int x = 0;
+    do {
+      uint8_t* dst_x = dst + x;
+      int y = 0;
+      do {
+        DirectionalZone3_8xH<true, 8>(
+            dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+        dst_x += stride8;
+        y += 8;
+      } while (y < height);
+      left_y += ystep << 3;
+      x += 8;
+    } while (x < width);
+  } else {
+    int left_y = ystep;
+    int x = 0;
+    do {
+      uint8_t* dst_x = dst + x;
+      int y = 0;
+      do {
+        DirectionalZone3_8xH<false, 8>(
+            dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+        dst_x += stride8;
+        y += 8;
+      } while (y < height);
+      left_y += ystep << 3;
+      x += 8;
+    } while (x < width);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Directional Zone 2 Functions
+// 7.11.2.4 (8)
+
+// DirectionalBlend* selectively overwrites the values written by
+// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
+// row.
+template <int y_selector>
+inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
+                                     const __m128i& dest_index_vect,
+                                     const __m128i& vals,
+                                     const __m128i& zone_bounds) {
+  const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
+  const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+  const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
+  const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+  Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
+                                     const __m128i& dest_index_vect,
+                                     const __m128i& vals,
+                                     const __m128i& zone_bounds,
+                                     const __m128i& bounds_selector) {
+  const __m128i max_dest_x_vect =
+      _mm_shuffle_epi8(zone_bounds, bounds_selector);
+  const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+  const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
+  const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+  StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+constexpr int kDirectionalWeightBits = 5;
+// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
+// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
+// shift) and shift. Shift is guaranteed to be between 0 and 32.
+inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
+                                                 const __m128i& shifts,
+                                                 const __m128i& sampler) {
+  const __m128i src_vals = LoadUnaligned16(source);
+  __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
+  vals = _mm_maddubs_epi16(vals, shifts);
+  return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
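+// For example, a relative index of -9 becomes shuffle index -9 + 15 = 6,
+// while the source pointer is lowered by 15 bytes, so the same pixel is
+// still addressed.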
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
+    uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
+    __m128i left_y) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shifts = _mm_set1_epi8(32);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
+  const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
+  // |left_column| and the sampler indices are both offset by 15 so the
+  // shuffle indices are always nonnegative.
+  const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
+  for (int y = 0; y < 4; dst += stride, ++y) {
+    __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+    offset_y = _mm_packs_epi16(offset_y, offset_y);
+
+    const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+    __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+    // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
+    // can work as shuffle indices. Some values may be out of bounds, but their
+    // pred results will be masked over by top prediction.
+    sampler = _mm_add_epi8(sampler, positive_offset);
+
+    __m128i shifts = _mm_srli_epi16(
+        _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+    shifts = _mm_packus_epi16(shifts, shifts);
+    const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        left_column + (y << upsample_shift), shifts, sampler);
+    Store4(dst, _mm_packus_epi16(vals, vals));
+  }
+}
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
+    uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
+    __m128i left_y) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shifts = _mm_set1_epi8(32);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  const __m128i index_increment = _mm_set1_epi8(1);
+  const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
+  for (int y = 0; y < 8; dst += stride, ++y) {
+    __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+    offset_y = _mm_packs_epi16(offset_y, offset_y);
+    const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+
+    // Offset the relative index because ystep is negative in Zone 2 and shuffle
+    // indices must be nonnegative.
+    __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+    sampler = _mm_add_epi8(sampler, denegation);
+
+    __m128i shifts = _mm_srli_epi16(
+        _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+    shifts = _mm_packus_epi16(shifts, shifts);
+    const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+
+    // The specification adds (y << 6) to left_y, which is subject to
+    // upsampling, but this puts sampler indices out of the 0-15 range. It is
+    // equivalent to offset the source address by (y << upsample_shift) instead.
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
+        sampler);
+    StoreLo8(dst, _mm_packus_epi16(vals, vals));
+  }
+}
+
+// |zone_bounds| holds, for each row, the relative x index (as epi16) at which
+// base >= -(1 << upsampled_top). When there are 4 values, they can be
+// duplicated with a non-register shuffle mask.
+// |shifts| is one pair of weights that applies throughout a given row.
+template <bool upsampled_top>
+inline void DirectionalZone1Blend_4x4(
+    uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+    __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+    const __m128i& dest_index_x, int top_x, const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int scale_bits_x = 6 - upsample_shift;
+  top_x -= xstep;
+
+  int top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
+  DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
+  DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
+  DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
+  DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
+}
+
+template <bool upsampled_top, int height>
+inline void DirectionalZone1Blend_8xH(
+    uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+    __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+    const __m128i& dest_index_x, int top_x, const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int scale_bits_x = 6 - upsample_shift;
+
+  __m128i y_selector = _mm_set1_epi32(0x01000100);
+  const __m128i index_increment = _mm_set1_epi32(0x02020202);
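+  // |y_selector| broadcasts 16-bit lane y of |shifts| and |zone_bounds|: byte
+  // pattern (0,1) selects lane 0, and adding 2 to every byte per row advances
+  // the selection to lanes 1 through 7.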
+  for (int y = 0; y < height; ++y,
+           y_selector = _mm_add_epi8(y_selector, index_increment),
+           dest += stride) {
+    top_x -= xstep;
+    const int top_base_x = top_x >> scale_bits_x;
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
+    DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
+  }
+}
+
+template <bool shuffle_left_column, bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_8xH(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint8_t* LIBGAV1_RESTRICT const top_row,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep, const int x, const int left_offset,
+    const __m128i& xstep_for_shift, const __m128i& xstep_bounds_base,
+    const __m128i& left_y) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Loop increments for moving by block (8x8). This function also handles
+  // blocks of height 4; those are computed in one pass, so these variables
+  // go unused in that case.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+  const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+
+  // Cover 8x4 case.
+  const int min_height = (height == 4) ? 4 : 8;
+
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  uint8_t* dst_x = dst + x;
+
+  // Round down to the nearest multiple of 8 (or 4, if height is 4).
+  const int max_top_only_y =
+      std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1);
+  DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+                       max_top_only_y, -xstep, upsampled_top);
+  DirectionalZone1_4xH(dst_x + 4, stride,
+                       top_row + ((x + 4) << upsample_top_shift),
+                       max_top_only_y, -xstep, upsampled_top);
+  if (max_top_only_y == height) return;
+
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  const __m128i dest_index_x =
+      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+  const __m128i sampler_top =
+      upsampled_top
+          ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  int y = max_top_only_y;
+  dst_x += stride * y;
+  const int xstep_y = xstep * y;
+  const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+  // All rows from |min_left_only_y| down for this set of columns only need
+  // |left_column| to compute.
+  const int min_left_only_y =
+      Align(std::min(((x + 8) << 6) / xstep, height), 8);
+
+  __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+  __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+  int top_x = -xstep_y;
+
+  const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+  for (; y < min_left_only_y;
+       y += 8, dst_x += stride8,
+       xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+       xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+       top_x -= xstep8) {
+    // Pick up from the last y-value; when |shuffle_left_column| is false, use
+    // the roughly 10% slower but always-safe method for left prediction.
+    if (shuffle_left_column) {
+      DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y);
+    } else {
+      DirectionalZone3_8xH<upsampled_left, 8>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+          -ystep);
+    }
+
+    __m128i shifts = _mm_srli_epi16(
+        _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                      shift_mask),
+        1);
+    shifts = _mm_packus_epi16(shifts, shifts);
+    __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+    __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+    DirectionalZone1Blend_8xH<upsampled_top, 8>(
+        dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+        xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+  }
+  // Loop over y for left_only rows.
+  for (; y < height; y += 8, dst_x += stride8) {
+    DirectionalZone3_8xH<upsampled_left, 8>(
+        dst_x, stride, left_column + ((left_offset + y) << upsample_left_shift),
+        base_left_y, -ystep);
+  }
+}
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for this function is to know how many blocks can be processed
+// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
+// blocks that take from |left_ptr|. Additionally, a fast index-shuffle
+// approach is used for pred values from |left_column| in sections that permit
+// it.
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
+                                    const uint8_t* const top_row,
+                                    const uint8_t* const left_column,
+                                    const int width, const int height,
+                                    const int xstep, const int ystep) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute. This assumes minimum |xstep| is 3.
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
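+  // e.g. |height| = 32 and |xstep| = 64 give |min_top_only_x| = 32: columns
+  // at x >= 32 satisfy x >= (y+1)*xstep/64 for every row in the block, so
+  // they never sample |left_column|.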
+
+  // Accumulate xstep across 8 rows.
+  const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+  const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+  const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+  const __m128i scaled_one = _mm_set1_epi16(-64);
+  __m128i xstep_bounds_base =
+      (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+                    : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+  const int ystep8 = ystep << 3;
+  const int left_base_increment8 = ystep8 >> 6;
+  const int ystep_remainder8 = ystep8 & 0x3F;
+  const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. Following values need the full ystep as a relative offset.
+  const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+  const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+  const __m128i dest_index_x =
+      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+  __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+  left_y = _mm_add_epi16(ystep_init, left_y);
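+  // e.g. with |ystep| = 72: |left_base_increment| = 1 and |ystep_remainder| =
+  // 8, so lane x of |left_y| starts at -(8 + 72*x), a 1/64th-pel offset into
+  // |left_column|.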
+
+  // Analysis finds that, for most angles (ystep < 132), all segments that use
+  // both top_row and left_column can compute from left_column using byte
+  // shuffles from a single vector. For steeper angles, the shuffle is also
+  // fully reliable when x >= 32.
+  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
+  const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
+  int x = 0;
+
+  for (int left_offset = -left_base_increment; x < min_shuffle_x;
+       x += 8,
+           xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+           // Watch left_y because it can still get big.
+       left_y = _mm_add_epi16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    DirectionalZone2_8xH<false, upsampled_left, upsampled_top>(
+        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+        xstep_for_shift, xstep_bounds_base, left_y);
+  }
+  for (int left_offset = -left_base_increment; x < min_top_only_x;
+       x += 8,
+           xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+           // Watch left_y because it can still get big.
+       left_y = _mm_add_epi16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    DirectionalZone2_8xH<true, upsampled_left, upsampled_top>(
+        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+        xstep_for_shift, xstep_bounds_base, left_y);
+  }
+  for (; x < width; x += 4) {
+    DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+                         height, -xstep, upsampled_top);
+  }
+}
+
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
+                                      const uint8_t* const top_row,
+                                      const uint8_t* const left_column,
+                                      const int width, const int height,
+                                      const int xstep, const int ystep) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const ptrdiff_t stride4 = stride << 2;
+  const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
+  const __m128i sampler_top =
+      upsampled_top
+          ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  // All columns from |min_top_only_x| to the right will only need |top_row| to
+  // compute.
+  assert(xstep >= 3);
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+  const int xstep4 = xstep << 2;
+  const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
+  const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+  const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
+  __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+  const __m128i scaled_one = _mm_set1_epi16(-64);
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+  __m128i xstep_bounds_base =
+      (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+                    : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+  const int ystep4 = ystep << 2;
+  const int left_base_increment4 = ystep4 >> 6;
+  // This is guaranteed to be less than 64, but accumulation may bring it past
+  // 64 for higher x values.
+  const int ystep_remainder4 = ystep4 & 0x3F;
+  const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
+  const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which will go into the left_column offset.
+  // Following values need the full ystep as a relative offset.
+  const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+  const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+  __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+  left_y = _mm_add_epi16(ystep_init, left_y);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+
+  int x = 0;
+  // Loop over x for columns with a mixture of sources.
+  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
+           xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
+           left_y = _mm_add_epi16(left_y, increment_left4),
+           left_offset -= left_base_increment4) {
+    uint8_t* dst_x = dst + x;
+
+    // Round down to the nearest multiple of 4.
+    const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
+    DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+                         max_top_only_y, -xstep, upsampled_top);
+    int y = max_top_only_y;
+    dst_x += stride * y;
+    const int xstep_y = xstep * y;
+    const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute; the loop below effectively rounds it up to
+    // the nearest multiple of 4.
+    const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
+
+    __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+    __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+    int top_x = -xstep_y;
+
+    // Loop over y for mixed rows.
+    for (; y < min_left_only_y;
+         y += 4, dst_x += stride4,
+         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
+         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
+         top_x -= xstep4) {
+      DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) * (1 << upsample_left_shift)),
+          left_y);
+
+      __m128i shifts = _mm_srli_epi16(
+          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                        shift_mask),
+          1);
+      shifts = _mm_packus_epi16(shifts, shifts);
+      const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+      const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+      DirectionalZone1Blend_4x4<upsampled_top>(
+          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+    }
+    // Loop over y for left-only rows, if any.
+    for (; y < height; y += 4, dst_x += stride4) {
+      DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y);
+    }
+  }
+  // Loop over top-only columns, if any.
+  for (; x < width; x += 4) {
+    DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+                         height, -xstep, upsampled_top);
+  }
+}
+
+void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
+                                           const void* const top_row,
+                                           const void* const left_column,
+                                           const int width, const int height,
+                                           const int xstep, const int ystep,
+                                           const bool upsampled_top,
+                                           const bool upsampled_left) {
+  // Increasing the negative buffer for this function allows more rows to be
+  // processed at a time without branching in an inner loop to check the base.
+  uint8_t top_buffer[288];
+  uint8_t left_buffer[288];
+  memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+  memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+#if LIBGAV1_MSAN
+  memset(top_buffer, 0x33, 128);
+  memset(left_buffer, 0x44, 128);
+#endif
+  const uint8_t* top_ptr = top_buffer + 144;
+  const uint8_t* left_ptr = left_buffer + 144;
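+  // With this layout, |top_ptr| and |left_ptr| alias the original arrays:
+  // buffer offset 144 corresponds to offset 0 of the source, backed by 16
+  // copied bytes and 128 scratch bytes of negative range behind it.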
+  if (width == 4 || height == 4) {
+    if (upsampled_left) {
+      if (upsampled_top) {
+        DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+                                              width, height, xstep, ystep);
+      } else {
+        DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+                                               width, height, xstep, ystep);
+      }
+    } else {
+      if (upsampled_top) {
+        DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+                                               width, height, xstep, ystep);
+      } else {
+        DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+                                                width, height, xstep, ystep);
+      }
+    }
+    return;
+  }
+  if (upsampled_left) {
+    if (upsampled_top) {
+      DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+                                          width, height, xstep, ystep);
+    } else {
+      DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+                                           width, height, xstep, ystep);
+    }
+  } else {
+    if (upsampled_top) {
+      DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+                                           width, height, xstep, ystep);
+    } else {
+      DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+                                            width, height, xstep, ystep);
+    }
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the weighted average
+// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
+// val = top[top_base_x] << 5. Since |top_x| starts at |xstep|, row y reads
+// from top[y + 1] onward. Hence |top| is offset by 1.
+inline void DirectionalZone1_Step64(uint16_t* dst, ptrdiff_t stride,
+                                    const uint16_t* const top, const int width,
+                                    const int height) {
+  ptrdiff_t offset = 1;
+  if (height == 4) {
+    memcpy(dst, top + offset, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+    return;
+  }
+  int y = height;
+  do {
+    memcpy(dst, top + offset, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 4, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 5, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 6, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 7, width * sizeof(dst[0]));
+    dst += stride;
+
+    offset += 8;
+    y -= 8;
+  } while (y != 0);
+}
+
+// Produce a weighted average whose weights sum to 32.
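+// Unlike the 8bpp path, 10-bit samples do not fit _mm_maddubs_epi16, so the
+// average is formed with a 16-bit multiply and a horizontal add of each
+// (32 - shift, shift) product pair.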
+inline __m128i CombineTopVals4(const __m128i& top_vals, const __m128i& sampler,
+                               const __m128i& shifts,
+                               const __m128i& top_indices,
+                               const __m128i& final_top_val,
+                               const __m128i& border_index) {
+  const __m128i sampled_values = _mm_shuffle_epi8(top_vals, sampler);
+  __m128i prod = _mm_mullo_epi16(sampled_values, shifts);
+  prod = _mm_hadd_epi16(prod, prod);
+  const __m128i result = RightShiftWithRounding_U16(prod, 5 /*log2(32)*/);
+
+  const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+  // Replace pixels from invalid range with top-right corner.
+  return _mm_blendv_epi8(result, final_top_val, past_max);
+}
+
+// When width is 4, only one load operation is needed per iteration. We also
+// avoid the extra loop precomputations, whose overhead would not be amortized
+// at this width.
+inline void DirectionalZone1_4xH(uint16_t* dst, ptrdiff_t stride,
+                                 const uint16_t* const top, const int height,
+                                 const int xstep, const bool upsampled,
+                                 const __m128i& sampler) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+  const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" because
+  // only cmpgt is available.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  int y = 0;
+  int top_x = xstep;
+  const __m128i max_shift = _mm_set1_epi16(32);
+
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    const int top_base_x = top_x >> index_scale_bits;
+
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    // Load 8 values because we will select the sampled values based on
+    // |upsampled|.
+    const __m128i values = LoadUnaligned16(top + top_base_x);
+    const __m128i pred =
+        CombineTopVals4(values, sampler, shifts, top_index_vect, final_top_val,
+                        max_base_x_vect);
+    StoreLo8(dst, pred);
+  }
+
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    Memset(dst, top[max_base_x], /* width */ 4);
+    dst += stride;
+  }
+}
+
+// General purpose combine function.
+// |check_border| means the final source value has to be duplicated into the
+// result. This simplifies the loop structures that use precomputed boundaries
+// to identify sections where it is safe to compute without checking for the
+// right border.
+template <bool check_border>
+inline __m128i CombineTopVals(
+    const __m128i& top_vals_0, const __m128i& top_vals_1,
+    const __m128i& sampler, const __m128i& shifts,
+    const __m128i& top_indices = _mm_setzero_si128(),
+    const __m128i& final_top_val = _mm_setzero_si128(),
+    const __m128i& border_index = _mm_setzero_si128()) {
+  constexpr int scale_int_bits = 5;
+  const __m128i sampled_values_0 = _mm_shuffle_epi8(top_vals_0, sampler);
+  const __m128i sampled_values_1 = _mm_shuffle_epi8(top_vals_1, sampler);
+  const __m128i prod_0 = _mm_mullo_epi16(sampled_values_0, shifts);
+  const __m128i prod_1 = _mm_mullo_epi16(sampled_values_1, shifts);
+  const __m128i combined = _mm_hadd_epi16(prod_0, prod_1);
+  const __m128i result = RightShiftWithRounding_U16(combined, scale_int_bits);
+  if (check_border) {
+    const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+    // Replace pixels from invalid range with top-right corner.
+    return _mm_blendv_epi8(result, final_top_val, past_max);
+  }
+  return result;
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint16_t* dest, ptrdiff_t stride,
+                                   const uint16_t* const top_row,
+                                   const int width, const int height,
+                                   const int xstep, const bool upsampled,
+                                   const __m128i& sampler) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi16(32);
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  const int max_no_corner_y = std::min(
+      LeftShift((max_base_x - (base_step * width)), index_scale_bits) / xstep,
+      height);
+  // No need to check for exceeding |max_base_x| in the first loop.
+  int y = 0;
+  int top_x = xstep;
+  for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    int x = 0;
+    do {
+      const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+      const __m128i top_vals_1 =
+          LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+
+      const __m128i pred =
+          CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+
+      StoreUnaligned16(dest + x, pred);
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to |top_base_x|, it is used to mask values
+  // that pass the end of the |top| buffer. Starting from 1 to simulate "cmpge"
+  // which is not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+  for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    int x = 0;
+    const int min_corner_only_x =
+        std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+    for (; x < min_corner_only_x;
+         x += 8, top_base_x += base_step8,
+         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+      const __m128i top_vals_1 =
+          LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+      const __m128i pred =
+          CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+                               top_index_vect, final_top_val, max_base_x_vect);
+      StoreUnaligned16(dest + x, pred);
+    }
+    // Corner-only section of the row.
+    Memset(dest + x, top_row[max_base_x], width - x);
+  }
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    Memset(dest, top_row[max_base_x], width);
+    dest += stride;
+  }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalIntraPredictorZone1_SSE4_1(
+    void* dest_ptr, ptrdiff_t stride, const void* const top_ptr,
+    const int width, const int height, const int xstep, const bool upsampled) {
+  const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+  auto* dest = static_cast<uint16_t*>(dest_ptr);
+  stride /= sizeof(uint16_t);
+  const int upsample_shift = static_cast<int>(upsampled);
+  if (xstep == 64) {
+    DirectionalZone1_Step64(dest, stride, top_row, width, height);
+    return;
+  }
+  // Each base pixel is paired with its following pixel, for hadd purposes.
+  const __m128i adjacency_shuffler = _mm_set_epi16(
+      0x0908, 0x0706, 0x0706, 0x0504, 0x0504, 0x0302, 0x0302, 0x0100);
+  // This is equivalent to not shuffling at all.
+  const __m128i identity_shuffler = _mm_set_epi16(
+      0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+  // This represents a trade-off between code size and speed. When |upsampled|
+  // is true, no shuffle is necessary, but skipping it would require two
+  // copies of the main function body to avoid in-loop branching, so the
+  // identity shuffle is used instead.
+  const __m128i sampler = upsampled ? identity_shuffler : adjacency_shuffler;
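+  // e.g. |adjacency_shuffler| rearranges 16-bit lanes 0..7 into the pairs
+  // (0,1), (1,2), (2,3), (3,4), so the hadd inside CombineTopVals sums
+  // adjacent weighted samples into four results per input vector.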
+  if (width == 4) {
+    DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled,
+                         sampler);
+    return;
+  }
+  if (width >= 32) {
+    DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+                           upsampled, sampler);
+    return;
+  }
+  const int index_scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi16(32);
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // No need to check for exceeding |max_base_x| in the loops.
+  if (((xstep * height) >> index_scale_bits) + base_step * width < max_base_x) {
+    int top_x = xstep;
+    int y = height;
+    do {
+      int top_base_x = top_x >> index_scale_bits;
+      // Permit negative values of |top_x|.
+      const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+      const __m128i shift = _mm_set1_epi16(shift_val);
+      const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+      const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+      int x = 0;
+      do {
+        const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+        const __m128i top_vals_1 =
+            LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+        const __m128i pred =
+            CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+        StoreUnaligned16(dest + x, pred);
+        top_base_x += base_step8;
+        x += 8;
+      } while (x < width);
+      dest += stride;
+      top_x += xstep;
+    } while (--y != 0);
+    return;
+  }
+
+  // General case. Blocks with width less than 32 do not benefit from x-wise
+  // loop splitting, but do benefit from using Memset on appropriate rows.
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  int top_x = xstep;
+  int y = 0;
+  for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    for (int x = 0; x < width; x += 8, top_base_x += base_step8,
+             top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+      const __m128i top_vals_1 =
+          LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+      const __m128i pred =
+          CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+                               top_index_vect, final_top_val, max_base_x_vect);
+      StoreUnaligned16(dest + x, pred);
+    }
+  }
+
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    Memset(dest, top_row[max_base_x], width);
+    dest += stride;
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(DirectionalIntraPredictorZone1)
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_directional_sse4.h b/src/dsp/x86/intrapred_directional_sse4.h
new file mode 100644 (file)
index 0000000..b352450
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*; see the defines below
+// for specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
diff --git a/src/dsp/x86/intrapred_filter_sse4.cc b/src/dsp/x86/intrapred_filter_sse4.cc
new file mode 100644 (file)
index 0000000..a43a5cf
--- /dev/null
@@ -0,0 +1,433 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_SSE4_1
+// Section 7.11.2.3. Recursive intra prediction process
+// This filter applies recursively to 4x2 sub-blocks within the transform block,
+// meaning that the predicted pixels in each sub-block are used as inputs to
+// sub-blocks below and to the right, if present.
+//
+// Each output value in the sub-block is predicted by a different filter applied
+// to the same array of top-left, top, and left values. If fn refers to the
+// output of the nth filter, given this block:
+// TL T0 T1 T2 T3
+// L0 f0 f1 f2 f3
+// L1 f4 f5 f6 f7
+// The filter input order is p0, p1, p2, p3, p4, p5, p6:
+// p0 p1 p2 p3 p4
+// p5 f0 f1 f2 f3
+// p6 f4 f5 f6 f7
+// Filters usually apply to 8 values for convenience, so in this case we fix
+// the 8th filter tap to 0 and disregard the value of the 8th input.
+
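+// As a scalar sketch of what Filter4x2_SSE4_1 below computes, each output is
+//   fn = Clip255(RightShiftWithRounding(sum_i(taps[n][i] * p[i]), 4))
+// where taps[n] is the 7-tap (8th tap zero) kernel for output n drawn from
+// kFilterIntraTaps[pred], and the clip comes from the unsigned pack.
+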
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+constexpr int kDuplicateFirstHalf = 0x44;
+
+// Apply all filter taps to the given 7 packed 8-bit pixel values, keeping the
+// 8th tap at zero so the unused 8th input does not affect the sum.
+// |pixels| contains p0-p7 in order as shown above.
+// |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on.
+inline void Filter4x2_SSE4_1(uint8_t* LIBGAV1_RESTRICT dst,
+                             const ptrdiff_t stride, const __m128i& pixels,
+                             const __m128i& taps_0_1, const __m128i& taps_2_3,
+                             const __m128i& taps_4_5, const __m128i& taps_6_7) {
+  const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
+  const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
+  // |output_half| contains 8 partial sums for f0-f7.
+  __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+  __m128i output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row0 =
+      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+                       /* unused half */ output);
+  Store4(dst, output_row0);
+  const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
+  const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
+  output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+  output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row1 =
+      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+                       /* arbitrary pack arg */ output);
+  Store4(dst + stride, output_row1);
+}
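+
+// Reduction detail for Filter4x2_SSE4_1 above: _mm_maddubs_epi16 multiplies
+// unsigned pixel bytes by signed tap bytes and sums adjacent pairs, giving
+// four 16-bit partial sums per filter. The first _mm_hadd_epi16 narrows those
+// to two partials per filter and the second to one, so |output| holds
+// f0..f3 (or f4..f7) prior to rounding, shifting, and packing.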
+
+// 4xH transform sizes are given special treatment because LoadLo8 goes out
+// of bounds and every block involves the left column. The top-left pixel, p0,
+// is stored in the top buffer for the first 4x2, but comes from the left buffer
+// for successive blocks. This implementation takes advantage of the fact
+// that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer,
+// using shifts to arrange things to fit reusable shuffle vectors.
+inline void Filter4xH(uint8_t* LIBGAV1_RESTRICT dest, ptrdiff_t stride,
+                      const uint8_t* LIBGAV1_RESTRICT const top_ptr,
+                      const uint8_t* LIBGAV1_RESTRICT const left_ptr,
+                      FilterIntraPredictor pred, const int height) {
+  // Two filter kernels per vector.
+  const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+  const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+  const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+  const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+  __m128i top = Load4(top_ptr - 1);
+  __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
+  __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
+  left = _mm_slli_si128(left, 5);
+
+  // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+  // left[2], left[3], left[4], left[5], left[6], left[7]
+  // Let rn represent a pixel usable as pn for the 4x2 after this one. We get:
+  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+  // p0 p1 p2 p3 p4 p5 p6 r5 r6 ...
+  //                   r0
+  pixels = _mm_or_si128(left, pixels);
+
+  // Two sets of the same input pixels to apply two filters at once.
+  pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  dest += stride;  // Move to y = 1.
+  pixels = Load4(dest);
+
+  // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+  // left[0], left[1], ...
+  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+  // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+  //                         r0
+  pixels = _mm_or_si128(left, pixels);
+
+  // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+  // byte is an unused value, which shall be multiplied by 0 when we apply the
+  // filter.
+  constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+  // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+  const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
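+  // To read these _mm_shuffle_epi8 masks: byte i of the mask, taken from the
+  // least significant byte upward, gives the source index for destination
+  // byte i. 0x0F08070302010006 thus decodes to the source order
+  // 6, 0, 1, 2, 3, 7, 8, 15 listed above.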
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  dest += stride;  // Move to y = 2.
+  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  dest += stride;  // Move to y = 3.
+
+  // Compute the middle 8 rows before using common code for the final 4 rows, in
+  // order to fit the assumption that |left| has the next TL at position 8.
+  if (height == 16) {
+    // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+    left = _mm_slli_si128(left, 1);
+    pixels = Load4(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+    // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+    //                                  r0
+    pixels = _mm_or_si128(left, pixels);
+
+    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15. The
+    // last byte is an unused value, as above. The top-left was shifted to
+    // position nine to keep two empty spaces after the top pixels.
+    constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+    // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+    // the end.
+    const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 4.
+
+    // First 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // Clear all but final pixel in the first 8 of left column.
+    __m128i keep_top_left = _mm_srli_si128(left, 13);
+    dest += stride;  // Move to y = 5.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+    // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+    //                                  r0
+    pixels = _mm_or_si128(left, pixels);
+    left = LoadLo8(left_ptr + 8);
+
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 6.
+
+    // Second 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // Position TL value so we can use pixel_order1.
+    keep_top_left = _mm_slli_si128(keep_top_left, 6);
+    dest += stride;  // Move to y = 7.
+    pixels = Load4(dest);
+    left = _mm_slli_si128(left, 7);
+    left = _mm_or_si128(left, keep_top_left);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                         r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 8.
+
+    // Third 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 9.
+
+    // Prepare final inputs.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                         r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 10.
+
+    // Fourth 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 11.
+  }
+
+  // In both the 8 and 16 case at this point, we can assume that |left| has the
+  // next TL at position 8.
+  if (height > 4) {
+    // Erase prior left pixels by shifting TL to position 0.
+    left = _mm_srli_si128(left, 8);
+    left = _mm_slli_si128(left, 6);
+    pixels = Load4(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                         r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 12 or 4.
+
+    // First of final two 4x2 blocks.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 13 or 5.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                         r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 14 or 6.
+
+    // Last of final two 4x2 blocks.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+  }
+}
+
+void FilterIntraPredictor_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                                 ptrdiff_t stride,
+                                 const void* LIBGAV1_RESTRICT const top_row,
+                                 const void* LIBGAV1_RESTRICT const left_column,
+                                 FilterIntraPredictor pred, const int width,
+                                 const int height) {
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (width == 4) {
+    Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
+    return;
+  }
+
+  // There is one set of 7 taps for each of the 4x2 output pixels.
+  const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+  const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+  const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+  const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+
+  // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+  // the end is an unused value, which shall be multiplied by 0 when we apply
+  // the filter.
+  constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+  // Takes the "left section" and puts it right after p0-p4.
+  const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+  // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+  // byte is unused as above.
+  constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+  // Shuffles the "top left" from the left section to the front. Used when
+  // grabbing data from left_column and not top_row.
+  const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+  // This first pass takes care of the cases where the top left pixel comes from
+  // top_row.
+  __m128i pixels = LoadLo8(top_ptr - 1);
+  __m128i left = _mm_slli_si128(Load4(left_column), 8);
+  pixels = _mm_or_si128(pixels, left);
+
+  // Two sets of the same pixels to multiply with two sets of taps.
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
+  left = _mm_srli_si128(left, 1);
+
+  // Load the second row of the first 4x2 block, which supplies the top pixels
+  // for the next 4x2 block.
+  pixels = Load4(dst + stride);
+
+  // Because of the above shift, this OR 'invades' the last of the first 8
+  // bytes of |pixels|. This is acceptable because the 8th filter tap is always
+  // a padded 0.
+  pixels = _mm_or_si128(pixels, left);
+  pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+  const ptrdiff_t stride2 = stride << 1;
+  const ptrdiff_t stride4 = stride << 2;
+  Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  dst += 4;
+  for (int x = 3; x < width - 4; x += 4) {
+    pixels = Load4(top_ptr + x);
+    pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
+    pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+    pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    pixels = Load4(dst + stride - 1);
+    pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+    pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+    pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                     taps_4_5, taps_6_7);
+    dst += 4;
+  }
+
+  // Now we handle heights that reference previous blocks rather than top_row.
+  for (int y = 4; y < height; y += 4) {
+    // Leftmost 4x4 block for this height.
+    dst -= width;
+    dst += stride4;
+
+    // The top left is not available by offset in these leftmost blocks; it
+    // comes from |left_ptr| instead.
+    pixels = Load4(dst - stride);
+    left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
+    left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+    left = _mm_srli_si128(left, 2);
+    pixels = Load4(dst + stride);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                     taps_4_5, taps_6_7);
+
+    dst += 4;
+
+    // Remaining 4x4 blocks for this height.
+    for (int x = 4; x < width; x += 4) {
+      pixels = Load4(dst - stride - 1);
+      pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+      pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+      Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                       taps_6_7);
+      pixels = Load4(dst + stride - 1);
+      pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+      pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+      Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                       taps_4_5, taps_6_7);
+      dst += 4;
+    }
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+// These guards check that this version of the function was not superseded by
+// a higher optimization level, such as AVX. The corresponding #define also
+// prevents the C version from being added to the table.
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
+  dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
+#endif
+}
+
+}  // namespace
+
+void IntraPredFilterInit_SSE4_1() { Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredFilterInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_filter_sse4.h b/src/dsp/x86/intrapred_filter_sse4.h
new file mode 100644 (file)
index 0000000..ce28f93
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor; see the defines below for specifics.
+// These functions are not thread-safe.
+void IntraPredFilterInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc
new file mode 100644 (file)
index 0000000..b53ee8c
--- /dev/null
@@ -0,0 +1,2687 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_smooth.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to give the compiler
+// visibility of the values. This helps reduce loads and simplifies creation
+// of the inverse weights.
+constexpr uint8_t kSmoothWeights[] = {
+#include "src/dsp/smooth_weights.inc"
+};
+
+template <int y_mask>
+inline void WriteSmoothHorizontalSum4(void* LIBGAV1_RESTRICT const dest,
+                                      const __m128i& left,
+                                      const __m128i& weights,
+                                      const __m128i& scaled_top_right,
+                                      const __m128i& round) {
+  const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+  const __m128i weighted_left_y = _mm_mullo_epi16(left_y, weights);
+  const __m128i pred_sum = _mm_add_epi32(scaled_top_right, weighted_left_y);
+  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
+  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+  Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
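+
+// Scalar reference for WriteSmoothHorizontalSum4 above (the SMOOTH_H sum of
+// AV1 spec section 7.11.2.6):
+//   pred[y][x] = (weights[x] * left[y] +
+//                 (256 - weights[x]) * top_right + 128) >> 8
+// |scaled_top_right| carries the second product and |round| carries the 128.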
+
+// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
+// |pixels| is a segment of the top row or the whole top row, and |weights| is
+// repeated.
+inline __m128i SmoothDirectionalSum8(const __m128i& pixels,
+                                     const __m128i& weights,
+                                     const __m128i& scaled_corner) {
+  const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
+  return _mm_add_epi16(scaled_corner, weighted_px);
+}
+
+inline void WriteSmoothDirectionalSum8(uint8_t* LIBGAV1_RESTRICT dest,
+                                       const __m128i& pixels,
+                                       const __m128i& weights,
+                                       const __m128i& scaled_corner,
+                                       const __m128i& round) {
+  const __m128i pred_sum =
+      SmoothDirectionalSum8(pixels, weights, scaled_corner);
+  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+  const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, round), 8);
+  StoreLo8(dest, _mm_packus_epi16(pred, pred));
+}
+
+// For Horizontal, pixels1 and pixels2 are the same repeated value. For
+// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
+// scaled_corner2 are the same.
+inline void WriteSmoothDirectionalSum16(
+    uint8_t* LIBGAV1_RESTRICT dest, const __m128i& pixels1,
+    const __m128i& pixels2, const __m128i& weights1, const __m128i& weights2,
+    const __m128i& scaled_corner1, const __m128i& scaled_corner2,
+    const __m128i& round) {
+  const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
+  const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
+  const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
+  const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
+  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
+  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
+  StoreUnaligned16(dest, _mm_packus_epi16(pred1, pred2));
+}
+
+template <int y_mask>
+inline void WriteSmoothPredSum4(uint8_t* LIBGAV1_RESTRICT const dest,
+                                const __m128i& top, const __m128i& left,
+                                const __m128i& weights_x,
+                                const __m128i& weights_y,
+                                const __m128i& scaled_bottom_left,
+                                const __m128i& scaled_top_right,
+                                const __m128i& round) {
+  const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+  const __m128i weighted_left_y = _mm_mullo_epi32(left_y, weights_x);
+  const __m128i weight_y = _mm_shuffle_epi32(weights_y, y_mask);
+  const __m128i weighted_top = _mm_mullo_epi32(weight_y, top);
+  const __m128i scaled_bottom_left_y =
+      _mm_shuffle_epi32(scaled_bottom_left, y_mask);
+  const __m128i col_pred = _mm_add_epi32(scaled_bottom_left_y, weighted_left_y);
+  const __m128i row_pred = _mm_add_epi32(scaled_top_right, weighted_top);
+  const __m128i pred_sum = _mm_add_epi32(row_pred, col_pred);
+
+  // Equivalent to RightShiftWithRounding(pred[x][y], 9).
+  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 9);
+
+  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+  Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
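+
+// Scalar reference for WriteSmoothPredSum4 above (the full SMOOTH sum):
+//   pred[y][x] = (weights_y[y] * top[x] + (256 - weights_y[y]) * bottom_left +
+//                 weights_x[x] * left[y] + (256 - weights_x[x]) * top_right +
+//                 256) >> 9
+// |round| supplies the 256; the shuffles select the per-row y values.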
+
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+inline void LoadSmoothPixels4(const uint8_t* LIBGAV1_RESTRICT above,
+                              const uint8_t* LIBGAV1_RESTRICT left,
+                              const int height, __m128i* pixels) {
+  if (height == 4) {
+    pixels[1] = Load4(left);
+  } else if (height == 8) {
+    pixels[1] = LoadLo8(left);
+  } else {
+    pixels[1] = LoadUnaligned16(left);
+  }
+
+  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+  const __m128i top = _mm_cvtepu8_epi16(Load4(above));
+  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+  pixels[2] = _mm_set1_epi16(above[3]);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
+inline void LoadSmoothWeights4(const uint8_t* LIBGAV1_RESTRICT weight_array,
+                               const int height, __m128i* weight_h,
+                               __m128i* weight_w) {
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i x_weights = Load4(weight_array);
+  weight_h[0] = _mm_cvtepu8_epi16(x_weights);
+  weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+
+  if (height == 8) {
+    const __m128i y_weights = LoadLo8(weight_array + 4);
+    weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+    weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+  } else if (height == 16) {
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i y_weights = LoadUnaligned16(weight_array + 12);
+    weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+    weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(y_weights, zero);
+    weight_h[3] = _mm_sub_epi16(scale, weight_h[2]);
+  }
+}
+
+inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
+                               const __m128i* weight_x,
+                               uint8_t* LIBGAV1_RESTRICT dst,
+                               const ptrdiff_t stride,
+                               const bool use_second_half) {
+  const __m128i round = _mm_set1_epi32(256);
+  const __m128i mask_increment = _mm_set1_epi16(0x0202);
+  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixel[1], zero)
+                                       : _mm_unpacklo_epi8(pixel[1], zero);
+  __m128i y_select = _mm_set1_epi16(0x0100);
+
+  for (int i = 0; i < 8; ++i) {
+    const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+    const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+    const __m128i interleaved_weights =
+        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+    __m128i vertical_pred = _mm_madd_epi16(pixel[0], interleaved_weights);
+
+    __m128i horizontal_vect = _mm_shuffle_epi8(left, y_select);
+    horizontal_vect = _mm_unpacklo_epi16(horizontal_vect, pixel[2]);
+    __m128i sum = _mm_madd_epi16(horizontal_vect, weight_x[0]);
+
+    sum = _mm_add_epi32(vertical_pred, sum);
+    sum = _mm_add_epi32(sum, round);
+    sum = _mm_srai_epi32(sum, 9);
+
+    sum = _mm_shuffle_epi8(sum, cvtepi32_epi8);
+    Store4(dst, sum);
+    dst += stride;
+
+    y_select = _mm_add_epi16(y_select, mask_increment);
+  }
+}
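+
+// A note on the madd trick in WriteSmoothPred4x8 above: |pixel[0]| interleaves
+// top and bottom_left as 16-bit lanes and |interleaved_weights| interleaves
+// w_y and 256 - w_y, so one _mm_madd_epi16 yields
+//   w_y * top[x] + (256 - w_y) * bottom_left
+// per 32-bit lane; the horizontal madd pairs left[y] with right_pred the same
+// way.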
+
+// The interleaving approach has some overhead that causes it to underperform in
+// the 4x4 case.
+void Smooth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
+  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+  const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  const __m128i scale = _mm_set1_epi32(256);
+  // Broadcast top_row[3] (the fourth 32-bit lane) across the vector.
+  const __m128i top_right = _mm_shuffle_epi32(top, 0xFF);
+  // Broadcast left_column[3] (the fourth 32-bit lane) across the vector.
+  const __m128i bottom_left = _mm_shuffle_epi32(left, 0xFF);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  auto* dst = static_cast<uint8_t*>(dest);
+  // AV1 spec 7.11.2.6 (3) describes the sum:
+  //   smoothPred[y][x:x+3] = weighted_top + scaled_right + weighted_left[y] +
+  //                          scaled_bottom[y]
+  // This would be a loop, except that the shuffles require immediate
+  // (compile-time constant) arguments.
+  WriteSmoothPredSum4<0>(dst, top, left, weights, weights, scaled_bottom_left,
+                         scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothPredSum4<0x55>(dst, top, left, weights, weights,
+                            scaled_bottom_left, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothPredSum4<0xAA>(dst, top, left, weights, weights,
+                            scaled_bottom_left, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothPredSum4<0xFF>(dst, top, left, weights, weights,
+                            scaled_bottom_left, scaled_top_right, scale);
+}
+
+void Smooth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i weights_x[1];
+  __m128i weights_y[2];
+  LoadSmoothWeights4(kSmoothWeights, 8, weights_y, weights_x);
+  __m128i pixels[3];
+  LoadSmoothPixels4(top_ptr, left_ptr, 8, pixels);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+}
+
+void Smooth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT top_row,
+                       const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i weights_x[1];
+  __m128i weights_y[4];
+  LoadSmoothWeights4(kSmoothWeights, 16, weights_y, weights_x);
+  __m128i pixels[3];
+  LoadSmoothPixels4(top_ptr, left_ptr, 16, pixels);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+  dst += stride << 3;
+  WriteSmoothPred4x8(pixels, &weights_y[2], weights_x, dst, stride, true);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
+inline void LoadSmoothPixels8(const uint8_t* LIBGAV1_RESTRICT above,
+                              const uint8_t* LIBGAV1_RESTRICT left,
+                              const int height, __m128i* pixels) {
+  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+  __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above));
+  pixels[0] = _mm_unpacklo_epi16(top_row, bottom_left);
+  pixels[1] = _mm_unpackhi_epi16(top_row, bottom_left);
+
+  pixels[3] = _mm_set1_epi16(above[7]);
+
+  if (height == 4) {
+    pixels[2] = Load4(left);
+  } else if (height == 8) {
+    pixels[2] = LoadLo8(left);
+  } else if (height == 16) {
+    pixels[2] = LoadUnaligned16(left);
+  } else {
+    pixels[2] = LoadUnaligned16(left);
+    pixels[4] = pixels[0];
+    pixels[5] = pixels[1];
+    pixels[6] = LoadUnaligned16(left + 16);
+    pixels[7] = pixels[3];
+  }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+inline void LoadSmoothWeights8(const uint8_t* LIBGAV1_RESTRICT weight_array,
+                               const int height, __m128i* weight_w,
+                               __m128i* weight_h) {
+  const int offset = (height < 8) ? 0 : 4;
+  __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]);
+  weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+  const __m128i inverter = _mm_set1_epi16(256);
+  weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+
+  if (height == 4) {
+    loaded_weights = _mm_srli_si128(loaded_weights, 4);
+    __m128i weights_x = _mm_cvtepu8_epi16(loaded_weights);
+    __m128i inverted_weights_x = _mm_sub_epi16(inverter, weights_x);
+    weight_w[0] = _mm_unpacklo_epi16(weights_x, inverted_weights_x);
+    weight_w[1] = _mm_unpackhi_epi16(weights_x, inverted_weights_x);
+  } else {
+    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+  }
+
+  if (height == 16) {
+    const __m128i zero = _mm_setzero_si128();
+    loaded_weights = LoadUnaligned16(weight_array + 12);
+    weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+    weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(loaded_weights, zero);
+    weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+  } else if (height == 32) {
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i weight_lo = LoadUnaligned16(weight_array + 28);
+    weight_h[0] = _mm_cvtepu8_epi16(weight_lo);
+    weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+    weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+    const __m128i weight_hi = LoadUnaligned16(weight_array + 44);
+    weight_h[4] = _mm_cvtepu8_epi16(weight_hi);
+    weight_h[5] = _mm_sub_epi16(inverter, weight_h[4]);
+    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+    weight_h[7] = _mm_sub_epi16(inverter, weight_h[6]);
+  }
+}
+
+inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
+                               const __m128i* weights_y, const int height,
+                               uint8_t* LIBGAV1_RESTRICT dst,
+                               const ptrdiff_t stride,
+                               const bool use_second_half) {
+  const __m128i round = _mm_set1_epi32(256);
+  const __m128i mask_increment = _mm_set1_epi16(0x0202);
+  const __m128i cvt_epu16_epi8 = _mm_set_epi32(0, 0, 0x0E0C0A08, 0x06040200);
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixels[2], zero)
+                                       : _mm_unpacklo_epi8(pixels[2], zero);
+  __m128i y_select = _mm_set1_epi16(0x0100);
+
+  for (int i = 0; i < height; ++i) {
+    const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+    const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+    const __m128i interleaved_weights =
+        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+    const __m128i vertical_sum0 =
+        _mm_madd_epi16(pixels[0], interleaved_weights);
+    const __m128i vertical_sum1 =
+        _mm_madd_epi16(pixels[1], interleaved_weights);
+
+    __m128i horizontal_pixels = _mm_shuffle_epi8(left, y_select);
+    horizontal_pixels = _mm_unpacklo_epi16(horizontal_pixels, pixels[3]);
+    const __m128i horizontal_sum0 =
+        _mm_madd_epi16(horizontal_pixels, weights_x[0]);
+    const __m128i horizontal_sum1 =
+        _mm_madd_epi16(horizontal_pixels, weights_x[1]);
+
+    __m128i sum0 = _mm_add_epi32(vertical_sum0, horizontal_sum0);
+    sum0 = _mm_add_epi32(sum0, round);
+    sum0 = _mm_srai_epi32(sum0, 9);
+
+    __m128i sum1 = _mm_add_epi32(vertical_sum1, horizontal_sum1);
+    sum1 = _mm_add_epi32(sum1, round);
+    sum1 = _mm_srai_epi32(sum1, 9);
+
+    sum0 = _mm_packus_epi16(sum0, sum1);
+    sum0 = _mm_shuffle_epi8(sum0, cvt_epu16_epi8);
+    StoreLo8(dst, sum0);
+    dst += stride;
+
+    y_select = _mm_add_epi16(y_select, mask_increment);
+  }
+}
+
+void Smooth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i pixels[4];
+  LoadSmoothPixels8(top_ptr, left_ptr, 4, pixels);
+
+  __m128i weights_x[2], weights_y[2];
+  LoadSmoothWeights8(kSmoothWeights, 4, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false);
+}
+
+void Smooth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+
+  __m128i pixels[4];
+  LoadSmoothPixels8(top_ptr, left_ptr, 8, pixels);
+
+  __m128i weights_x[2], weights_y[2];
+  LoadSmoothWeights8(kSmoothWeights, 8, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+}
+
+void Smooth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT top_row,
+                       const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i pixels[4];
+  LoadSmoothPixels8(top_ptr, left_ptr, 16, pixels);
+
+  __m128i weights_x[2], weights_y[4];
+  LoadSmoothWeights8(kSmoothWeights, 16, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+  dst += stride << 3;
+  WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+}
+
+void Smooth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT top_row,
+                       const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i pixels[8];
+  LoadSmoothPixels8(top_ptr, left_ptr, 32, pixels);
+
+  __m128i weights_x[2], weights_y[8];
+  LoadSmoothWeights8(kSmoothWeights, 32, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+  dst += stride << 3;
+  WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+  dst += stride << 3;
+  WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[4], 8, dst, stride,
+                     false);
+  dst += stride << 3;
+  WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[6], 8, dst, stride,
+                     true);
+}
+
+template <int width, int height>
+void SmoothWxH(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+               const void* LIBGAV1_RESTRICT const top_row,
+               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const uint8_t* const sm_weights_h = kSmoothWeights + height - 4;
+  const uint8_t* const sm_weights_w = kSmoothWeights + width - 4;
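+  // The weight tables for sizes 4, 8, 16, and 32 are stored consecutively in
+  // kSmoothWeights, so the table for size n starts at offset n - 4
+  // (4 -> 0, 8 -> 4, 16 -> 12, 32 -> 28), hence the pointer math above.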
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value = _mm_set1_epi16(256);
+  const __m128i bottom_left = _mm_cvtsi32_si128(left_ptr[height - 1]);
+  const __m128i top_right = _mm_set1_epi16(top_ptr[width - 1]);
+  const __m128i round = _mm_set1_epi32(256);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < height; ++y) {
+    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+    const __m128i left_y = _mm_cvtsi32_si128(left_ptr[y]);
+    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+    __m128i scaled_bottom_left =
+        _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+    const __m128i weight_left_y =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+    scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
+    scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
+    for (int x = 0; x < width; x += 8) {
+      const __m128i top_x = LoadLo8(top_ptr + x);
+      const __m128i weights_x = LoadLo8(sm_weights_w + x);
+      const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
+      const __m128i top_weights_x_lo = _mm_cvtepu8_epi16(top_weights_x);
+      const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
+
+      // Here opposite weights and pixels are multiplied, where the order of
+      // interleaving is indicated in the names.
+      __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
+      __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
+
+      // |scaled_bottom_left| is always scaled by the same weight each row, so
+      // we only derive |scaled_top_right| values here.
+      const __m128i inverted_weights_x =
+          _mm_sub_epi16(scale_value, _mm_cvtepu8_epi16(weights_x));
+      const __m128i scaled_top_right =
+          _mm_mullo_epi16(inverted_weights_x, top_right);
+      const __m128i scaled_top_right_lo = _mm_cvtepu16_epi32(scaled_top_right);
+      const __m128i scaled_top_right_hi =
+          _mm_unpackhi_epi16(scaled_top_right, zero);
+      pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
+      pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
+      pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
+      pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
+
+      // The round value for RightShiftWithRounding was added with
+      // |scaled_bottom_left|.
+      pred_lo = _mm_srli_epi32(pred_lo, 9);
+      pred_hi = _mm_srli_epi32(pred_hi, 9);
+      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
+    }
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal4x4_SSE4_1(void* LIBGAV1_RESTRICT dest,
+                                const ptrdiff_t stride,
+                                const void* LIBGAV1_RESTRICT top_row,
+                                const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi32(top_ptr[3]);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left = _mm_cvtepu8_epi32(Load4(left_ptr));
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi32(256);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi32(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi32(top[3]);
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi32(256);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi32(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+
+  left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi32(top[3]);
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi32(256);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi32(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+
+  left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+
+  left = _mm_cvtepu8_epi32(Load4(left_ptr + 8));
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+
+  left = _mm_cvtepu8_epi32(Load4(left_ptr + 12));
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x4_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[7]);
+  const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi16(128);
+  __m128i y_select = _mm_set1_epi32(0x01000100);
+  __m128i left_y = _mm_shuffle_epi8(left, y_select);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x03020302);
+  left_y = _mm_shuffle_epi8(left, y_select);
+  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x05040504);
+  left_y = _mm_shuffle_epi8(left, y_select);
+  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x07060706);
+  left_y = _mm_shuffle_epi8(left, y_select);
+  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[7]);
+  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal8x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[7]);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal8x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[7]);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+}
+
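+// For 16-wide blocks the row of weights no longer fits in eight 16-bit lanes,
+// so the kernels below split it into two 8-lane halves (weights1/weights2)
+// and write each row with a single WriteSmoothDirectionalSum16 call.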
+void SmoothHorizontal16x4_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  scale = _mm_set1_epi16(128);
+  __m128i y_mask = _mm_set1_epi32(0x01000100);
+  __m128i left_y = _mm_shuffle_epi8(left, y_mask);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                              scaled_top_right1, scaled_top_right2, scale);
+  dst += stride;
+  y_mask = _mm_set1_epi32(0x03020302);
+  left_y = _mm_shuffle_epi8(left, y_mask);
+  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                              scaled_top_right1, scaled_top_right2, scale);
+  dst += stride;
+  y_mask = _mm_set1_epi32(0x05040504);
+  left_y = _mm_shuffle_epi8(left, y_mask);
+  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                              scaled_top_right1, scaled_top_right2, scale);
+  dst += stride;
+  y_mask = _mm_set1_epi32(0x07060706);
+  left_y = _mm_shuffle_epi8(left, y_mask);
+  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                              scaled_top_right1, scaled_top_right2, scale);
+}
+
+void SmoothHorizontal16x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal16x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal16x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal16x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+    const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+      WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                  scaled_top_right1, scaled_top_right2, scale);
+      dst += stride;
+    }
+  }
+}
+
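+// 32-wide blocks extend the same scheme to four weight vectors and two
+// 16-byte stores per row (at dst and dst + 16).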
+void SmoothHorizontal32x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 =
+      _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 =
+      _mm_mullo_epi16(inverted_weights4, top_right);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal32x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 =
+      _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 =
+      _mm_mullo_epi16(inverted_weights4, top_right);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+  const __m128i left2 =
+      _mm_cvtepu8_epi16(LoadLo8(static_cast<const uint8_t*>(left_column) + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal32x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 =
+      _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 =
+      _mm_mullo_epi16(inverted_weights4, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal32x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 =
+      _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 =
+      _mm_mullo_epi16(inverted_weights4, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+    const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+      WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                  scaled_top_right1, scaled_top_right2, scale);
+      WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                  scaled_top_right3, scaled_top_right4, scale);
+      dst += stride;
+    }
+  }
+}
+
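+// 64-wide blocks use eight weight vectors and four 16-byte stores per row
+// (at dst, dst + 16, dst + 32, and dst + 48).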
+void SmoothHorizontal64x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[63]);
+  const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+  const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 =
+      _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 =
+      _mm_mullo_epi16(inverted_weights4, top_right);
+  const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+  const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+  const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+  const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+  const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+  const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+  const __m128i scaled_top_right5 =
+      _mm_mullo_epi16(inverted_weights5, top_right);
+  const __m128i scaled_top_right6 =
+      _mm_mullo_epi16(inverted_weights6, top_right);
+  const __m128i scaled_top_right7 =
+      _mm_mullo_epi16(inverted_weights7, top_right);
+  const __m128i scaled_top_right8 =
+      _mm_mullo_epi16(inverted_weights8, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+  const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal64x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[63]);
+  const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+  const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 =
+      _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 =
+      _mm_mullo_epi16(inverted_weights4, top_right);
+  const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+  const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+  const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+  const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+  const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+  const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+  const __m128i scaled_top_right5 =
+      _mm_mullo_epi16(inverted_weights5, top_right);
+  const __m128i scaled_top_right6 =
+      _mm_mullo_epi16(inverted_weights6, top_right);
+  const __m128i scaled_top_right7 =
+      _mm_mullo_epi16(inverted_weights7, top_right);
+  const __m128i scaled_top_right8 =
+      _mm_mullo_epi16(inverted_weights8, top_right);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+  const __m128i left3 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+  const __m128i left4 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal64x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[63]);
+  const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+  const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 =
+      _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 =
+      _mm_mullo_epi16(inverted_weights4, top_right);
+  const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+  const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+  const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+  const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+  const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+  const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+  const __m128i scaled_top_right5 =
+      _mm_mullo_epi16(inverted_weights5, top_right);
+  const __m128i scaled_top_right6 =
+      _mm_mullo_epi16(inverted_weights6, top_right);
+  const __m128i scaled_top_right7 =
+      _mm_mullo_epi16(inverted_weights7, top_right);
+  const __m128i scaled_top_right8 =
+      _mm_mullo_epi16(inverted_weights8, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+    const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+      WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                  scaled_top_right1, scaled_top_right2, scale);
+      WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                  scaled_top_right3, scaled_top_right4, scale);
+      WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                  scaled_top_right5, scaled_top_right6, scale);
+      WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                  scaled_top_right7, scaled_top_right8, scale);
+      dst += stride;
+    }
+  }
+}
+
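+// The vertical kernels compute the smooth-vertical predictor
+//   pred(x, y) = (top[x] * w[y] + bottom_left * (256 - w[y]) + 128) >> 8
+// with w[] taken from kSmoothWeights for the block height. For 4-wide blocks
+// the top row is interleaved with the bottom-left pixel so that one
+// _mm_madd_epi16 per row produces both products of the sum at once.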
+inline void LoadSmoothVerticalPixels4(const uint8_t* LIBGAV1_RESTRICT above,
+                                      const uint8_t* LIBGAV1_RESTRICT left,
+                                      const int height, __m128i* pixels) {
+  __m128i top = Load4(above);
+  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+  top = _mm_cvtepu8_epi16(top);
+  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+}
+
+// |weight_array| alternates weight vectors from the table with their inverted
+// (256 - w) counterparts. The compiler precomputes the inverted weights when
+// the table is visible to this module; removing that visibility can cut speed
+// by up to half in both the 4xH and 8xH predictors.
+inline void LoadSmoothVerticalWeights4(const uint8_t* LIBGAV1_RESTRICT
+                                           weight_array,
+                                       const int height, __m128i* weights) {
+  const __m128i inverter = _mm_set1_epi16(256);
+
+  if (height == 4) {
+    const __m128i weight = Load4(weight_array);
+    weights[0] = _mm_cvtepu8_epi16(weight);
+    weights[1] = _mm_sub_epi16(inverter, weights[0]);
+  } else if (height == 8) {
+    const __m128i weight = LoadLo8(weight_array + 4);
+    weights[0] = _mm_cvtepu8_epi16(weight);
+    weights[1] = _mm_sub_epi16(inverter, weights[0]);
+  } else {
+    const __m128i weight = LoadUnaligned16(weight_array + 12);
+    const __m128i zero = _mm_setzero_si128();
+    weights[0] = _mm_cvtepu8_epi16(weight);
+    weights[1] = _mm_sub_epi16(inverter, weights[0]);
+    weights[2] = _mm_unpackhi_epi8(weight, zero);
+    weights[3] = _mm_sub_epi16(inverter, weights[2]);
+  }
+}
+
+inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
+                                   const int height,
+                                   uint8_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32(128);
+  const __m128i mask_increment = _mm_set1_epi16(0x0202);
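+  // Despite its name, this shuffle control packs results: it gathers the low
+  // byte of each 32-bit lane (bytes 0, 4, 8 and 12), narrowing four epi32
+  // sums to four pixels for Store4.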
+  const __m128i cvtepu8_epi32 = _mm_set1_epi32(0x0C080400);
+  __m128i y_select = _mm_set1_epi16(0x0100);
+
+  for (int y = 0; y < height; ++y) {
+    const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
+    const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
+    const __m128i alternate_weights =
+        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+    // Here |pixel[0]| interleaves the top row with the bottom-left corner
+    // pixel: top_row[0], corner, top_row[1], corner, ...
+    // The madd instruction yields four 32-bit results of the form:
+    // (top_row[x] * weight[y] + corner * inverted_weight[y])
+    __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, 8);
+    sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
+    Store4(dst, sum);
+    dst += stride;
+    y_select = _mm_add_epi16(y_select, mask_increment);
+  }
+}
+
+void SmoothVertical4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const auto* const above = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i pixels;
+  LoadSmoothVerticalPixels4(above, left, 4, &pixels);
+
+  __m128i weights[2];
+  LoadSmoothVerticalWeights4(kSmoothWeights, 4, weights);
+
+  WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride);
+}
+
+void SmoothVertical4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const auto* const above = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i pixels;
+  LoadSmoothVerticalPixels4(above, left, 8, &pixels);
+
+  __m128i weights[2];
+  LoadSmoothVerticalWeights4(kSmoothWeights, 8, weights);
+
+  WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+}
+
+void SmoothVertical4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const auto* const above = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i pixels;
+  LoadSmoothVerticalPixels4(above, left, 16, &pixels);
+
+  __m128i weights[4];
+  LoadSmoothVerticalWeights4(kSmoothWeights, 16, weights);
+
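+  // Sixteen rows are written as two 8-row passes: weights[0..1] cover rows
+  // 0-7 and weights[2..3] cover rows 8-15.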
+  WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+  dst += stride << 3;
+  WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride);
+}
+
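+// The wider vertical kernels broadcast a single weight per row with the same
+// y_select shuffle used by the horizontal kernels, then blend the whole top
+// row against the precomputed scaled bottom-left.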
+void SmoothVertical8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+  const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i y_select = _mm_set1_epi32(0x01000100);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x03020302);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x05040504);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x07060706);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+}
+
+void SmoothVertical8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  scale = _mm_set1_epi16(128);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+}
+
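+// 16-wide vertical kernels load the top row once and reuse it for every row;
+// only the broadcast weight and scaled bottom-left change per row.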
+void SmoothVertical16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+  const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+  __m128i y_select = _mm_set1_epi32(0x01000100);
+  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x03020302);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x05040504);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x07060706);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+}
+
+void SmoothVertical16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical16x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+  const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+  const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+  const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+  const __m128i scaled_bottom_left_lo =
+      _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+  const __m128i scaled_bottom_left_hi =
+      _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical16x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical16x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i round = _mm_set1_epi16(128);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
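+  // The 64 row weights begin at kSmoothWeights + 60 (the n - 4 offset rule);
+  // each outer iteration consumes 16 of them.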
+  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+    const __m128i scaled_bottom_left_lo =
+        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+    const __m128i scaled_bottom_left_hi =
+        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+      WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+      WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+  }
+}
+
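+// The 32- and 64-wide variants below tile each output row into 16-pixel
+// chunks, calling WriteSmoothDirectionalSum16 once per chunk at dst, dst + 16,
+// dst + 32, and dst + 48 as the width requires.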
+void SmoothVertical32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+  const __m128i top_lo = LoadUnaligned16(top_ptr);
+  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical32x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+  const __m128i top_lo = LoadUnaligned16(top_ptr);
+  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  scale = _mm_set1_epi16(128);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical32x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i top_lo = LoadUnaligned16(top_ptr);
+  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical32x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+  const __m128i top_lo = LoadUnaligned16(top_ptr);
+  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i round = _mm_set1_epi16(128);
+  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+    const __m128i scaled_bottom_left_lo =
+        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+    const __m128i scaled_bottom_left_hi =
+        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+  }
+}
+
+void SmoothVertical64x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top_lolo = LoadUnaligned16(top_ptr);
+  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  scale = _mm_set1_epi16(128);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical64x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i top_lolo = LoadUnaligned16(top_ptr);
+  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical64x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+  const __m128i top_lolo = LoadUnaligned16(top_ptr);
+  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i round = _mm_set1_epi16(128);
+  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+    const __m128i scaled_bottom_left_lo =
+        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+    const __m128i scaled_bottom_left_hi =
+        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+  }
+}
+
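+// Installs the SSE4.1 predictors into the writable dispatch table. Each
+// assignment is compiled in only when the matching
+// LIBGAV1_Dsp8bpp_*_IntraPredictorSmooth* macro (see the header) selected the
+// SSE4.1 implementation for that transform size.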
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      Smooth4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      Smooth4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      Smooth4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      Smooth8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      Smooth8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      Smooth8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      Smooth8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      SmoothWxH<16, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      SmoothWxH<16, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      SmoothWxH<16, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      SmoothWxH<16, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      SmoothWxH<16, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      SmoothWxH<32, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      SmoothWxH<32, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      SmoothWxH<32, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      SmoothWxH<32, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      SmoothWxH<64, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      SmoothWxH<64, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      SmoothWxH<64, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      SmoothVertical16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      SmoothVertical32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      SmoothVertical64x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal64x64_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void IntraPredSmoothInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredSmoothInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_smooth_sse4.h b/src/dsp/x86/intrapred_smooth_sse4.h
new file mode 100644 (file)
index 0000000..9353371
--- /dev/null
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_SSE4_1();
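+// Expected to be called once during DSP table setup, before decoding starts;
+// hence the thread-safety note above.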
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If SSE4.1 is being targeted and the macro has not already been claimed by a
+// higher level of optimization, signal that the SSE4.1 implementation should
+// be used.
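+// For example, a hypothetical AVX2 header included earlier could define
+// LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth itself; the #ifndef
+// below would then leave that definition in place.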
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc
new file mode 100644 (file)
index 0000000..556afed
--- /dev/null
@@ -0,0 +1,2200 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// Utility Functions
+
+// This is a fast way to divide by a number of the form 2^n + 2^k, n > k.
+// Divide by 2^k by right shifting by k, leaving a denominator of 2^(n-k) + 1.
+// In the block size cases, n - k is 1 or 2 (the block is proportional to 1x2
+// or 1x4), so we use a multiplier that reflects division by 2+1=3 or 4+1=5 in
+// the high 16 bits.
+constexpr int kThreeInverse = 0x5556;
+constexpr int kFiveInverse = 0x3334;
+template <int shiftk, int multiplier>
+inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
+  const __m128i interm = _mm_srli_epi32(dividend, shiftk);
+  return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
+}
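+
+// For example, with shiftk=2 and multiplier=kThreeInverse a sum is divided by
+// 12 = 2^3 + 2^2: the shift divides by 4, and the mulhi by 0x5556 (roughly
+// 1/3 in 16-bit fixed point) divides the result by 3. The approximation is
+// exact for the 8-bit pixel sums that occur here.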
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_SSE4_1
+
+using DcSumFunc = __m128i (*)(const void* ref);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const __m128i dc);
+using WriteDuplicateFunc = void (*)(void* dest, ptrdiff_t stride,
+                                    const __m128i column);
+// For copying an entire column across a block.
+using ColumnStoreFunc = void (*)(void* dest, ptrdiff_t stride,
+                                 const void* column);
+
+// DC intra-predictors for non-square blocks.
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+struct DcPredFuncs_SSE4_1 {
+  DcPredFuncs_SSE4_1() = delete;
+
+  static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+                    const void* left_column);
+  static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+  static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+                 const void* left_column);
+};
+
+// Directional intra-predictors for square blocks.
+template <ColumnStoreFunc col_storefn>
+struct DirectionalPredFuncs_SSE4_1 {
+  DirectionalPredFuncs_SSE4_1() = delete;
+
+  static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+                       const void* left_column);
+  static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+                         const void* left_column);
+};
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<
+    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+    dc_mult>::DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* /*left_column*/) {
+  const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
+  const __m128i sum = top_sumfn(top_row);
+  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
+  storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<
+    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+    dc_mult>::DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                     const void* /*top_row*/,
+                     const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
+  const __m128i sum = left_sumfn(left_column);
+  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
+  storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<
+    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+    dc_mult>::Dc(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                 const void* LIBGAV1_RESTRICT const top_row,
+                 const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i rounder =
+      _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
+  const __m128i sum_top = top_sumfn(top_row);
+  const __m128i sum_left = left_sumfn(left_column);
+  const __m128i sum = _mm_add_epi32(sum_top, sum_left);
+  if (width_log2 == height_log2) {
+    const __m128i dc =
+        _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2 + 1);
+    storefn(dest, stride, dc);
+  } else {
+    const __m128i dc =
+        DivideByMultiplyShift_U32<shiftk, dc_mult>(_mm_add_epi32(sum, rounder));
+    storefn(dest, stride, dc);
+  }
+}
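+
+// In scalar terms, Dc() above computes
+//   dc = (sum_top + sum_left + ((width + height) >> 1)) / (width + height),
+// using a plain shift when the block is square and the multiply-shift
+// division when it is not.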
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_SSE4_1 directional predictors
+
+template <ColumnStoreFunc col_storefn>
+void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
+  col_storefn(dest, stride, left_column);
+}
+
+}  // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
+// |ref| points to 4 bytes containing 4 packed 8-bit pixels.
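+// Summing uses _mm_sad_epu8 against zero: the sum of absolute differences
+// |x - 0| is simply the byte sum, accumulated into the low 16 bits of each
+// 64-bit half of the register.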
+inline __m128i DcSum4_SSE4_1(const void* const ref) {
+  const __m128i vals = Load4(ref);
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_sad_epu8(vals, zero);
+}
+
+inline __m128i DcSum8_SSE4_1(const void* const ref) {
+  const __m128i vals = LoadLo8(ref);
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_sad_epu8(vals, zero);
+}
+
+inline __m128i DcSum16_SSE4_1(const void* const ref) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i vals = LoadUnaligned16(ref);
+  const __m128i partial_sum = _mm_sad_epu8(vals, zero);
+  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum32_SSE4_1(const void* const ref) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i vals1 = LoadUnaligned16(ref);
+  const __m128i vals2 = LoadUnaligned16(static_cast<const uint8_t*>(ref) + 16);
+  const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+  const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+  const __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum64_SSE4_1(const void* const ref) {
+  const auto* const ref_ptr = static_cast<const uint8_t*>(ref);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i vals1 = LoadUnaligned16(ref_ptr);
+  const __m128i vals2 = LoadUnaligned16(ref_ptr + 16);
+  const __m128i vals3 = LoadUnaligned16(ref_ptr + 32);
+  const __m128i vals4 = LoadUnaligned16(ref_ptr + 48);
+  const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+  const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+  __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+  const __m128i partial_sum3 = _mm_sad_epu8(vals3, zero);
+  partial_sum = _mm_add_epi16(partial_sum, partial_sum3);
+  const __m128i partial_sum4 = _mm_sad_epu8(vals4, zero);
+  partial_sum = _mm_add_epi16(partial_sum, partial_sum4);
+  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    Store4(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  Store4(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore8xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreLo8(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreLo8(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore16xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                               const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreUnaligned16(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreUnaligned16(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore32xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                               const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreUnaligned16(dst, dc_dup);
+    StoreUnaligned16(dst + 16, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreUnaligned16(dst, dc_dup);
+  StoreUnaligned16(dst + 16, dc_dup);
+}
+
+template <int height>
+inline void DcStore64xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                               const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreUnaligned16(dst, dc_dup);
+    StoreUnaligned16(dst + 16, dc_dup);
+    StoreUnaligned16(dst + 32, dc_dup);
+    StoreUnaligned16(dst + 48, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreUnaligned16(dst, dc_dup);
+  StoreUnaligned16(dst + 16, dc_dup);
+  StoreUnaligned16(dst + 32, dc_dup);
+  StoreUnaligned16(dst + 48, dc_dup);
+}
+
+// WriteDuplicateN assumes dup has 4 sets of 4 identical bytes that are meant to
+// be copied for width N into dest.
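+// For example, if dup32 holds bytes {a a a a, b b b b, c c c c, d d d d}, the
+// 4x4 variant writes the rows "aaaa", "bbbb", "cccc", "dddd".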
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  Store4(dst, dup32);
+  dst += stride;
+  const int row1 = _mm_extract_epi32(dup32, 1);
+  memcpy(dst, &row1, 4);
+  dst += stride;
+  const int row2 = _mm_extract_epi32(dup32, 2);
+  memcpy(dst, &row2, 4);
+  dst += stride;
+  const int row3 = _mm_extract_epi32(dup32, 3);
+  memcpy(dst, &row3, 4);
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+  auto* dst = static_cast<uint8_t*>(dest);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+  dst += stride;
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+// ColStoreN<height> copies each of the |height| values in |column| across its
+// corresponding row in dest.
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const column) {
+  const __m128i col_data = Load4(column);
+  const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+  const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
+  writefn(dest, stride, col_dup32);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  const __m128i col_data = LoadLo8(column);
+  const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+  const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_dup16, col_dup16);
+  auto* dst = static_cast<uint8_t*>(dest);
+  writefn(dst, stride, col_dup32_lo);
+  dst += stride4;
+  const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_dup16, col_dup16);
+  writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column));
+  const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+  const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+  const __m128i col_dup32_lolo = _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+  auto* dst = static_cast<uint8_t*>(dest);
+  writefn(dst, stride, col_dup32_lolo);
+  dst += stride4;
+  const __m128i col_dup32_lohi = _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+  writefn(dst, stride, col_dup32_lohi);
+  dst += stride4;
+  const __m128i col_dup32_hilo = _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+  writefn(dst, stride, col_dup32_hilo);
+  dst += stride4;
+  const __m128i col_dup32_hihi = _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+  writefn(dst, stride, col_dup32_hihi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 32; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+    const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+    const __m128i col_dup32_lolo =
+        _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lolo);
+    dst += stride4;
+    const __m128i col_dup32_lohi =
+        _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lohi);
+    dst += stride4;
+    const __m128i col_dup32_hilo =
+        _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hilo);
+    dst += stride4;
+    const __m128i col_dup32_hihi =
+        _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hihi);
+    dst += stride4;
+  }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 64; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+    const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+    const __m128i col_dup32_lolo =
+        _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lolo);
+    dst += stride4;
+    const __m128i col_dup32_lohi =
+        _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lohi);
+    dst += stride4;
+    const __m128i col_dup32_hilo =
+        _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hilo);
+    dst += stride4;
+    const __m128i col_dup32_hihi =
+        _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hihi);
+    dst += stride4;
+  }
+}
+
+struct DcDefs {
+  DcDefs() = delete;
+
+  using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+                                  DcStore4xH_SSE4_1<4>, 0, 0>;
+  // shiftk is the smaller of width_log2 and height_log2.
+  // dc_mult is the 16-bit fixed-point inverse used after the shift: 1/3 when
+  // the width:height ratio is 1:2 or 2:1, and 1/5 when it is 1:4 or 4:1.
+  using _4x8 = DcPredFuncs_SSE4_1<2, 3, DcSum4_SSE4_1, DcSum8_SSE4_1,
+                                  DcStore4xH_SSE4_1<8>, 2, kThreeInverse>;
+  using _4x16 = DcPredFuncs_SSE4_1<2, 4, DcSum4_SSE4_1, DcSum16_SSE4_1,
+                                   DcStore4xH_SSE4_1<16>, 2, kFiveInverse>;
+
+  using _8x4 = DcPredFuncs_SSE4_1<3, 2, DcSum8_SSE4_1, DcSum4_SSE4_1,
+                                  DcStore8xH_SSE4_1<4>, 2, kThreeInverse>;
+  using _8x8 = DcPredFuncs_SSE4_1<3, 3, DcSum8_SSE4_1, DcSum8_SSE4_1,
+                                  DcStore8xH_SSE4_1<8>, 0, 0>;
+  using _8x16 = DcPredFuncs_SSE4_1<3, 4, DcSum8_SSE4_1, DcSum16_SSE4_1,
+                                   DcStore8xH_SSE4_1<16>, 3, kThreeInverse>;
+  using _8x32 = DcPredFuncs_SSE4_1<3, 5, DcSum8_SSE4_1, DcSum32_SSE4_1,
+                                   DcStore8xH_SSE4_1<32>, 3, kFiveInverse>;
+
+  using _16x4 = DcPredFuncs_SSE4_1<4, 2, DcSum16_SSE4_1, DcSum4_SSE4_1,
+                                   DcStore16xH_SSE4_1<4>, 2, kFiveInverse>;
+  using _16x8 = DcPredFuncs_SSE4_1<4, 3, DcSum16_SSE4_1, DcSum8_SSE4_1,
+                                   DcStore16xH_SSE4_1<8>, 3, kThreeInverse>;
+  using _16x16 = DcPredFuncs_SSE4_1<4, 4, DcSum16_SSE4_1, DcSum16_SSE4_1,
+                                    DcStore16xH_SSE4_1<16>, 0, 0>;
+  using _16x32 = DcPredFuncs_SSE4_1<4, 5, DcSum16_SSE4_1, DcSum32_SSE4_1,
+                                    DcStore16xH_SSE4_1<32>, 4, kThreeInverse>;
+  using _16x64 = DcPredFuncs_SSE4_1<4, 6, DcSum16_SSE4_1, DcSum64_SSE4_1,
+                                    DcStore16xH_SSE4_1<64>, 4, kFiveInverse>;
+
+  using _32x8 = DcPredFuncs_SSE4_1<5, 3, DcSum32_SSE4_1, DcSum8_SSE4_1,
+                                   DcStore32xH_SSE4_1<8>, 3, kFiveInverse>;
+  using _32x16 = DcPredFuncs_SSE4_1<5, 4, DcSum32_SSE4_1, DcSum16_SSE4_1,
+                                    DcStore32xH_SSE4_1<16>, 4, kThreeInverse>;
+  using _32x32 = DcPredFuncs_SSE4_1<5, 5, DcSum32_SSE4_1, DcSum32_SSE4_1,
+                                    DcStore32xH_SSE4_1<32>, 0, 0>;
+  using _32x64 = DcPredFuncs_SSE4_1<5, 6, DcSum32_SSE4_1, DcSum64_SSE4_1,
+                                    DcStore32xH_SSE4_1<64>, 5, kThreeInverse>;
+
+  using _64x16 = DcPredFuncs_SSE4_1<6, 4, DcSum64_SSE4_1, DcSum16_SSE4_1,
+                                    DcStore64xH_SSE4_1<16>, 4, kFiveInverse>;
+  using _64x32 = DcPredFuncs_SSE4_1<6, 5, DcSum64_SSE4_1, DcSum32_SSE4_1,
+                                    DcStore64xH_SSE4_1<32>, 5, kThreeInverse>;
+  using _64x64 = DcPredFuncs_SSE4_1<6, 6, DcSum64_SSE4_1, DcSum64_SSE4_1,
+                                    DcStore64xH_SSE4_1<64>, 0, 0>;
+};
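+
+// As a concrete mapping for DcDefs above: _4x8 sums 4 + 8 = 12 pixels, so
+// Dc() shifts the rounded sum right by shiftk=2 (divide by 4) and multiplies
+// by kThreeInverse (divide by 3); _4x16 sums 20 pixels and pairs shiftk=2
+// with kFiveInverse.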
+
+struct DirDefs {
+  DirDefs() = delete;
+
+  using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+  using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+  using _4x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+  using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+  using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+  using _8x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+  using _8x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+  using _16x4 =
+      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+  using _16x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+  using _16x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+  using _16x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+  using _16x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+  using _32x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+  using _32x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+  using _32x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+  using _32x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+  using _64x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+  using _64x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+  using _64x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
+template <int y_mask>
+inline void WritePaethLine4(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
+                            const __m128i& left, const __m128i& top_lefts,
+                            const __m128i& top_dists, const __m128i& left_dists,
+                            const __m128i& top_left_diffs) {
+  const __m128i top_dists_y = _mm_shuffle_epi32(top_dists, y_mask);
+
+  const __m128i lefts_y = _mm_shuffle_epi32(left, y_mask);
+  const __m128i top_left_dists =
+      _mm_abs_epi32(_mm_add_epi32(lefts_y, top_left_diffs));
+
+  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+  // operation is unavailable, so the logic for selecting top, left, or
+  // top_left is inverted.
+  __m128i not_select_left = _mm_cmpgt_epi32(left_dists, top_left_dists);
+  not_select_left =
+      _mm_or_si128(not_select_left, _mm_cmpgt_epi32(left_dists, top_dists_y));
+  const __m128i not_select_top = _mm_cmpgt_epi32(top_dists_y, top_left_dists);
+
+  const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+  top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+  // The sequence of 32-bit packed operations was found (see CL via blame) to
+  // outperform 16-bit operations, despite the availability of the packus
+  // function, when tested on a Xeon E7 v3.
+  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+  const __m128i pred = _mm_shuffle_epi8(
+      _mm_or_si128(left_out, top_or_top_left_out), cvtepi32_epi8);
+  Store4(dst, pred);
+}
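+
+// For reference, the scalar selection that the vector code above implements
+// in inverted form (AV1 spec section 7.11.2.2):
+//   base = top[x] + left[y] - top_left;
+//   pred = (p_left <= p_top && p_left <= p_top_left) ? left[y]
+//        : (p_top <= p_top_left)                     ? top[x]
+//                                                    : top_left;
+// where each p_* is the absolute difference between base and that neighbor.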
+
+// top_left_diffs is the only variable whose values may exceed 8 bits;
+// otherwise all of these operations could be done as epi8 for a 16-pixel
+// version of this function. Still, since lefts_y is just a vector of
+// duplicates, it could pay off to keep top_left_dists wide for cmpgt and
+// repack into epi8 for the blends.
+template <int y_mask>
+inline void WritePaethLine8(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
+                            const __m128i& left, const __m128i& top_lefts,
+                            const __m128i& top_dists, const __m128i& left_dists,
+                            const __m128i& top_left_diffs) {
+  const __m128i select_y = _mm_set1_epi32(y_mask);
+  const __m128i top_dists_y = _mm_shuffle_epi8(top_dists, select_y);
+
+  const __m128i lefts_y = _mm_shuffle_epi8(left, select_y);
+  const __m128i top_left_dists =
+      _mm_abs_epi16(_mm_add_epi16(lefts_y, top_left_diffs));
+
+  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+  // operation is unavailable, so the logic for selecting top, left, or
+  // top_left is inverted.
+  __m128i not_select_left = _mm_cmpgt_epi16(left_dists, top_left_dists);
+  not_select_left =
+      _mm_or_si128(not_select_left, _mm_cmpgt_epi16(left_dists, top_dists_y));
+  const __m128i not_select_top = _mm_cmpgt_epi16(top_dists_y, top_left_dists);
+
+  const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+  top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+  const __m128i pred = _mm_packus_epi16(
+      _mm_or_si128(left_out, top_or_top_left_out), /* unused */ left_out);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), pred);
+}
+
+// |top| is an epi8 of length 16
+// |left| is epi8 of unknown length, as y_mask specifies access
+// |top_lefts| is an epi8 of 16 duplicates
+// |top_dists| is an epi8 of unknown length, as y_mask specifies access
+// |left_dists| is an epi8 of length 16
+// |left_dists_lo| is an epi16 of length 8
+// |left_dists_hi| is an epi16 of length 8
+// |top_left_diffs_lo| is an epi16 of length 8
+// |top_left_diffs_hi| is an epi16 of length 8
+// The latter two vectors are epi16 because their values may reach -510.
+// |left_dists| is provided alongside its spread out version because it doesn't
+// change between calls and interacts with both kinds of packing.
+template <int y_mask>
+inline void WritePaethLine16(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
+                             const __m128i& left, const __m128i& top_lefts,
+                             const __m128i& top_dists,
+                             const __m128i& left_dists,
+                             const __m128i& left_dists_lo,
+                             const __m128i& left_dists_hi,
+                             const __m128i& top_left_diffs_lo,
+                             const __m128i& top_left_diffs_hi) {
+  const __m128i select_y = _mm_set1_epi32(y_mask);
+  const __m128i top_dists_y8 = _mm_shuffle_epi8(top_dists, select_y);
+  const __m128i top_dists_y16 = _mm_cvtepu8_epi16(top_dists_y8);
+  const __m128i lefts_y8 = _mm_shuffle_epi8(left, select_y);
+  const __m128i lefts_y16 = _mm_cvtepu8_epi16(lefts_y8);
+
+  const __m128i top_left_dists_lo =
+      _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_lo));
+  const __m128i top_left_dists_hi =
+      _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_hi));
+
+  const __m128i left_gt_top_left_lo = _mm_packs_epi16(
+      _mm_cmpgt_epi16(left_dists_lo, top_left_dists_lo), left_dists_lo);
+  const __m128i left_gt_top_left_hi =
+      _mm_packs_epi16(_mm_cmpgt_epi16(left_dists_hi, top_left_dists_hi),
+                      /* unused second arg for pack */ left_dists_hi);
+  const __m128i left_gt_top_left = _mm_alignr_epi8(
+      left_gt_top_left_hi, _mm_slli_si128(left_gt_top_left_lo, 8), 8);
+
+  const __m128i not_select_top_lo =
+      _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_lo),
+                      /* unused second arg for pack */ top_dists_y16);
+  const __m128i not_select_top_hi =
+      _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_hi),
+                      /* unused second arg for pack */ top_dists_y16);
+  const __m128i not_select_top = _mm_alignr_epi8(
+      not_select_top_hi, _mm_slli_si128(not_select_top_lo, 8), 8);
+
+  const __m128i left_leq_top =
+      _mm_cmpeq_epi8(left_dists, _mm_min_epu8(top_dists_y8, left_dists));
+  const __m128i select_left = _mm_andnot_si128(left_gt_top_left, left_leq_top);
+
+  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+  // operation is unavailable, so the logic for selecting top, left, or
+  // top_left is inverted.
+  const __m128i left_out = _mm_and_si128(select_left, lefts_y8);
+
+  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+  top_or_top_left_out = _mm_andnot_si128(select_left, top_or_top_left_out);
+  const __m128i pred = _mm_or_si128(left_out, top_or_top_left_out);
+
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
+}
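+
+// Note that the min_epu8/cmpeq pair above implements the unsigned comparison
+// left_dists <= top_dists, for which there is no direct epi8 instruction:
+// x <= y exactly when x == min(x, y).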
+
+void Paeth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                     const void* LIBGAV1_RESTRICT const top_row,
+                     const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
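+  // (both follow by substituting base into the distance: the left[y] terms
+  // cancel in pLeft and the top[x] terms cancel in pTop, leaving only
+  // differences against the top-left pixel).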
+  const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+  const __m128i top_dists = _mm_abs_epi32(_mm_sub_epi32(left, top_lefts));
+
+  const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+  const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaethLine4<0>(dst, top, left, top_lefts, top_dists, left_dists,
+                     top_left_diff);
+  dst += stride;
+  WritePaethLine4<0x55>(dst, top, left, top_lefts, top_dists, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xAA>(dst, top, left, top_lefts, top_dists, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xFF>(dst, top, left, top_lefts, top_dists, left_dists,
+                        top_left_diff);
+}
+
+void Paeth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                     const void* LIBGAV1_RESTRICT const top_row,
+                     const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left = LoadLo8(left_column);
+  const __m128i left_lo = _mm_cvtepu8_epi32(left);
+  const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
+
+  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+  const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+  const __m128i top_dists_lo = _mm_abs_epi32(_mm_sub_epi32(left_lo, top_lefts));
+  const __m128i top_dists_hi = _mm_abs_epi32(_mm_sub_epi32(left_hi, top_lefts));
+
+  const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+  const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaethLine4<0>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+                     top_left_diff);
+  dst += stride;
+  WritePaethLine4<0x55>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xAA>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xFF>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+                     top_left_diff);
+  dst += stride;
+  WritePaethLine4<0x55>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xAA>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xFF>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+                        top_left_diff);
+}
+
+void Paeth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left = LoadUnaligned16(left_column);
+  const __m128i left_0 = _mm_cvtepu8_epi32(left);
+  const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
+  const __m128i left_2 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 8));
+  const __m128i left_3 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 12));
+
+  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+  const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+  const __m128i top_dists_0 = _mm_abs_epi32(_mm_sub_epi32(left_0, top_lefts));
+  const __m128i top_dists_1 = _mm_abs_epi32(_mm_sub_epi32(left_1, top_lefts));
+  const __m128i top_dists_2 = _mm_abs_epi32(_mm_sub_epi32(left_2, top_lefts));
+  const __m128i top_dists_3 = _mm_abs_epi32(_mm_sub_epi32(left_3, top_lefts));
+
+  const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+  const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaethLine4<0>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+                     top_left_diff);
+  dst += stride;
+  WritePaethLine4<0x55>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xAA>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xFF>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+                     top_left_diff);
+  dst += stride;
+  WritePaethLine4<0x55>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xAA>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xFF>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+                     top_left_diff);
+  dst += stride;
+  WritePaethLine4<0x55>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xAA>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xFF>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+                     top_left_diff);
+  dst += stride;
+  WritePaethLine4<0x55>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xAA>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xFF>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+                        top_left_diff);
+}
+
+void Paeth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                     const void* LIBGAV1_RESTRICT const top_row,
+                     const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+  const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+  const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
+
+  const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+  const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+}
+
+void Paeth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                     const void* LIBGAV1_RESTRICT const top_row,
+                     const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+  const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+  const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
+
+  const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+  const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x09080908>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x0B0A0B0A>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x0D0C0D0C>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x0F0E0F0E>(dst, top, left, top_lefts, top_dists, left_dists,
+                              top_left_diff);
+}
+
+void Paeth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left = LoadUnaligned16(left_column);
+  const __m128i left_lo = _mm_cvtepu8_epi16(left);
+  const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8));
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+  const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+  const __m128i top_dists_lo = _mm_abs_epi16(_mm_sub_epi16(left_lo, top_lefts));
+  const __m128i top_dists_hi = _mm_abs_epi16(_mm_sub_epi16(left_hi, top_lefts));
+
+  const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+  const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaethLine8<0x01000100>(dst, top, left_lo, top_lefts, top_dists_lo,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x03020302>(dst, top, left_lo, top_lefts, top_dists_lo,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x05040504>(dst, top, left_lo, top_lefts, top_dists_lo,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x07060706>(dst, top, left_lo, top_lefts, top_dists_lo,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x09080908>(dst, top, left_lo, top_lefts, top_dists_lo,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x0B0A0B0A>(dst, top, left_lo, top_lefts, top_dists_lo,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x0D0C0D0C>(dst, top, left_lo, top_lefts, top_dists_lo,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x0F0E0F0E>(dst, top, left_lo, top_lefts, top_dists_lo,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x01000100>(dst, top, left_hi, top_lefts, top_dists_hi,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x03020302>(dst, top, left_hi, top_lefts, top_dists_hi,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x05040504>(dst, top, left_hi, top_lefts, top_dists_hi,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x07060706>(dst, top, left_hi, top_lefts, top_dists_hi,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x09080908>(dst, top, left_hi, top_lefts, top_dists_hi,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x0B0A0B0A>(dst, top, left_hi, top_lefts, top_dists_hi,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x0D0C0D0C>(dst, top, left_hi, top_lefts, top_dists_hi,
+                              left_dists, top_left_diff);
+  dst += stride;
+  WritePaethLine8<0x0F0E0F0E>(dst, top, left_hi, top_lefts, top_dists_hi,
+                              left_dists, top_left_diff);
+}
+
+void Paeth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* const dst = static_cast<uint8_t*>(dest);
+  Paeth8x16_SSE4_1(dst, stride, top_row, left_column);
+  Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16);
+}
+
+void Paeth16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left = Load4(left_column);
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_lefts16 = _mm_set1_epi16(top_ptr[-1]);
+  const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_ptr[-1]));
+
+  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+
+  const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+                                          _mm_subs_epu8(top_lefts8, top));
+  const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+  const __m128i left_dists_hi =
+      _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+  const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+                                         _mm_subs_epu8(top_lefts8, left));
+
+  const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+  const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+  const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+                      left_dists_lo, left_dists_hi, top_left_diff_lo,
+                      top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+}
+
+// Inlined so that larger transform sizes can call it at row offsets while
+// still passing the block's original top_left.
+inline void WritePaeth16x8(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                           const uint8_t top_left, const __m128i top,
+                           const __m128i left) {
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+  const __m128i top_lefts16 = _mm_set1_epi16(top_left);
+  const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
+
+  // Given that the spec defines "base" as top[x] + left[y] - top_left,
+  // pLeft = abs(base - left[y]) = abs(top[x] - top_left)
+  // pTop = abs(base - top[x]) = abs(left[y] - top_left)
+
+  const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+                                          _mm_subs_epu8(top_lefts8, top));
+  const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+  const __m128i left_dists_hi =
+      _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+  const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+                                         _mm_subs_epu8(top_lefts8, left));
+
+  const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+  const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+  const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+                      left_dists_lo, left_dists_hi, top_left_diff_lo,
+                      top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+}
+
+void Paeth16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i left = LoadLo8(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  WritePaeth16x8(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
+}
+
+void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left,
+                     const __m128i top, const __m128i left) {
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+  const __m128i top_lefts16 = _mm_set1_epi16(top_left);
+  const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
+
+  // Given that the spec defines "base" as top[x] + left[y] - top_left,
+  // pLeft = abs(base - left[y]) = abs(top[x] - top_left)
+  // pTop = abs(base - top[x]) = abs(left[y] - top_left)
+
+  const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+                                          _mm_subs_epu8(top_lefts8, top));
+  const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+  const __m128i left_dists_hi =
+      _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+  const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+                                         _mm_subs_epu8(top_lefts8, left));
+
+  const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+  const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+  const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+                      left_dists_lo, left_dists_hi, top_left_diff_lo,
+                      top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x08080808>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x09090909>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x0A0A0A0A>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x0B0B0B0B>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x0C0C0C0C>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x0D0D0D0D>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x0E0E0E0E>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x0F0F0F0F>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+}
+
+void Paeth16x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left = LoadUnaligned16(left_column);
+  const __m128i top = LoadUnaligned16(top_row);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
+}
+
+void Paeth16x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left_0 = LoadUnaligned16(left_column);
+  const __m128i top = LoadUnaligned16(top_row);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const uint8_t top_left = top_ptr[-1];
+  auto* const dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top, left_0);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1);
+}
+
+void Paeth16x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* LIBGAV1_RESTRICT const left_column) {
+  const ptrdiff_t stride16 = stride << 4;
+  const __m128i left_0 = LoadUnaligned16(left_column);
+  const __m128i top = LoadUnaligned16(top_row);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top, left_0);
+  dst += stride16;
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  WritePaeth16x16(dst, stride, top_left, top, left_1);
+  dst += stride16;
+  const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+  WritePaeth16x16(dst, stride, top_left, top, left_2);
+  dst += stride16;
+  const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+  WritePaeth16x16(dst, stride, top_left, top, left_3);
+}
+
+void Paeth32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left = LoadLo8(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_row);
+  const uint8_t top_left = top_ptr[-1];
+  auto* const dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x8(dst, stride, top_left, top_0, left);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  WritePaeth16x8(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left = LoadUnaligned16(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_row);
+  const uint8_t top_left = top_ptr[-1];
+  auto* const dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_0 = LoadUnaligned16(left_ptr);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_ptr);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+}
+
+void Paeth32x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_0 = LoadUnaligned16(left_ptr);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_ptr);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+  const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+}
+
+void Paeth64x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i left = LoadUnaligned16(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_ptr);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left);
+}
+
+void Paeth64x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_0 = LoadUnaligned16(left_ptr);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_ptr);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+}
+
+void Paeth64x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
+                       const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_0 = LoadUnaligned16(left_ptr);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+  const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_ptr);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_2);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_2);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_3);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+// These guards check that this version of the function was not superseded by
+// a higher optimization level, such as AVX. The corresponding #define also
+// prevents the C version from being added to the table.
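+// For illustration, DSP_ENABLED_8BPP_SSE4_1 (defined in src/dsp/dsp.h)
+// expands roughly as follows -- an assumption about its exact form:
+//   #define DSP_ENABLED_8BPP_SSE4_1(func) \
+//     (LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)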
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+      DcDefs::_4x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+      DcDefs::_4x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+      DcDefs::_8x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+      DcDefs::_8x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+      DcDefs::_8x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+      DcDefs::_8x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+      DcDefs::_16x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+      DcDefs::_16x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+      DcDefs::_16x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+      DcDefs::_16x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+      DcDefs::_16x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+      DcDefs::_32x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+      DcDefs::_32x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+      DcDefs::_32x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+      DcDefs::_32x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+      DcDefs::_64x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+      DcDefs::_64x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+      DcDefs::_64x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+      DcDefs::_4x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+      DcDefs::_4x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+      DcDefs::_8x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+      DcDefs::_8x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+      DcDefs::_8x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+      DcDefs::_8x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+      DcDefs::_16x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+      DcDefs::_16x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+      DcDefs::_16x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+      DcDefs::_16x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+      DcDefs::_16x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+      DcDefs::_32x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+      DcDefs::_32x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+      DcDefs::_32x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+      DcDefs::_32x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+      DcDefs::_64x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+      DcDefs::_64x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+      DcDefs::_64x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+      DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+      DcDefs::_4x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+      DcDefs::_4x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+      DcDefs::_8x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+      DcDefs::_8x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+      DcDefs::_8x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+      DcDefs::_8x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+      DcDefs::_16x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+      DcDefs::_16x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+      DcDefs::_16x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+      DcDefs::_16x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+      DcDefs::_16x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+      DcDefs::_32x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+      DcDefs::_32x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+      DcDefs::_32x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+      DcDefs::_32x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+      DcDefs::_64x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+      DcDefs::_64x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+      DcDefs::_64x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+      Paeth4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+      Paeth4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+      Paeth4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+      Paeth8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+      Paeth8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+      Paeth8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+      Paeth8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+      Paeth16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+      Paeth16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+      Paeth16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+      Paeth16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+      Paeth16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+      Paeth32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+      Paeth32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+      Paeth32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+      Paeth32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+      Paeth64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+      Paeth64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorPaeth)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+      Paeth64x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+      DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+      DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+      DirDefs::_4x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+      DirDefs::_8x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+      DirDefs::_8x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+      DirDefs::_8x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+      DirDefs::_8x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+      DirDefs::_16x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+      DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+      DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+      DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+      DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+      DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+      DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+      DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+      DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+      DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+      DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+      DirDefs::_64x64::Horizontal;
+#endif
+}  // NOLINT(readability/fn_size)
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const __m128i dc) {
+  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreLo8(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreLo8(dst, dc_dup);
+}
+
+// WriteDuplicateN assumes dup has 4 32-bit "units," each of which comprises 2
+// identical shorts that need N total copies written into dest. The unpacking
+// works the same as in the 8bpp case, except that each 32-bit unit needs twice
+// as many copies.
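+//
+// For example (illustrative): if the 16-bit lanes of |dup32| are
+// [r0 r0 r1 r1 r2 r2 r3 r3], a 4-wide 10bpp block receives the rows
+// [r0 r0 r0 r0], [r1 r1 r1 r1], [r2 r2 r2 r2], [r3 r3 r3 r3].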
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  auto* dst = static_cast<uint8_t*>(dest);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+  dst += stride;
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_0);
+  }
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_1);
+  }
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_2);
+  }
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_3);
+  }
+}
+
+// ColStoreN<height> copies each of the |height| values in |column| across its
+// corresponding row in dest.
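+// For example (illustrative): each int16 value in |column| is duplicated into
+// a 32-bit unit by _mm_unpacklo_epi16/_mm_unpackhi_epi16, and |writefn| then
+// replicates that unit across its row.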
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const column) {
+  const __m128i col_data = LoadLo8(column);
+  const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
+  writefn(dest, stride, col_dup32);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const column) {
+  const __m128i col_data = LoadUnaligned16(column);
+  const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+  const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+  auto* dst = static_cast<uint8_t*>(dest);
+  writefn(dst, stride, col_dup32_lo);
+  const ptrdiff_t stride4 = stride << 2;
+  dst += stride4;
+  writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 32; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+    writefn(dst, stride, col_dup32_lo);
+    dst += stride4;
+    writefn(dst, stride, col_dup32_hi);
+    dst += stride4;
+  }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 64; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+    writefn(dst, stride, col_dup32_lo);
+    dst += stride4;
+    writefn(dst, stride, col_dup32_hi);
+    dst += stride4;
+  }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 128; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+    writefn(dst, stride, col_dup32_lo);
+    dst += stride4;
+    writefn(dst, stride, col_dup32_hi);
+    dst += stride4;
+  }
+}
+
+// |ref| points to 8 bytes containing 4 packed int16 values.
+inline __m128i DcSum4_SSE4_1(const void* ref) {
+  const __m128i vals = _mm_loadl_epi64(static_cast<const __m128i*>(ref));
+  const __m128i ones = _mm_set1_epi16(1);
+
+  // half_sum[31:0]  = a1+a2
+  // half_sum[63:32] = a3+a4
+  const __m128i half_sum = _mm_madd_epi16(vals, ones);
+  // Place half_sum[63:32] in shift_sum[31:0].
+  const __m128i shift_sum = _mm_srli_si128(half_sum, 4);
+  return _mm_add_epi32(half_sum, shift_sum);
+}
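+// For example (illustrative): vals = [1 2 3 4] yields half_sum lanes
+// [3 7 0 0], shift_sum lanes [7 0 0 0], and a returned low lane of
+// 10 = 1 + 2 + 3 + 4.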
+
+struct DcDefs {
+  DcDefs() = delete;
+
+  using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+                                  DcStore4xH_SSE4_1<4>, 0, 0>;
+};
+
+struct DirDefs {
+  DirDefs() = delete;
+
+  using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+  using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+  using _4x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+  using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+  using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+  using _8x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+  using _8x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+  using _16x4 =
+      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+  using _16x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+  using _16x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+  using _16x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+  using _16x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+  using _32x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+  using _32x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+  using _32x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+  using _32x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+  using _64x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+  using _64x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+  using _64x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+      DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+      DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+      DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+      DirDefs::_4x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+      DirDefs::_8x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+      DirDefs::_8x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+      DirDefs::_8x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+      DirDefs::_8x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+      DirDefs::_16x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+      DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+      DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+      DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+      DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+      DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+      DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+      DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+      DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+      DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+      DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+      DirDefs::_64x64::Horizontal;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_sse4.h b/src/dsp/x86/intrapred_sse4.h
new file mode 100644 (file)
index 0000000..1f6f30a
--- /dev/null
@@ -0,0 +1,591 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors. See the defines below for specifics.
+// These functions are not thread-safe.
+void IntraPredInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If SSE4.1 is targeted and the baseline define isn't already set by a higher
+// level of optimization, signal that the SSE4.1 implementation should be used.
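+// For example (illustrative): if a higher-level header had already defined
+// LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop to its own CPU flag
+// (e.g. an AVX2 one), the #ifndef below would leave that definition in place
+// and Init8bpp() would skip the SSE4.1 table entry for that function.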
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+//------------------------------------------------------------------------------
+// 10bpp
+
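+// As with the 8bpp block above, each macro flags the corresponding
+// (transform size, predictor) entry as having an SSE4.1 implementation, so
+// the dispatch code can prefer it over the generic C version.
+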
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc
new file mode 100644 (file)
index 0000000..e9ceb87
--- /dev/null
@@ -0,0 +1,3053 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
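+// Stores |store_count| vectors to consecutive rows of |dst|; |store_width| is
+// in bytes: 16 writes a full __m128i per row, 8 writes only its low half.
+// LoadSrc below mirrors this for loads.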
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* LIBGAV1_RESTRICT dst,
+                                    int32_t stride, int32_t idx,
+                                    const __m128i* s) {
+  // NOTE: It is expected that the compiler will unroll these loops.
+  if (store_width == 16) {
+    for (int i = 0; i < store_count; i += 4) {
+      StoreUnaligned16(&dst[i * stride + idx], s[i]);
+      StoreUnaligned16(&dst[(i + 1) * stride + idx], s[i + 1]);
+      StoreUnaligned16(&dst[(i + 2) * stride + idx], s[i + 2]);
+      StoreUnaligned16(&dst[(i + 3) * stride + idx], s[i + 3]);
+    }
+  }
+  if (store_width == 8) {
+    for (int i = 0; i < store_count; i += 4) {
+      StoreLo8(&dst[i * stride + idx], s[i]);
+      StoreLo8(&dst[(i + 1) * stride + idx], s[i + 1]);
+      StoreLo8(&dst[(i + 2) * stride + idx], s[i + 2]);
+      StoreLo8(&dst[(i + 3) * stride + idx], s[i + 3]);
+    }
+  }
+}
+
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* LIBGAV1_RESTRICT src,
+                                   int32_t stride, int32_t idx, __m128i* x) {
+  // NOTE: It is expected that the compiler will unroll these loops.
+  if (load_width == 16) {
+    for (int i = 0; i < load_count; i += 4) {
+      x[i] = LoadUnaligned16(&src[i * stride + idx]);
+      x[i + 1] = LoadUnaligned16(&src[(i + 1) * stride + idx]);
+      x[i + 2] = LoadUnaligned16(&src[(i + 2) * stride + idx]);
+      x[i + 3] = LoadUnaligned16(&src[(i + 3) * stride + idx]);
+    }
+  }
+  if (load_width == 8) {
+    for (int i = 0; i < load_count; i += 4) {
+      x[i] = LoadLo8(&src[i * stride + idx]);
+      x[i + 1] = LoadLo8(&src[(i + 1) * stride + idx]);
+      x[i + 2] = LoadLo8(&src[(i + 2) * stride + idx]);
+      x[i + 3] = LoadLo8(&src[(i + 3) * stride + idx]);
+    }
+  }
+}
+
+// Butterfly rotate 4 values.
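+// Computes x = (a * Cos128(angle) - b * Sin128(angle)) >> 12 and
+// y = (a * Sin128(angle) + b * Cos128(angle)) >> 12 with rounding, writing x
+// to *a and y to *b (swapped when |flip| is set).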
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
+                                               const int angle,
+                                               const bool flip) {
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const __m128i psin_pcos = _mm_set1_epi32(
+      static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+  const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+  const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+  const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
+  // -sin cos, -sin cos, -sin cos, -sin cos
+  const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+  const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+  const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+  const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+  const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+  const __m128i x = _mm_packs_epi32(x1, x1);
+  const __m128i y = _mm_packs_epi32(y1, y1);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+// Butterfly rotate 8 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
+                                               const int angle,
+                                               const bool flip) {
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const __m128i psin_pcos = _mm_set1_epi32(
+      static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+  const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
+  // -sin cos, -sin cos, -sin cos, -sin cos
+  const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+  const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+  const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+  const __m128i ba_hi = _mm_unpackhi_epi16(*a, *b);
+  const __m128i ab_hi = _mm_unpackhi_epi16(*b, *a);
+  const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+  const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+  const __m128i x0_hi = _mm_madd_epi16(ba_hi, msin_pcos);
+  const __m128i y0_hi = _mm_madd_epi16(ab_hi, psin_pcos);
+  const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+  const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+  const __m128i x1_hi = RightShiftWithRounding_S32(x0_hi, 12);
+  const __m128i y1_hi = RightShiftWithRounding_S32(y0_hi, 12);
+  const __m128i x = _mm_packs_epi32(x1, x1_hi);
+  const __m128i y = _mm_packs_epi32(y1, y1_hi);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(__m128i* a, __m128i* b,
+                                                         const int angle,
+                                                         const bool flip) {
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+  const __m128i psin = _mm_set1_epi16(-(sin128 << 3));
+  const __m128i x = _mm_mulhrs_epi16(*b, psin);
+  const __m128i y = _mm_mulhrs_epi16(*b, pcos);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(__m128i* a,
+                                                          __m128i* b,
+                                                          const int angle,
+                                                          const bool flip) {
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+  const __m128i psin = _mm_set1_epi16(sin128 << 3);
+  const __m128i x = _mm_mulhrs_epi16(*a, pcos);
+  const __m128i y = _mm_mulhrs_epi16(*a, psin);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
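+// Computes the saturating butterfly sum/difference pair *a + *b and *a - *b
+// (operands reversed when |flip| is set).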
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(__m128i* a, __m128i* b, bool flip) {
+  __m128i x, y;
+  if (flip) {
+    y = _mm_adds_epi16(*b, *a);
+    x = _mm_subs_epi16(*b, *a);
+  } else {
+    x = _mm_adds_epi16(*a, *b);
+    y = _mm_subs_epi16(*a, *b);
+  }
+  *a = x;
+  *b = y;
+}
+
+using ButterflyRotationFunc = void (*)(__m128i* a, __m128i* b, int angle,
+                                       bool flip);
+
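+// Shifts |residual| right by |v_row_shift| with the rounding bias already in
+// |v_row_shift_add|, falling back to a logical shift for values large enough
+// to overflow the signed (arithmetic-shift) path.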
+LIBGAV1_ALWAYS_INLINE __m128i ShiftResidual(const __m128i residual,
+                                            const __m128i v_row_shift_add,
+                                            const __m128i v_row_shift) {
+  const __m128i k7ffd = _mm_set1_epi16(0x7ffd);
+  // The max row_shift is 2, so int16_t values greater than 0x7ffd may
+  // overflow.  Generate a mask for this case.
+  const __m128i mask = _mm_cmpgt_epi16(residual, k7ffd);
+  const __m128i x = _mm_add_epi16(residual, v_row_shift_add);
+  // Assume int16_t values.
+  const __m128i a = _mm_sra_epi16(x, v_row_shift);
+  // Assume uint16_t values.
+  const __m128i b = _mm_srl_epi16(x, v_row_shift);
+  // Select the correct shifted value.
+  return _mm_blendv_epi8(a, b, mask);
+}
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
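+// DC-only fast path: when |adjusted_tx_height| is 1 only the DC coefficient
+// is nonzero, so the whole row pass reduces to an optional round, a single
+// multiply by Cos128(32) (cos(pi/4) in Q12) and the row shift.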
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+                                     bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+  const __m128i v_src =
+      (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src_round =
+      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+  const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+  const int16_t cos128 = Cos128(32);
+  const __m128i xy = _mm_mulhrs_epi16(s0, _mm_set1_epi16(cos128 << 3));
+
+  // Expand to 32 bits to prevent int16_t overflows during the shift add.
+  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+  const __m128i a = _mm_cvtepi16_epi32(xy);
+  const __m128i a1 = _mm_cvtepi16_epi32(_mm_srli_si128(xy, 8));
+  const __m128i b = _mm_add_epi32(a, v_row_shift_add);
+  const __m128i b1 = _mm_add_epi32(a1, v_row_shift_add);
+  const __m128i c = _mm_sra_epi32(b, v_row_shift);
+  const __m128i c1 = _mm_sra_epi32(b1, v_row_shift);
+  const __m128i xy_shifted = _mm_packs_epi32(c, c1);
+
+  if (width == 4) {
+    StoreLo8(dst, xy_shifted);
+  } else {
+    for (int i = 0; i < width; i += 8) {
+      StoreUnaligned16(dst, xy_shifted);
+      dst += 8;
+    }
+  }
+  return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+                                           int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16_t cos128 = Cos128(32);
+
+  // Calculate dc values for first row.
+  if (width == 4) {
+    const __m128i v_src = LoadLo8(dst);
+    const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+    StoreLo8(dst, xy);
+  } else {
+    int i = 0;
+    do {
+      const __m128i v_src = LoadUnaligned16(&dst[i]);
+      const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+      StoreUnaligned16(&dst[i], xy);
+      i += 8;
+    } while (i < width);
+  }
+
+  // Copy first row to the rest of the block.
+  for (int y = 1; y < height; ++y) {
+    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+  }
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
+  // stage 12.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+  } else {
+    butterfly_rotation(&s[0], &s[1], 32, true);
+    butterfly_rotation(&s[2], &s[3], 48, false);
+  }
+
+  // stage 17.
+  HadamardRotation(&s[0], &s[3], false);
+  HadamardRotation(&s[1], &s[2], false);
+}
+
+// Process 4 dct4 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, int32_t step,
+                                       bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[4], x[4];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i input[8];
+      LoadSrc<8, 8>(dst, step, 0, input);
+      Transpose4x8To8x4_U16(input, x);
+    } else {
+      LoadSrc<16, 4>(dst, step, 0, x);
+    }
+  } else {
+    LoadSrc<8, 4>(dst, step, 0, x);
+    if (transpose) {
+      Transpose4x4_U16(x, x);
+    }
+  }
+  // stage 1.
+  // kBitReverseLookup 0, 2, 1, 3
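+  // That is, the inputs are permuted into bit-reversed index order before the
+  // butterfly stages, as in a standard decimation-in-time FFT/DCT.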
+  s[0] = x[0];
+  s[1] = x[2];
+  s[2] = x[1];
+  s[3] = x[3];
+
+  Dct4Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i output[8];
+      Transpose8x4To4x8_U16(s, output);
+      StoreDst<8, 8>(dst, step, 0, output);
+    } else {
+      StoreDst<16, 4>(dst, step, 0, s);
+    }
+  } else {
+    if (transpose) {
+      Transpose4x4_U16(s, s);
+    }
+    StoreDst<8, 4>(dst, step, 0, s);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
+  // stage 8.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+  } else {
+    butterfly_rotation(&s[4], &s[7], 56, false);
+    butterfly_rotation(&s[5], &s[6], 24, false);
+  }
+
+  // stage 13.
+  HadamardRotation(&s[4], &s[5], false);
+  HadamardRotation(&s[6], &s[7], true);
+
+  // stage 18.
+  butterfly_rotation(&s[6], &s[5], 32, true);
+
+  // stage 22.
+  HadamardRotation(&s[0], &s[7], false);
+  HadamardRotation(&s[1], &s[6], false);
+  HadamardRotation(&s[2], &s[5], false);
+  HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, int32_t step,
+                                       bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[8], x[8];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8_U16(input, x);
+    } else {
+      LoadSrc<8, 8>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      __m128i input[8];
+      LoadSrc<16, 8>(dst, step, 0, input);
+      Transpose8x8_U16(input, x);
+    } else {
+      LoadSrc<16, 8>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1.
+  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+  s[0] = x[0];
+  s[1] = x[4];
+  s[2] = x[2];
+  s[3] = x[6];
+  s[4] = x[1];
+  s[5] = x[5];
+  s[6] = x[3];
+  s[7] = x[7];
+
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i output[4];
+      Transpose4x8To8x4_U16(s, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+    } else {
+      StoreDst<8, 8>(dst, step, 0, s);
+    }
+  } else {
+    if (transpose) {
+      __m128i output[8];
+      Transpose8x8_U16(s, output);
+      StoreDst<16, 8>(dst, step, 0, output);
+    } else {
+      StoreDst<16, 8>(dst, step, 0, s);
+    }
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
+  // stage 5.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+  } else {
+    butterfly_rotation(&s[8], &s[15], 60, false);
+    butterfly_rotation(&s[9], &s[14], 28, false);
+    butterfly_rotation(&s[10], &s[13], 44, false);
+    butterfly_rotation(&s[11], &s[12], 12, false);
+  }
+
+  // stage 9.
+  HadamardRotation(&s[8], &s[9], false);
+  HadamardRotation(&s[10], &s[11], true);
+  HadamardRotation(&s[12], &s[13], false);
+  HadamardRotation(&s[14], &s[15], true);
+
+  // stage 14.
+  butterfly_rotation(&s[14], &s[9], 48, true);
+  butterfly_rotation(&s[13], &s[10], 112, true);
+
+  // stage 19.
+  HadamardRotation(&s[8], &s[11], false);
+  HadamardRotation(&s[9], &s[10], false);
+  HadamardRotation(&s[12], &s[15], true);
+  HadamardRotation(&s[13], &s[14], true);
+
+  // stage 23.
+  butterfly_rotation(&s[13], &s[10], 32, true);
+  butterfly_rotation(&s[12], &s[11], 32, true);
+
+  // stage 26.
+  HadamardRotation(&s[0], &s[15], false);
+  HadamardRotation(&s[1], &s[14], false);
+  HadamardRotation(&s[2], &s[13], false);
+  HadamardRotation(&s[3], &s[12], false);
+  HadamardRotation(&s[4], &s[11], false);
+  HadamardRotation(&s[5], &s[10], false);
+  HadamardRotation(&s[6], &s[9], false);
+  HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, int32_t step,
+                                        bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[16], x[16];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8_U16(input, x);
+      LoadSrc<16, 4>(dst, step, 8, input);
+      Transpose8x4To4x8_U16(input, &x[8]);
+    } else {
+      LoadSrc<8, 16>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      for (int idx = 0; idx < 16; idx += 8) {
+        __m128i input[8];
+        LoadSrc<16, 8>(dst, step, idx, input);
+        Transpose8x8_U16(input, &x[idx]);
+      }
+    } else {
+      LoadSrc<16, 16>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1
+  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+  s[0] = x[0];
+  s[1] = x[8];
+  s[2] = x[4];
+  s[3] = x[12];
+  s[4] = x[2];
+  s[5] = x[10];
+  s[6] = x[6];
+  s[7] = x[14];
+  s[8] = x[1];
+  s[9] = x[9];
+  s[10] = x[5];
+  s[11] = x[13];
+  s[12] = x[3];
+  s[13] = x[11];
+  s[14] = x[7];
+  s[15] = x[15];
+
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+  Dct16Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i output[4];
+      Transpose4x8To8x4_U16(s, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+      Transpose4x8To8x4_U16(&s[8], output);
+      StoreDst<16, 4>(dst, step, 8, output);
+    } else {
+      StoreDst<8, 16>(dst, step, 0, s);
+    }
+  } else {
+    if (transpose) {
+      for (int idx = 0; idx < 16; idx += 8) {
+        __m128i output[8];
+        Transpose8x8_U16(&s[idx], output);
+        StoreDst<16, 8>(dst, step, idx, output);
+      }
+    } else {
+      StoreDst<16, 16>(dst, step, 0, s);
+    }
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
+  // stage 3
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+  } else {
+    butterfly_rotation(&s[16], &s[31], 62, false);
+    butterfly_rotation(&s[17], &s[30], 30, false);
+    butterfly_rotation(&s[18], &s[29], 46, false);
+    butterfly_rotation(&s[19], &s[28], 14, false);
+    butterfly_rotation(&s[20], &s[27], 54, false);
+    butterfly_rotation(&s[21], &s[26], 22, false);
+    butterfly_rotation(&s[22], &s[25], 38, false);
+    butterfly_rotation(&s[23], &s[24], 6, false);
+  }
+  // stage 6.
+  HadamardRotation(&s[16], &s[17], false);
+  HadamardRotation(&s[18], &s[19], true);
+  HadamardRotation(&s[20], &s[21], false);
+  HadamardRotation(&s[22], &s[23], true);
+  HadamardRotation(&s[24], &s[25], false);
+  HadamardRotation(&s[26], &s[27], true);
+  HadamardRotation(&s[28], &s[29], false);
+  HadamardRotation(&s[30], &s[31], true);
+
+  // stage 10.
+  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+  butterfly_rotation(&s[26], &s[21], 24, true);
+  butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+  // stage 15.
+  HadamardRotation(&s[16], &s[19], false);
+  HadamardRotation(&s[17], &s[18], false);
+  HadamardRotation(&s[20], &s[23], true);
+  HadamardRotation(&s[21], &s[22], true);
+  HadamardRotation(&s[24], &s[27], false);
+  HadamardRotation(&s[25], &s[26], false);
+  HadamardRotation(&s[28], &s[31], true);
+  HadamardRotation(&s[29], &s[30], true);
+
+  // stage 20.
+  butterfly_rotation(&s[29], &s[18], 48, true);
+  butterfly_rotation(&s[28], &s[19], 48, true);
+  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+  butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+  // stage 24.
+  HadamardRotation(&s[16], &s[23], false);
+  HadamardRotation(&s[17], &s[22], false);
+  HadamardRotation(&s[18], &s[21], false);
+  HadamardRotation(&s[19], &s[20], false);
+  HadamardRotation(&s[24], &s[31], true);
+  HadamardRotation(&s[25], &s[30], true);
+  HadamardRotation(&s[26], &s[29], true);
+  HadamardRotation(&s[27], &s[28], true);
+
+  // stage 27.
+  butterfly_rotation(&s[27], &s[20], 32, true);
+  butterfly_rotation(&s[26], &s[21], 32, true);
+  butterfly_rotation(&s[25], &s[22], 32, true);
+  butterfly_rotation(&s[24], &s[23], 32, true);
+
+  // stage 29.
+  HadamardRotation(&s[0], &s[31], false);
+  HadamardRotation(&s[1], &s[30], false);
+  HadamardRotation(&s[2], &s[29], false);
+  HadamardRotation(&s[3], &s[28], false);
+  HadamardRotation(&s[4], &s[27], false);
+  HadamardRotation(&s[5], &s[26], false);
+  HadamardRotation(&s[6], &s[25], false);
+  HadamardRotation(&s[7], &s[24], false);
+  HadamardRotation(&s[8], &s[23], false);
+  HadamardRotation(&s[9], &s[22], false);
+  HadamardRotation(&s[10], &s[21], false);
+  HadamardRotation(&s[11], &s[20], false);
+  HadamardRotation(&s[12], &s[19], false);
+  HadamardRotation(&s[13], &s[18], false);
+  HadamardRotation(&s[14], &s[17], false);
+  HadamardRotation(&s[15], &s[16], false);
+}
+
+// Process dct32 rows or columns, depending on the transpose flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const int32_t step,
+                                        const bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[32], x[32];
+
+  if (transpose) {
+    for (int idx = 0; idx < 32; idx += 8) {
+      __m128i input[8];
+      LoadSrc<16, 8>(dst, step, idx, input);
+      Transpose8x8_U16(input, &x[idx]);
+    }
+  } else {
+    LoadSrc<16, 32>(dst, step, 0, x);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+  s[0] = x[0];
+  s[1] = x[16];
+  s[2] = x[8];
+  s[3] = x[24];
+  s[4] = x[4];
+  s[5] = x[20];
+  s[6] = x[12];
+  s[7] = x[28];
+  s[8] = x[2];
+  s[9] = x[18];
+  s[10] = x[10];
+  s[11] = x[26];
+  s[12] = x[6];
+  s[13] = x[22];
+  s[14] = x[14];
+  s[15] = x[30];
+
+  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+  s[16] = x[1];
+  s[17] = x[17];
+  s[18] = x[9];
+  s[19] = x[25];
+  s[20] = x[5];
+  s[21] = x[21];
+  s[22] = x[13];
+  s[23] = x[29];
+  s[24] = x[3];
+  s[25] = x[19];
+  s[26] = x[11];
+  s[27] = x[27];
+  s[28] = x[7];
+  s[29] = x[23];
+  s[30] = x[15];
+  s[31] = x[31];
+
+  Dct4Stages<ButterflyRotation_8>(s);
+  Dct8Stages<ButterflyRotation_8>(s);
+  Dct16Stages<ButterflyRotation_8>(s);
+  Dct32Stages<ButterflyRotation_8>(s);
+
+  if (transpose) {
+    for (int idx = 0; idx < 32; idx += 8) {
+      __m128i output[8];
+      Transpose8x8_U16(&s[idx], output);
+      StoreDst<16, 8>(dst, step, idx, output);
+    }
+  } else {
+    StoreDst<16, 32>(dst, step, 0, s);
+  }
+}
+
+// Allow the compiler to decide whether to inline this function instead of
+// forcing inlining. Tests show this is slightly faster.
+void Dct64_SSE4_1(void* dest, int32_t step, bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[64], x[32];
+
+  if (transpose) {
+    // The last 32 values of every row are always zero if the |tx_width| is
+    // 64.
+    for (int idx = 0; idx < 32; idx += 8) {
+      __m128i input[8];
+      LoadSrc<16, 8>(dst, step, idx, input);
+      Transpose8x8_U16(input, &x[idx]);
+    }
+  } else {
+    // The last 32 values of every column are always zero if the |tx_height| is
+    // 64.
+    LoadSrc<16, 32>(dst, step, 0, x);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+  s[0] = x[0];
+  s[2] = x[16];
+  s[4] = x[8];
+  s[6] = x[24];
+  s[8] = x[4];
+  s[10] = x[20];
+  s[12] = x[12];
+  s[14] = x[28];
+
+  // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+  s[16] = x[2];
+  s[18] = x[18];
+  s[20] = x[10];
+  s[22] = x[26];
+  s[24] = x[6];
+  s[26] = x[22];
+  s[28] = x[14];
+  s[30] = x[30];
+
+  // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+  s[32] = x[1];
+  s[34] = x[17];
+  s[36] = x[9];
+  s[38] = x[25];
+  s[40] = x[5];
+  s[42] = x[21];
+  s[44] = x[13];
+  s[46] = x[29];
+
+  // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+  s[48] = x[3];
+  s[50] = x[19];
+  s[52] = x[11];
+  s[54] = x[27];
+  s[56] = x[7];
+  s[58] = x[23];
+  s[60] = x[15];
+  s[62] = x[31];
+
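+  // Only the even-indexed s[] entries are seeded above; the odd ones
+  // correspond to coefficients that are known to be zero, so the
+  // is_fast_butterfly stages below and stage 2 use the *IsZero rotations,
+  // which read a single operand and write both outputs.
+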
+  Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+
+  //-- start dct 64 stages
+  // stage 2.
+  ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+  ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+  ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+  ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+  ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+  ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+  ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+  ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+  ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+  ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+  ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+  ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+  ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+  ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+  ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+  ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+  // stage 4.
+  HadamardRotation(&s[32], &s[33], false);
+  HadamardRotation(&s[34], &s[35], true);
+  HadamardRotation(&s[36], &s[37], false);
+  HadamardRotation(&s[38], &s[39], true);
+  HadamardRotation(&s[40], &s[41], false);
+  HadamardRotation(&s[42], &s[43], true);
+  HadamardRotation(&s[44], &s[45], false);
+  HadamardRotation(&s[46], &s[47], true);
+  HadamardRotation(&s[48], &s[49], false);
+  HadamardRotation(&s[50], &s[51], true);
+  HadamardRotation(&s[52], &s[53], false);
+  HadamardRotation(&s[54], &s[55], true);
+  HadamardRotation(&s[56], &s[57], false);
+  HadamardRotation(&s[58], &s[59], true);
+  HadamardRotation(&s[60], &s[61], false);
+  HadamardRotation(&s[62], &s[63], true);
+
+  // stage 7.
+  ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
+  ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
+  ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
+  ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
+  ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
+  ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
+  ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
+  ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);
+
+  // stage 11.
+  HadamardRotation(&s[32], &s[35], false);
+  HadamardRotation(&s[33], &s[34], false);
+  HadamardRotation(&s[36], &s[39], true);
+  HadamardRotation(&s[37], &s[38], true);
+  HadamardRotation(&s[40], &s[43], false);
+  HadamardRotation(&s[41], &s[42], false);
+  HadamardRotation(&s[44], &s[47], true);
+  HadamardRotation(&s[45], &s[46], true);
+  HadamardRotation(&s[48], &s[51], false);
+  HadamardRotation(&s[49], &s[50], false);
+  HadamardRotation(&s[52], &s[55], true);
+  HadamardRotation(&s[53], &s[54], true);
+  HadamardRotation(&s[56], &s[59], false);
+  HadamardRotation(&s[57], &s[58], false);
+  HadamardRotation(&s[60], &s[63], true);
+  HadamardRotation(&s[61], &s[62], true);
+
+  // stage 16.
+  ButterflyRotation_8(&s[61], &s[34], 56, true);
+  ButterflyRotation_8(&s[60], &s[35], 56, true);
+  ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
+  ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
+  ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
+  ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
+  ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
+  ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);
+
+  // stage 21.
+  HadamardRotation(&s[32], &s[39], false);
+  HadamardRotation(&s[33], &s[38], false);
+  HadamardRotation(&s[34], &s[37], false);
+  HadamardRotation(&s[35], &s[36], false);
+  HadamardRotation(&s[40], &s[47], true);
+  HadamardRotation(&s[41], &s[46], true);
+  HadamardRotation(&s[42], &s[45], true);
+  HadamardRotation(&s[43], &s[44], true);
+  HadamardRotation(&s[48], &s[55], false);
+  HadamardRotation(&s[49], &s[54], false);
+  HadamardRotation(&s[50], &s[53], false);
+  HadamardRotation(&s[51], &s[52], false);
+  HadamardRotation(&s[56], &s[63], true);
+  HadamardRotation(&s[57], &s[62], true);
+  HadamardRotation(&s[58], &s[61], true);
+  HadamardRotation(&s[59], &s[60], true);
+
+  // stage 25.
+  ButterflyRotation_8(&s[59], &s[36], 48, true);
+  ButterflyRotation_8(&s[58], &s[37], 48, true);
+  ButterflyRotation_8(&s[57], &s[38], 48, true);
+  ButterflyRotation_8(&s[56], &s[39], 48, true);
+  ButterflyRotation_8(&s[55], &s[40], 112, true);
+  ButterflyRotation_8(&s[54], &s[41], 112, true);
+  ButterflyRotation_8(&s[53], &s[42], 112, true);
+  ButterflyRotation_8(&s[52], &s[43], 112, true);
+
+  // stage 28.
+  HadamardRotation(&s[32], &s[47], false);
+  HadamardRotation(&s[33], &s[46], false);
+  HadamardRotation(&s[34], &s[45], false);
+  HadamardRotation(&s[35], &s[44], false);
+  HadamardRotation(&s[36], &s[43], false);
+  HadamardRotation(&s[37], &s[42], false);
+  HadamardRotation(&s[38], &s[41], false);
+  HadamardRotation(&s[39], &s[40], false);
+  HadamardRotation(&s[48], &s[63], true);
+  HadamardRotation(&s[49], &s[62], true);
+  HadamardRotation(&s[50], &s[61], true);
+  HadamardRotation(&s[51], &s[60], true);
+  HadamardRotation(&s[52], &s[59], true);
+  HadamardRotation(&s[53], &s[58], true);
+  HadamardRotation(&s[54], &s[57], true);
+  HadamardRotation(&s[55], &s[56], true);
+
+  // stage 30.
+  ButterflyRotation_8(&s[55], &s[40], 32, true);
+  ButterflyRotation_8(&s[54], &s[41], 32, true);
+  ButterflyRotation_8(&s[53], &s[42], 32, true);
+  ButterflyRotation_8(&s[52], &s[43], 32, true);
+  ButterflyRotation_8(&s[51], &s[44], 32, true);
+  ButterflyRotation_8(&s[50], &s[45], 32, true);
+  ButterflyRotation_8(&s[49], &s[46], 32, true);
+  ButterflyRotation_8(&s[48], &s[47], 32, true);
+
+  // stage 31.
+  for (int i = 0; i < 32; i += 4) {
+    HadamardRotation(&s[i], &s[63 - i], false);
+    HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
+    HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
+    HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
+  }
+  //-- end dct 64 stages
+
+  if (transpose) {
+    for (int idx = 0; idx < 64; idx += 8) {
+      __m128i output[8];
+      Transpose8x8_U16(&s[idx], output);
+      StoreDst<16, 8>(dst, step, idx, output);
+    }
+  } else {
+    StoreDst<16, 64>(dst, step, 0, s);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
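+// Unlike the DCTs above, Adst4 is evaluated directly: 32-bit _mm_madd_epi16
+// products against the kAdst4Multiplier constants, followed by 12-bit
+// rounding shifts.
+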
+template <bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, int32_t step,
+                                        bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[8], x[4];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i input[8];
+      LoadSrc<8, 8>(dst, step, 0, input);
+      Transpose4x8To8x4_U16(input, x);
+    } else {
+      LoadSrc<16, 4>(dst, step, 0, x);
+    }
+  } else {
+    LoadSrc<8, 4>(dst, step, 0, x);
+    if (transpose) {
+      Transpose4x4_U16(x, x);
+    }
+  }
+
+  const __m128i kAdst4Multiplier_1 = _mm_set1_epi16(kAdst4Multiplier[1]);
+  const __m128i kAdst4Multiplier_2 = _mm_set1_epi16(kAdst4Multiplier[2]);
+  const __m128i kAdst4Multiplier_3 = _mm_set1_epi16(kAdst4Multiplier[3]);
+  const __m128i kAdst4Multiplier_m0_1 =
+      _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[1]) |
+                     (static_cast<uint32_t>(-kAdst4Multiplier[0]) << 16));
+  const __m128i kAdst4Multiplier_3_0 =
+      _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[0]) |
+                     (static_cast<uint32_t>(kAdst4Multiplier[3]) << 16));
+
+  // stage 1.
+  const __m128i x3_x0 = _mm_unpacklo_epi16(x[0], x[3]);
+  const __m128i x2_x0 = _mm_unpacklo_epi16(x[0], x[2]);
+  const __m128i zero_x1 = _mm_cvtepu16_epi32(x[1]);
+  const __m128i zero_x2 = _mm_cvtepu16_epi32(x[2]);
+  const __m128i zero_x3 = _mm_cvtepu16_epi32(x[3]);
+
+  s[5] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_1);
+  s[6] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_3);
+
+  // stage 2.
+  // ((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2]
+  const __m128i k2_x3_x0 = _mm_madd_epi16(x3_x0, kAdst4Multiplier_2);
+  const __m128i k2_zero_x2 = _mm_madd_epi16(zero_x2, kAdst4Multiplier_2);
+  const __m128i b7 = _mm_sub_epi32(k2_x3_x0, k2_zero_x2);
+
+  // stage 3.
+  s[0] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_3_0);
+  s[1] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_m0_1);
+  s[2] = b7;
+  s[3] = _mm_madd_epi16(zero_x1, kAdst4Multiplier_2);
+
+  // stage 4.
+  s[0] = _mm_add_epi32(s[0], s[5]);
+  s[1] = _mm_sub_epi32(s[1], s[6]);
+
+  // stages 5 and 6.
+  x[0] = _mm_add_epi32(s[0], s[3]);
+  x[1] = _mm_add_epi32(s[1], s[3]);
+  x[2] = _mm_add_epi32(s[0], s[1]);
+  x[3] = _mm_sub_epi32(x[2], s[3]);
+
+  x[0] = RightShiftWithRounding_S32(x[0], 12);
+  x[1] = RightShiftWithRounding_S32(x[1], 12);
+  x[2] = RightShiftWithRounding_S32(s[2], 12);
+  x[3] = RightShiftWithRounding_S32(x[3], 12);
+
+  x[0] = _mm_packs_epi32(x[0], x[1]);
+  x[2] = _mm_packs_epi32(x[2], x[3]);
+  x[1] = _mm_srli_si128(x[0], 8);
+  x[3] = _mm_srli_si128(x[2], 8);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i output[8];
+      Transpose8x4To4x8_U16(x, output);
+      StoreDst<8, 8>(dst, step, 0, output);
+    } else {
+      StoreDst<16, 4>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      Transpose4x4_U16(x, x);
+    }
+    StoreDst<8, 4>(dst, step, 0, x);
+  }
+}
+
+constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0,
+                                               3344, 0, 2482, 1321};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+                                       bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const __m128i v_src =
+      _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src_round =
+      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+  const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+  const __m128i v_kAdst4DcOnlyMultipliers =
+      LoadUnaligned16(kAdst4DcOnlyMultiplier);
+  // s0*k0 s0*k1 s0*k2 s0*k1
+  // +
+  // s0*0  s0*0  s0*0  s0*k0
+  const __m128i x3 = _mm_madd_epi16(s0, v_kAdst4DcOnlyMultipliers);
+  const __m128i dst_0 = RightShiftWithRounding_S32(x3, 12);
+  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+  const __m128i a = _mm_add_epi32(dst_0, v_row_shift_add);
+  const __m128i b = _mm_sra_epi32(a, v_row_shift);
+  const __m128i c = _mm_packs_epi32(b, b);
+  StoreLo8(dst, c);
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+                                             int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int i = 0;
+  do {
+    const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&dst[i]));
+    const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]);
+    const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]);
+    const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]);
+    const __m128i s0 = _mm_mullo_epi32(kAdst4Multiplier_0, v_src);
+    const __m128i s1 = _mm_mullo_epi32(kAdst4Multiplier_1, v_src);
+    const __m128i s2 = _mm_mullo_epi32(kAdst4Multiplier_2, v_src);
+    const __m128i x0 = s0;
+    const __m128i x1 = s1;
+    const __m128i x2 = s2;
+    const __m128i x3 = _mm_add_epi32(s0, s1);
+    const __m128i dst_0 = RightShiftWithRounding_S32(x0, 12);
+    const __m128i dst_1 = RightShiftWithRounding_S32(x1, 12);
+    const __m128i dst_2 = RightShiftWithRounding_S32(x2, 12);
+    const __m128i dst_3 = RightShiftWithRounding_S32(x3, 12);
+    const __m128i dst_0_1 = _mm_packs_epi32(dst_0, dst_1);
+    const __m128i dst_2_3 = _mm_packs_epi32(dst_2, dst_3);
+    StoreLo8(&dst[i], dst_0_1);
+    StoreHi8(&dst[i + width * 1], dst_0_1);
+    StoreLo8(&dst[i + width * 2], dst_2_3);
+    StoreHi8(&dst[i + width * 3], dst_2_3);
+    i += 4;
+  } while (i < width);
+
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, int32_t step,
+                                        bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[8], x[8];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8_U16(input, x);
+    } else {
+      LoadSrc<8, 8>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      __m128i input[8];
+      LoadSrc<16, 8>(dst, step, 0, input);
+      Transpose8x8_U16(input, x);
+    } else {
+      LoadSrc<16, 8>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1.
+  s[0] = x[7];
+  s[1] = x[0];
+  s[2] = x[5];
+  s[3] = x[2];
+  s[4] = x[3];
+  s[5] = x[4];
+  s[6] = x[1];
+  s[7] = x[6];
+
+  // stage 2.
+  butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+  butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+  butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+  // stage 3.
+  HadamardRotation(&s[0], &s[4], false);
+  HadamardRotation(&s[1], &s[5], false);
+  HadamardRotation(&s[2], &s[6], false);
+  HadamardRotation(&s[3], &s[7], false);
+
+  // stage 4.
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+  // stage 5.
+  HadamardRotation(&s[0], &s[2], false);
+  HadamardRotation(&s[4], &s[6], false);
+  HadamardRotation(&s[1], &s[3], false);
+  HadamardRotation(&s[5], &s[7], false);
+
+  // stage 6.
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+
+  // stage 7.
+  const __m128i v_zero = _mm_setzero_si128();
+  x[0] = s[0];
+  x[1] = _mm_subs_epi16(v_zero, s[4]);
+  x[2] = s[6];
+  x[3] = _mm_subs_epi16(v_zero, s[2]);
+  x[4] = s[3];
+  x[5] = _mm_subs_epi16(v_zero, s[7]);
+  x[6] = s[5];
+  x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i output[4];
+      Transpose4x8To8x4_U16(x, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+    } else {
+      StoreDst<8, 8>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      __m128i output[8];
+      Transpose8x8_U16(x, output);
+      StoreDst<16, 8>(dst, step, 0, output);
+    } else {
+      StoreDst<16, 8>(dst, step, 0, x);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+                                       bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  __m128i s[8];
+
+  const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src_round =
+      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+  // stage 1.
+  s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+
+  // stage 2.
+  ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+  // stage 3.
+  s[4] = s[0];
+  s[5] = s[1];
+
+  // stage 4.
+  ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+  // stage 5.
+  s[2] = s[0];
+  s[3] = s[1];
+  s[6] = s[4];
+  s[7] = s[5];
+
+  // stage 6.
+  ButterflyRotation_4(&s[2], &s[3], 32, true);
+  ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+  // stage 7.
+  __m128i x[8];
+  const __m128i v_zero = _mm_setzero_si128();
+  x[0] = s[0];
+  x[1] = _mm_subs_epi16(v_zero, s[4]);
+  x[2] = s[6];
+  x[3] = _mm_subs_epi16(v_zero, s[2]);
+  x[4] = s[3];
+  x[5] = _mm_subs_epi16(v_zero, s[7]);
+  x[6] = s[5];
+  x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+  const __m128i x1_x0 = _mm_unpacklo_epi16(x[0], x[1]);
+  const __m128i x3_x2 = _mm_unpacklo_epi16(x[2], x[3]);
+  const __m128i x5_x4 = _mm_unpacklo_epi16(x[4], x[5]);
+  const __m128i x7_x6 = _mm_unpacklo_epi16(x[6], x[7]);
+  const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
+  const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
+
+  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+  const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
+  const __m128i a1 = _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
+  const __m128i b = _mm_sra_epi32(a, v_row_shift);
+  const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
+  StoreUnaligned16(dst, _mm_packs_epi32(b, b1));
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+                                             int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  __m128i s[8];
+
+  int i = 0;
+  do {
+    const __m128i v_src = LoadLo8(dst);
+    // stage 1.
+    s[1] = v_src;
+
+    // stage 2.
+    ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+    // stage 3.
+    s[4] = s[0];
+    s[5] = s[1];
+
+    // stage 4.
+    ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+    // stage 5.
+    s[2] = s[0];
+    s[3] = s[1];
+    s[6] = s[4];
+    s[7] = s[5];
+
+    // stage 6.
+    ButterflyRotation_4(&s[2], &s[3], 32, true);
+    ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+    // stage 7.
+    __m128i x[8];
+    const __m128i v_zero = _mm_setzero_si128();
+    x[0] = s[0];
+    x[1] = _mm_subs_epi16(v_zero, s[4]);
+    x[2] = s[6];
+    x[3] = _mm_subs_epi16(v_zero, s[2]);
+    x[4] = s[3];
+    x[5] = _mm_subs_epi16(v_zero, s[7]);
+    x[6] = s[5];
+    x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+    for (int j = 0; j < 8; ++j) {
+      StoreLo8(&dst[j * width], x[j]);
+    }
+    i += 4;
+    dst += 4;
+  } while (i < width);
+
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, int32_t step,
+                                         bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[16], x[16];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8_U16(input, x);
+      LoadSrc<16, 4>(dst, step, 8, input);
+      Transpose8x4To4x8_U16(input, &x[8]);
+    } else {
+      LoadSrc<8, 16>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      for (int idx = 0; idx < 16; idx += 8) {
+        __m128i input[8];
+        LoadSrc<16, 8>(dst, step, idx, input);
+        Transpose8x8_U16(input, &x[idx]);
+      }
+    } else {
+      LoadSrc<16, 16>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1.
+  s[0] = x[15];
+  s[1] = x[0];
+  s[2] = x[13];
+  s[3] = x[2];
+  s[4] = x[11];
+  s[5] = x[4];
+  s[6] = x[9];
+  s[7] = x[6];
+  s[8] = x[7];
+  s[9] = x[8];
+  s[10] = x[5];
+  s[11] = x[10];
+  s[12] = x[3];
+  s[13] = x[12];
+  s[14] = x[1];
+  s[15] = x[14];
+
+  // stage 2.
+  butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+  butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+  butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+  butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+  butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+  butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+  butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+  // stage 3.
+  HadamardRotation(&s[0], &s[8], false);
+  HadamardRotation(&s[1], &s[9], false);
+  HadamardRotation(&s[2], &s[10], false);
+  HadamardRotation(&s[3], &s[11], false);
+  HadamardRotation(&s[4], &s[12], false);
+  HadamardRotation(&s[5], &s[13], false);
+  HadamardRotation(&s[6], &s[14], false);
+  HadamardRotation(&s[7], &s[15], false);
+
+  // stage 4.
+  butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+  butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+  butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+  // stage 5.
+  HadamardRotation(&s[0], &s[4], false);
+  HadamardRotation(&s[8], &s[12], false);
+  HadamardRotation(&s[1], &s[5], false);
+  HadamardRotation(&s[9], &s[13], false);
+  HadamardRotation(&s[2], &s[6], false);
+  HadamardRotation(&s[10], &s[14], false);
+  HadamardRotation(&s[3], &s[7], false);
+  HadamardRotation(&s[11], &s[15], false);
+
+  // stage 6.
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+  // stage 7.
+  HadamardRotation(&s[0], &s[2], false);
+  HadamardRotation(&s[4], &s[6], false);
+  HadamardRotation(&s[8], &s[10], false);
+  HadamardRotation(&s[12], &s[14], false);
+  HadamardRotation(&s[1], &s[3], false);
+  HadamardRotation(&s[5], &s[7], false);
+  HadamardRotation(&s[9], &s[11], false);
+  HadamardRotation(&s[13], &s[15], false);
+
+  // stage 8.
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+  butterfly_rotation(&s[10], &s[11], 32, true);
+  butterfly_rotation(&s[14], &s[15], 32, true);
+
+  // stage 9.
+  const __m128i v_zero = _mm_setzero_si128();
+  x[0] = s[0];
+  x[1] = _mm_subs_epi16(v_zero, s[8]);
+  x[2] = s[12];
+  x[3] = _mm_subs_epi16(v_zero, s[4]);
+  x[4] = s[6];
+  x[5] = _mm_subs_epi16(v_zero, s[14]);
+  x[6] = s[10];
+  x[7] = _mm_subs_epi16(v_zero, s[2]);
+  x[8] = s[3];
+  x[9] = _mm_subs_epi16(v_zero, s[11]);
+  x[10] = s[15];
+  x[11] = _mm_subs_epi16(v_zero, s[7]);
+  x[12] = s[5];
+  x[13] = _mm_subs_epi16(v_zero, s[13]);
+  x[14] = s[9];
+  x[15] = _mm_subs_epi16(v_zero, s[1]);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i output[4];
+      Transpose4x8To8x4_U16(x, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+      Transpose4x8To8x4_U16(&x[8], output);
+      StoreDst<16, 4>(dst, step, 8, output);
+    } else {
+      StoreDst<8, 16>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      for (int idx = 0; idx < 16; idx += 8) {
+        __m128i output[8];
+        Transpose8x8_U16(&x[idx], output);
+        StoreDst<16, 8>(dst, step, idx, output);
+      }
+    } else {
+      StoreDst<16, 16>(dst, step, 0, x);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(__m128i* s, __m128i* x) {
+  // stage 2.
+  ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+  // stage 3.
+  s[8] = s[0];
+  s[9] = s[1];
+
+  // stage 4.
+  ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+  // stage 5.
+  s[4] = s[0];
+  s[12] = s[8];
+  s[5] = s[1];
+  s[13] = s[9];
+
+  // stage 6.
+  ButterflyRotation_4(&s[4], &s[5], 48, true);
+  ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+  // stage 7.
+  s[2] = s[0];
+  s[6] = s[4];
+  s[10] = s[8];
+  s[14] = s[12];
+  s[3] = s[1];
+  s[7] = s[5];
+  s[11] = s[9];
+  s[15] = s[13];
+
+  // stage 8.
+  ButterflyRotation_4(&s[2], &s[3], 32, true);
+  ButterflyRotation_4(&s[6], &s[7], 32, true);
+  ButterflyRotation_4(&s[10], &s[11], 32, true);
+  ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+  // stage 9.
+  const __m128i v_zero = _mm_setzero_si128();
+  x[0] = s[0];
+  x[1] = _mm_subs_epi16(v_zero, s[8]);
+  x[2] = s[12];
+  x[3] = _mm_subs_epi16(v_zero, s[4]);
+  x[4] = s[6];
+  x[5] = _mm_subs_epi16(v_zero, s[14]);
+  x[6] = s[10];
+  x[7] = _mm_subs_epi16(v_zero, s[2]);
+  x[8] = s[3];
+  x[9] = _mm_subs_epi16(v_zero, s[11]);
+  x[10] = s[15];
+  x[11] = _mm_subs_epi16(v_zero, s[7]);
+  x[12] = s[5];
+  x[13] = _mm_subs_epi16(v_zero, s[13]);
+  x[14] = s[9];
+  x[15] = _mm_subs_epi16(v_zero, s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+                                        bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  __m128i s[16];
+  __m128i x[16];
+
+  const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
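+  // Rounding note: _mm_mulhrs_epi16() computes (a * b + (1 << 14)) >> 15, so
+  // pre-shifting the multiplier left by 3 makes the product below
+  // Round2(dst[0] * kTransformRowMultiplier, 12).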
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src_round =
+      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+  // stage 1.
+  s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+
+  Adst16DcOnlyInternal(s, x);
+
+  for (int i = 0; i < 2; ++i) {
+    const __m128i x1_x0 = _mm_unpacklo_epi16(x[0 + i * 8], x[1 + i * 8]);
+    const __m128i x3_x2 = _mm_unpacklo_epi16(x[2 + i * 8], x[3 + i * 8]);
+    const __m128i x5_x4 = _mm_unpacklo_epi16(x[4 + i * 8], x[5 + i * 8]);
+    const __m128i x7_x6 = _mm_unpacklo_epi16(x[6 + i * 8], x[7 + i * 8]);
+    const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
+    const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
+
+    const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+    const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
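+    // _mm_sra_epi32() reads its shift count from the low 64 bits of its
+    // second operand, hence the widening conversion above.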
+    const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
+    const __m128i a1 =
+        _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
+    const __m128i b = _mm_sra_epi32(a, v_row_shift);
+    const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
+    StoreUnaligned16(&dst[i * 8], _mm_packs_epi32(b, b1));
+  }
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+                                              int adjusted_tx_height,
+                                              int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int i = 0;
+  do {
+    __m128i s[16];
+    __m128i x[16];
+    const __m128i v_src = LoadUnaligned16(dst);
+    // stage 1.
+    s[1] = v_src;
+
+    Adst16DcOnlyInternal(s, x);
+
+    for (int j = 0; j < 16; ++j) {
+      StoreLo8(&dst[j * width], x[j]);
+    }
+    i += 4;
+    dst += 4;
+  } while (i < width);
+
+  return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+template <bool is_row_shift>
+LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  if (is_row_shift) {
+    const int shift = 1;
+    const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+    const __m128i v_multiplier_one =
+        _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
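+    // Interleaving the rounding constant with the source lets each 32-bit
+    // madd lane compute src * kIdentity4Multiplier + round, so the single
+    // shift by (12 + shift) applies both the multiplier rounding and the
+    // row shift in one pass.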
+    for (int i = 0; i < 4; i += 2) {
+      const __m128i v_src = LoadUnaligned16(&dst[i * step]);
+      const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
+      const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src);
+      const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
+      const __m128i a_hi = _mm_madd_epi16(v_src_round_hi, v_multiplier_one);
+      const __m128i b = _mm_srai_epi32(a, 12 + shift);
+      const __m128i b_hi = _mm_srai_epi32(a_hi, 12 + shift);
+      StoreUnaligned16(&dst[i * step], _mm_packs_epi32(b, b_hi));
+    }
+  } else {
+    const __m128i v_multiplier =
+        _mm_set1_epi16(kIdentity4MultiplierFraction << 3);
+    for (int i = 0; i < 4; i += 2) {
+      const __m128i v_src = LoadUnaligned16(&dst[i * step]);
+      const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier);
+      const __m128i b = _mm_adds_epi16(a, v_src);
+      StoreUnaligned16(&dst[i * step], b);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+                                           bool should_round, int tx_height) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src_round =
+      _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+  const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round, v_mask);
+
+  const int shift = (tx_height < 16) ? 0 : 1;
+  const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+  const __m128i v_multiplier_one =
+      _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
+  const __m128i v_src_round_lo = _mm_unpacklo_epi16(v_dual_round, v_src);
+  const __m128i a = _mm_madd_epi16(v_src_round_lo, v_multiplier_one);
+  const __m128i b = _mm_srai_epi32(a, 12 + shift);
+  dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source) {
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+  const __m128i v_multiplier_fraction =
+      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
+  const __m128i v_eight = _mm_set1_epi16(8);
+
+  if (tx_width == 4) {
+    int i = 0;
+    do {
+      const __m128i v_src = LoadLo8(&source[i * tx_width]);
+      const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+      const __m128i frame_data = Load4(dst);
+      const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
+      const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+      const __m128i b = _mm_srai_epi16(a, 4);
+      const __m128i c = _mm_cvtepu8_epi16(frame_data);
+      const __m128i d = _mm_adds_epi16(c, b);
+      Store4(dst, _mm_packus_epi16(d, d));
+      dst += stride;
+    } while (++i < tx_height);
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const __m128i v_src = LoadUnaligned16(&source[row + j]);
+        const __m128i v_src_mult =
+            _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+        const __m128i frame_data = LoadLo8(dst + j);
+        const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
+        const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+        const __m128i b = _mm_srai_epi16(a, 4);
+        const __m128i c = _mm_cvtepu8_epi16(frame_data);
+        const __m128i d = _mm_adds_epi16(c, b);
+        StoreLo8(dst + j, _mm_packus_epi16(d, d));
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source) {
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+  const __m128i v_multiplier_fraction =
+      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
+  const __m128i v_eight = _mm_set1_epi16(8);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+
+  if (tx_width == 4) {
+    int i = 0;
+    do {
+      const __m128i v_src = LoadLo8(&source[i * tx_width]);
+      const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+      const __m128i frame_data = Load4(dst);
+      const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src);
+      const __m128i v_src_mult2 =
+          _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+      const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+      const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+      const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+      const __m128i b = _mm_srai_epi16(a, 4);
+      const __m128i c = _mm_adds_epi16(frame_data16, b);
+      Store4(dst, _mm_packus_epi16(c, c));
+      dst += stride;
+    } while (++i < tx_height);
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const __m128i v_src = LoadUnaligned16(&source[row + j]);
+        const __m128i v_src_round =
+            _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+        const __m128i v_dst_row = _mm_adds_epi16(v_src_round, v_src_round);
+        const __m128i v_src_mult2 =
+            _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+        const __m128i frame_data = LoadLo8(dst + j);
+        const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+        const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+        const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+        const __m128i b = _mm_srai_epi16(a, 4);
+        const __m128i c = _mm_adds_epi16(frame_data16, b);
+        StoreLo8(dst + j, _mm_packus_epi16(c, c));
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height equal to 32 can be simplified from
+  // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
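+  // _mm_mulhrs_epi16() returns (a * b + (1 << 14)) >> 15, so a multiplier of
+  // (1 << 14) is exactly ((A + 1) >> 1).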
+  const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
+  for (int h = 0; h < 4; ++h) {
+    const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+    const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
+    StoreUnaligned16(&dst[h * step], v_src_mult);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  for (int h = 0; h < 4; ++h) {
+    const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+    // For bitdepth == 8, the identity row clamps to a signed 16-bit value,
+    // so the saturating add here is ok.
+    const __m128i a = _mm_adds_epi16(v_src, v_src);
+    StoreUnaligned16(&dst[h * step], a);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+                                           bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src_round =
+      _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+  const __m128i v_src =
+      _mm_cvtepi16_epi32(_mm_blendv_epi8(v_src0, v_src_round, v_mask));
+  const __m128i v_srcx2 = _mm_add_epi32(v_src, v_src);
+  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+  const __m128i a = _mm_add_epi32(v_srcx2, v_row_shift_add);
+  const __m128i b = _mm_sra_epi32(a, v_row_shift);
+  dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source) {
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+  const __m128i v_eight = _mm_set1_epi16(8);
+  if (tx_width == 4) {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      const __m128i v_src = LoadLo8(&source[row]);
+      const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+      const __m128i frame_data = Load4(dst);
+      const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+      const __m128i b = _mm_srai_epi16(a, 4);
+      const __m128i c = _mm_cvtepu8_epi16(frame_data);
+      const __m128i d = _mm_adds_epi16(c, b);
+      Store4(dst, _mm_packus_epi16(d, d));
+      dst += stride;
+    } while (++i < tx_height);
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const __m128i v_src = LoadUnaligned16(&source[row + j]);
+        const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+        const __m128i frame_data = LoadLo8(dst + j);
+        const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+        const __m128i b = _mm_srai_epi16(a, 4);
+        const __m128i c = _mm_cvtepu8_epi16(frame_data);
+        const __m128i d = _mm_adds_epi16(c, b);
+        StoreLo8(dst + j, _mm_packus_epi16(d, d));
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, int32_t step,
+                                                int shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+  const __m128i v_multiplier_one =
+      _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+  const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+
+  for (int h = 0; h < 4; ++h) {
+    const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+    const __m128i v_src2 = LoadUnaligned16(&dst[h * step + 8]);
+    const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
+    const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
+    const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
+    const __m128i v_src2_round1 = _mm_unpackhi_epi16(v_dual_round, v_src2);
+    const __m128i madd0 = _mm_madd_epi16(v_src_round0, v_multiplier_one);
+    const __m128i madd1 = _mm_madd_epi16(v_src_round1, v_multiplier_one);
+    const __m128i madd20 = _mm_madd_epi16(v_src2_round0, v_multiplier_one);
+    const __m128i madd21 = _mm_madd_epi16(v_src2_round1, v_multiplier_one);
+    const __m128i shift0 = _mm_sra_epi32(madd0, v_shift);
+    const __m128i shift1 = _mm_sra_epi32(madd1, v_shift);
+    const __m128i shift20 = _mm_sra_epi32(madd20, v_shift);
+    const __m128i shift21 = _mm_sra_epi32(madd21, v_shift);
+    StoreUnaligned16(&dst[h * step], _mm_packs_epi32(shift0, shift1));
+    StoreUnaligned16(&dst[h * step + 8], _mm_packs_epi32(shift20, shift21));
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+                                            bool should_round, int shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src_round0 =
+      _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+  const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round0, v_mask);
+  const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+  const __m128i v_multiplier_one =
+      _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+  const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+  const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
+  const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
+  const __m128i b = _mm_sra_epi32(a, v_shift);
+  dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source) {
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+  const __m128i v_eight = _mm_set1_epi16(8);
+  const __m128i v_multiplier =
+      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4));
+
+  if (tx_width == 4) {
+    int i = 0;
+    do {
+      const __m128i v_src = LoadLo8(&source[i * tx_width]);
+      const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+      const __m128i frame_data = Load4(dst);
+      const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+      const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+      const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+      const __m128i b = _mm_srai_epi16(a, 4);
+      const __m128i c = _mm_cvtepu8_epi16(frame_data);
+      const __m128i d = _mm_adds_epi16(c, b);
+      Store4(dst, _mm_packus_epi16(d, d));
+      dst += stride;
+    } while (++i < tx_height);
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const __m128i v_src = LoadUnaligned16(&source[row + j]);
+        const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+        const __m128i frame_data = LoadLo8(dst + j);
+        const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+        const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+        const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+        const __m128i b = _mm_srai_epi16(a, 4);
+        const __m128i c = _mm_cvtepu8_epi16(frame_data);
+        const __m128i d = _mm_adds_epi16(c, b);
+        StoreLo8(dst + j, _mm_packus_epi16(d, d));
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
+                                                  const int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  // When combining the identity32 multiplier with the row shift, the
+  // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
+  for (int h = 0; h < 4; ++h) {
+    for (int i = 0; i < 32; i += 8) {
+      const __m128i v_src = LoadUnaligned16(&dst[h * step + i]);
+      // For bitdepth == 8, the identity row clamps to a signed 16-bit value,
+      // so the saturating add here is ok.
+      const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+      StoreUnaligned16(&dst[h * step + i], v_dst_i);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+                                            int adjusted_tx_height) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+
+  // When combining the identity32 multiplier with the row shift, the
+  // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
+  const __m128i v_dst_0 = _mm_adds_epi16(v_src, v_src);
+  dst[0] = _mm_extract_epi16(v_dst_0, 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source) {
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+  const __m128i v_two = _mm_set1_epi16(2);
+
+  int i = 0;
+  do {
+    const int row = i * tx_width;
+    int j = 0;
+    do {
+      const __m128i v_dst_i = LoadUnaligned16(&source[row + j]);
+      const __m128i frame_data = LoadLo8(dst + j);
+      const __m128i a = _mm_adds_epi16(v_dst_i, v_two);
+      const __m128i b = _mm_srai_epi16(a, 2);
+      const __m128i c = _mm_cvtepu8_epi16(frame_data);
+      const __m128i d = _mm_adds_epi16(c, b);
+      StoreLo8(dst + j, _mm_packus_epi16(d, d));
+      j += 8;
+    } while (j < tx_width);
+    dst += stride;
+  } while (++i < tx_height);
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
+                                       const int start_x, const int start_y,
+                                       const void* LIBGAV1_RESTRICT source,
+                                       const int adjusted_tx_height) {
+  const auto* const src = static_cast<const int16_t*>(source);
+  __m128i s[4], x[4];
+
+  if (adjusted_tx_height == 1) {
+    // Special case: only src[0] is nonzero.
+    //   src[0]  0   0   0
+    //       0   0   0   0
+    //       0   0   0   0
+    //       0   0   0   0
+    //
+    // After the row and column transforms are applied, we have:
+    //       f   h   h   h
+    //       g   i   i   i
+    //       g   i   i   i
+    //       g   i   i   i
+    // where f, g, h, i are computed as follows.
+    int16_t f = (src[0] >> 2) - (src[0] >> 3);
+    const int16_t g = f >> 1;
+    f = f - (f >> 1);
+    const int16_t h = (src[0] >> 3) - (src[0] >> 4);
+    const int16_t i = (src[0] >> 4);
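+    // For example, src[0] = 100 gives f = 7, g = 6, h = 6, i = 6.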
+    s[0] = _mm_set1_epi16(h);
+    s[0] = _mm_insert_epi16(s[0], f, 0);
+    s[1] = _mm_set1_epi16(i);
+    s[1] = _mm_insert_epi16(s[1], g, 0);
+    s[2] = s[3] = s[1];
+  } else {
+    x[0] = LoadLo8(&src[0 * 4]);
+    x[2] = LoadLo8(&src[1 * 4]);
+    x[3] = LoadLo8(&src[2 * 4]);
+    x[1] = LoadLo8(&src[3 * 4]);
+
+    // Row transforms.
+    Transpose4x4_U16(x, x);
+    s[0] = _mm_srai_epi16(x[0], 2);
+    s[2] = _mm_srai_epi16(x[1], 2);
+    s[3] = _mm_srai_epi16(x[2], 2);
+    s[1] = _mm_srai_epi16(x[3], 2);
+    s[0] = _mm_add_epi16(s[0], s[2]);
+    s[3] = _mm_sub_epi16(s[3], s[1]);
+    __m128i e = _mm_sub_epi16(s[0], s[3]);
+    e = _mm_srai_epi16(e, 1);
+    s[1] = _mm_sub_epi16(e, s[1]);
+    s[2] = _mm_sub_epi16(e, s[2]);
+    s[0] = _mm_sub_epi16(s[0], s[1]);
+    s[3] = _mm_add_epi16(s[3], s[2]);
+    Transpose4x4_U16(s, s);
+
+    // Column transforms.
+    s[0] = _mm_add_epi16(s[0], s[2]);
+    s[3] = _mm_sub_epi16(s[3], s[1]);
+    e = _mm_sub_epi16(s[0], s[3]);
+    e = _mm_srai_epi16(e, 1);
+    s[1] = _mm_sub_epi16(e, s[1]);
+    s[2] = _mm_sub_epi16(e, s[2]);
+    s[0] = _mm_sub_epi16(s[0], s[1]);
+    s[3] = _mm_add_epi16(s[3], s[2]);
+  }
+
+  // Store to frame.
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+  for (int row = 0; row < 4; ++row) {
+    const __m128i frame_data = Load4(dst);
+    const __m128i a = _mm_cvtepu8_epi16(frame_data);
+    const __m128i b = _mm_add_epi16(a, s[row]);
+    Store4(dst, _mm_packus_epi16(b, b));
+    dst += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source, TransformType tx_type) {
+  const bool flip_rows =
+      enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+  const __m128i v_eight = _mm_set1_epi16(8);
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
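+  // Each output pixel becomes frame + Round2(residual, 4);
+  // _mm_packus_epi16() saturates the sum to the [0, 255] pixel range.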
+  if (tx_width == 4) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+      const __m128i residual = LoadLo8(&source[row]);
+      const __m128i frame_data = Load4(dst);
+      // Saturate to prevent overflowing int16_t.
+      const __m128i a = _mm_adds_epi16(residual, v_eight);
+      const __m128i b = _mm_srai_epi16(a, 4);
+      const __m128i c = _mm_cvtepu8_epi16(frame_data);
+      const __m128i d = _mm_adds_epi16(c, b);
+      Store4(dst, _mm_packus_epi16(d, d));
+      dst += stride;
+    }
+  } else if (tx_width == 8) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
+      const __m128i residual = LoadUnaligned16(&source[row]);
+      const __m128i frame_data = LoadLo8(dst);
+      // Saturate to prevent overflowing int16_t.
+      const __m128i b = _mm_adds_epi16(residual, v_eight);
+      const __m128i c = _mm_srai_epi16(b, 4);
+      const __m128i d = _mm_cvtepu8_epi16(frame_data);
+      const __m128i e = _mm_adds_epi16(d, c);
+      StoreLo8(dst, _mm_packus_epi16(e, e));
+      dst += stride;
+    }
+  } else {
+    for (int i = 0; i < tx_height; ++i) {
+      const int y = start_y + i;
+      const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+      int j = 0;
+      do {
+        const int x = start_x + j;
+        const __m128i residual = LoadUnaligned16(&source[row + j]);
+        const __m128i residual_hi = LoadUnaligned16(&source[row + j + 8]);
+        const __m128i frame_data = LoadUnaligned16(frame[y] + x);
+        const __m128i b = _mm_adds_epi16(residual, v_eight);
+        const __m128i b_hi = _mm_adds_epi16(residual_hi, v_eight);
+        const __m128i c = _mm_srai_epi16(b, 4);
+        const __m128i c_hi = _mm_srai_epi16(b_hi, 4);
+        const __m128i d = _mm_cvtepu8_epi16(frame_data);
+        const __m128i d_hi = _mm_cvtepu8_epi16(_mm_srli_si128(frame_data, 8));
+        const __m128i e = _mm_adds_epi16(d, c);
+        const __m128i e_hi = _mm_adds_epi16(d_hi, c_hi);
+        StoreUnaligned16(frame[y] + x, _mm_packus_epi16(e, e_hi));
+        j += 16;
+      } while (j < tx_width);
+    }
+  }
+}
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
+  const __m128i word_reverse_8 =
+      _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
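+  // This shuffle mask reverses the order of the eight 16-bit lanes in a
+  // register while keeping the two bytes within each lane in place.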
+  if (tx_width >= 16) {
+    int i = 0;
+    do {
+      // Read 16 shorts.
+      const __m128i v3210 = LoadUnaligned16(&source[i]);
+      const __m128i v7654 = LoadUnaligned16(&source[i + 8]);
+      const __m128i v0123 = _mm_shuffle_epi8(v3210, word_reverse_8);
+      const __m128i v4567 = _mm_shuffle_epi8(v7654, word_reverse_8);
+      StoreUnaligned16(&source[i], v4567);
+      StoreUnaligned16(&source[i + 8], v0123);
+      i += 16;
+    } while (i < tx_width * tx_height);
+  } else if (tx_width == 8) {
+    for (int i = 0; i < 8 * tx_height; i += 8) {
+      const __m128i a = LoadUnaligned16(&source[i]);
+      const __m128i b = _mm_shuffle_epi8(a, word_reverse_8);
+      StoreUnaligned16(&source[i], b);
+    }
+  } else {
+    const __m128i dual_word_reverse_4 =
+        _mm_set_epi32(0x09080b0a, 0x0d0c0f0e, 0x01000302, 0x05040706);
+    // Process two rows per iteration.
+    for (int i = 0; i < 4 * tx_height; i += 8) {
+      const __m128i a = LoadUnaligned16(&source[i]);
+      const __m128i b = _mm_shuffle_epi8(a, dual_word_reverse_4);
+      StoreUnaligned16(&source[i], b);
+    }
+  }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  if (tx_width == 4) {
+    // Process two rows per iteration.
+    int i = 0;
+    do {
+      const __m128i a = LoadUnaligned16(&source[i]);
+      const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+      StoreUnaligned16(&source[i], b);
+      i += 8;
+    } while (i < tx_width * num_rows);
+  } else {
+    int i = 0;
+    do {
+      // The last 32 values of every row are always zero if the |tx_width| is
+      // 64.
+      const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+      int j = 0;
+      do {
+        const __m128i a = LoadUnaligned16(&source[i * tx_width + j]);
+        const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+        StoreUnaligned16(&source[i * tx_width + j], b);
+        j += 8;
+      } while (j < non_zero_width);
+    } while (++i < num_rows);
+  }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+                                    int row_shift) {
+  const __m128i v_row_shift_add = _mm_set1_epi16(row_shift);
+  const __m128i v_row_shift = _mm_cvtepu16_epi64(v_row_shift_add);
+  if (tx_width == 4) {
+    // Process two rows per iteration.
+    int i = 0;
+    do {
+      const __m128i residual = LoadUnaligned16(&source[i]);
+      const __m128i shifted_residual =
+          ShiftResidual(residual, v_row_shift_add, v_row_shift);
+      StoreUnaligned16(&source[i], shifted_residual);
+      i += 8;
+    } while (i < tx_width * num_rows);
+  } else {
+    int i = 0;
+    do {
+      for (int j = 0; j < tx_width; j += 8) {
+        const __m128i residual = LoadUnaligned16(&source[i * tx_width + j]);
+        const __m128i shifted_residual =
+            ShiftResidual(residual, v_row_shift_add, v_row_shift);
+        StoreUnaligned16(&source[i * tx_width + j], shifted_residual);
+      }
+    } while (++i < num_rows);
+  }
+}
+
+void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                 TransformSize tx_size, int adjusted_tx_height,
+                                 void* src_buffer, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  const int row_shift = static_cast<int>(tx_height == 16);
+
+  if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d dct4 rows in parallel.
+    Dct4_SSE4_1<ButterflyRotation_4, false>(src, /*step=*/4,
+                                            /*transpose=*/true);
+  } else {
+    // Process 8 1d dct4 rows in parallel per iteration.
+    int i = 0;
+    do {
+      Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], /*step=*/4,
+                                             /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  if (tx_height == 16) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
+
+void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height,
+                                    void* LIBGAV1_RESTRICT src_buffer,
+                                    int start_x, int start_y,
+                                    void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d dct4 columns in parallel.
+      Dct4_SSE4_1<ButterflyRotation_4, false>(src, tx_width,
+                                              /*transpose=*/false);
+    } else {
+      // Process 8 1d dct4 columns in parallel per iteration.
+      int i = 0;
+      do {
+        Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], tx_width,
+                                               /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
+}
+
+void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                 TransformSize tx_size, int adjusted_tx_height,
+                                 void* src_buffer, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d dct8 rows in parallel.
+    Dct8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+  } else {
+    // Process 8 1d dct8 rows in parallel per iteration.
+    int i = 0;
+    do {
+      Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+                                              /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height,
+                                    void* LIBGAV1_RESTRICT src_buffer,
+                                    int start_x, int start_y,
+                                    void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d dct8 columns in parallel.
+      Dct8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      // Process 8 1d dct8 columns in parallel per iteration.
+      int i = 0;
+      do {
+        Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+                                                /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
+}
+
+void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d dct16 rows in parallel.
+    Dct16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+  } else {
+    int i = 0;
+    do {
+      // Process 8 1d dct16 rows in parallel per iteration.
+      Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+                                               /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  // row_shift is always nonzero here.
+  RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height,
+                                     void* LIBGAV1_RESTRICT src_buffer,
+                                     int start_x, int start_y,
+                                     void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d dct16 columns in parallel.
+      Dct16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      int i = 0;
+      do {
+        // Process 8 1d dct16 columns in parallel per iteration.
+        Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+                                                 /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
+}
+
+void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<32>(src, adjusted_tx_height);
+  }
+  // Process 8 1d dct32 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Dct32_SSE4_1(&src[i * 32], 32, /*transpose=*/true);
+    i += 8;
+  } while (i < adjusted_tx_height);
+  // row_shift is always nonzero here.
+  RowShift<32>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height,
+                                     void* LIBGAV1_RESTRICT src_buffer,
+                                     int start_x, int start_y,
+                                     void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+    // Process 8 1d dct32 columns in parallel per iteration.
+    int i = 0;
+    do {
+      Dct32_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+      i += 8;
+    } while (i < tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
+}
+
+void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<64>(src, adjusted_tx_height);
+  }
+  // Process 8 1d dct64 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Dct64_SSE4_1(&src[i * 64], 64, /*transpose=*/true);
+    i += 8;
+  } while (i < adjusted_tx_height);
+  // row_shift is always nonzero here.
+  RowShift<64>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height,
+                                     void* LIBGAV1_RESTRICT src_buffer,
+                                     int start_x, int start_y,
+                                     void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+    // Process 8 1d dct64 columns in parallel per iteration.
+    int i = 0;
+    do {
+      Dct64_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+      i += 8;
+    } while (i < tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
+}
+
+void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const int row_shift = static_cast<int>(tx_height == 16);
+  const bool should_round = (tx_height == 8);
+
+  if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst4 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Adst4_SSE4_1<false>(&src[i * 4], /*step=*/4, /*transpose=*/true);
+    i += 4;
+  } while (i < adjusted_tx_height);
+
+  if (row_shift != 0) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
+
+void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height,
+                                     void* LIBGAV1_RESTRICT src_buffer,
+                                     int start_x, int start_y,
+                                     void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d adst4 columns in parallel per iteration.
+    int i = 0;
+    do {
+      Adst4_SSE4_1<false>(&src[i], tx_width, /*transpose=*/false);
+      i += 4;
+    } while (i < tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                   tx_width, 4, src, tx_type);
+}
+
+void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d adst8 rows in parallel.
+    Adst8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8,
+                                            /*transpose=*/true);
+  } else {
+    // Process 8 1d adst8 rows in parallel per iteration.
+    int i = 0;
+    do {
+      Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+                                               /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height,
+                                     void* LIBGAV1_RESTRICT src_buffer,
+                                     int start_x, int start_y,
+                                     void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d adst8 columns in parallel.
+      Adst8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      // Process 8 1d adst8 columns in parallel per iteration.
+      int i = 0;
+      do {
+        Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+                                                 /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                   tx_width, 8, src, tx_type);
+}
+
+void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                   TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int /*start_x*/, int /*start_y*/,
+                                   void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d adst16 rows in parallel.
+    Adst16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+  } else {
+    int i = 0;
+    do {
+      // Process 8 1d adst16 rows in parallel per iteration.
+      Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+                                                /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  // row_shift is always nonzero here.
+  RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height,
+                                      void* LIBGAV1_RESTRICT src_buffer,
+                                      int start_x, int start_y,
+                                      void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d adst16 columns in parallel.
+      Adst16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      int i = 0;
+      do {
+        // Process 8 1d adst16 columns in parallel per iteration.
+        Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+                                                  /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                   tx_width, 16, src, tx_type);
+}
+
+void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height, void* src_buffer,
+                                      int /*start_x*/, int /*start_y*/,
+                                      void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize4x4) {
+    return;
+  }
+
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+  if (tx_height < 16) {
+    int i = 0;
+    do {
+      Identity4_SSE4_1<false>(&src[i * 4], /*step=*/4);
+      i += 4;
+    } while (i < adjusted_tx_height);
+  } else {
+    int i = 0;
+    do {
+      Identity4_SSE4_1<true>(&src[i * 4], /*step=*/4);
+      i += 4;
+    } while (i < adjusted_tx_height);
+  }
+}
+
+void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                         TransformSize tx_size,
+                                         int adjusted_tx_height,
+                                         void* LIBGAV1_RESTRICT src_buffer,
+                                         int start_x, int start_y,
+                                         void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  // Special case: Process row calculations during column transform call.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                                   adjusted_tx_height, src);
+    return;
+  }
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                              adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height, void* src_buffer,
+                                      int /*start_x*/, int /*start_y*/,
+                                      void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize8x4) {
+    return;
+  }
+
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from (((A * 2) + 1) >> 1) to A, so those heights (0x18 == 8 | 16) need
+  // no further row work.
+  if ((tx_height & 0x18) != 0) {
+    return;
+  }
+  if (tx_height == 32) {
+    int i = 0;
+    do {
+      Identity8Row32_SSE4_1(&src[i * 8], /*step=*/8);
+      i += 4;
+    } while (i < adjusted_tx_height);
+    return;
+  }
+
+  assert(tx_size == kTransformSize8x4);
+  int i = 0;
+  do {
+    Identity8Row4_SSE4_1(&src[i * 8], /*step=*/8);
+    i += 4;
+  } while (i < adjusted_tx_height);
+}
+
+void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                         TransformSize tx_size,
+                                         int adjusted_tx_height,
+                                         void* LIBGAV1_RESTRICT src_buffer,
+                                         int start_x, int start_y,
+                                         void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+                                     adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int /*start_x*/, int /*start_y*/,
+                                       void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+  if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+  int i = 0;
+  do {
+    Identity16Row_SSE4_1(&src[i * 16], /*step=*/16,
+                         kTransformRowShift[tx_size]);
+    i += 4;
+  } while (i < adjusted_tx_height);
+}
+
+void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                          TransformSize tx_size,
+                                          int adjusted_tx_height,
+                                          void* LIBGAV1_RESTRICT src_buffer,
+                                          int start_x, int start_y,
+                                          void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+                                      adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int /*start_x*/, int /*start_y*/,
+                                       void* /*dst_frame*/) {
+  const int tx_height = kTransformHeight[tx_size];
+  // When combining the identity32 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from ((A * 4) + 2) >> 2 to A, so there is nothing left to do here.
+  if ((tx_height & 0x28) != 0) {  // tx_height == 8 || tx_height == 32
+    return;
+  }
+
+  // Process kTransformSize32x16. The src is always rounded before the
+  // identity transform and shifted by 1 afterwards.
+  auto* src = static_cast<int16_t*>(src_buffer);
+  if (Identity32DcOnly(src, adjusted_tx_height)) {
+    return;
+  }
+
+  assert(tx_size == kTransformSize32x16);
+  ApplyRounding<32>(src, adjusted_tx_height);
+  int i = 0;
+  do {
+    Identity32Row16_SSE4_1(&src[i * 32], /*step=*/32);
+    i += 4;
+  } while (i < adjusted_tx_height);
+}
+
+void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
+                                          TransformSize tx_size,
+                                          int adjusted_tx_height,
+                                          void* LIBGAV1_RESTRICT src_buffer,
+                                          int start_x, int start_y,
+                                          void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                               adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
+                                 int /*adjusted_tx_height*/,
+                                 void* /*src_buffer*/, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+  // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height,
+                                    void* LIBGAV1_RESTRICT src_buffer,
+                                    int start_x, int start_y,
+                                    void* LIBGAV1_RESTRICT dst_frame) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+
+  // Do both row and column transforms in the column-transform pass.
+  // Process 4 1d wht4 rows and columns in parallel.
+  const auto* src = static_cast<int16_t*>(src_buffer);
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  Wht4_SSE4_1(frame, start_x, start_y, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
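+// A sketch of how a caller reaches one of the entries registered below
+// (illustrative only; the indices mirror the assignments in this function):
+//
+//   const Dsp* const dsp = GetDspTable(kBitdepth8);
+//   const auto row_func =
+//       dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow];
+//   row_func(tx_type, tx_size, adjusted_tx_height, src, start_x, start_y,
+//            frame);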
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+
+  // Maximum transform size for Dct is 64.
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dDct)
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+      Dct4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+      Dct4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dDct)
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+      Dct8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+      Dct8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dDct)
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+      Dct16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+      Dct16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize32_Transform1dDct)
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+      Dct32TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+      Dct32TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize64_Transform1dDct)
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+      Dct64TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+      Dct64TransformLoopColumn_SSE4_1;
+#endif
+
+  // Maximum transform size for Adst is 16.
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dAdst)
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+      Adst4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+      Adst4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dAdst)
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+      Adst8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+      Adst8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dAdst)
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+      Adst16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+      Adst16TransformLoopColumn_SSE4_1;
+#endif
+
+  // Maximum transform size for Identity is 32.
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dIdentity)
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+      Identity4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+      Identity4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dIdentity)
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+      Identity8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+      Identity8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dIdentity)
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+      Identity16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+      Identity16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize32_Transform1dIdentity)
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+      Identity32TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+      Identity32TransformLoopColumn_SSE4_1;
+#endif
+
+  // Maximum transform size for Wht is 4.
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dWht)
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+      Wht4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+      Wht4TransformLoopColumn_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/inverse_transform_sse4.h b/src/dsp/x86/inverse_transform_sse4.h
new file mode 100644
index 0000000..c31e88b
--- /dev/null
+++ b/src/dsp/x86/inverse_transform_sse4.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms; see the defines below for specifics.
+// This function is not thread-safe.
+void InverseTransformInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't already set by a higher level of
+// optimization, signal that the sse4 implementation should be used.
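+//
+// The pattern below reads: if no other implementation has already claimed a
+// function (its LIBGAV1_Dsp8bpp_* macro is still undefined), mark it as
+// provided here so that Init8bpp() installs the SSE4.1 version when the CPU
+// reports SSE4.1 support at runtime (a summary of the dispatch convention;
+// see src/dsp/dsp.h).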
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+#endif  // LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
diff --git a/src/dsp/x86/loop_filter_sse4.cc b/src/dsp/x86/loop_filter_sse4.cc
new file mode 100644
index 0000000..b9da2d5
--- /dev/null
+++ b/src/dsp/x86/loop_filter_sse4.cc
@@ -0,0 +1,2252 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
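+// Updates a running filter sum by adding two incoming taps and subtracting
+// two outgoing taps. A scalar sketch of the per-lane arithmetic
+// (illustrative only):
+//
+//   int16_t FilterAdd2Sub2(int16_t total, int16_t a1, int16_t a2,
+//                          int16_t s1, int16_t s2) {
+//     return total + a1 + a2 - s1 - s2;
+//   }
+//
+// The wider filters below use this to slide their tap window one sample at a
+// time instead of recomputing the full sum for every output.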
+inline __m128i FilterAdd2Sub2(const __m128i& total, const __m128i& a1,
+                              const __m128i& a2, const __m128i& s1,
+                              const __m128i& s2) {
+  __m128i x = _mm_add_epi16(a1, total);
+  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(s1, s2)), a2);
+  return x;
+}
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
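+// Per-byte absolute difference |a - b| without widening: exactly one of the
+// two saturating unsigned subtractions is nonzero in each lane, so OR-ing
+// them yields max(a, b) - min(a, b).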
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+                                  const __m128i& outer_thresh) {
+  const __m128i fe = _mm_set1_epi8(static_cast<int8_t>(0xfe));
+  //  abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+  const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+  const __m128i a = _mm_adds_epu8(abs_pmq, abs_pmq);
+  const __m128i b = _mm_srli_epi16(_mm_and_si128(abs_pmq, fe), 1);
+  const __m128i c = _mm_adds_epu8(a, _mm_srli_si128(b, 4));
+  return _mm_subs_epu8(c, outer_thresh);
+}
+
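+// "High edge variance" test: a lane signals a strong edge when
+// max(|p1 - p0|, |q1 - q0|) > hev_thresh. The qp registers keep the p
+// samples in the low 4 bytes and the q samples in the next 4, so one
+// srli/max pair folds both sides into a single comparison. When hev is set,
+// Filter4 adds the p1 - q1 term but leaves p1/q1 themselves unmodified.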
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+                   const __m128i& hev_thresh) {
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq =
+      _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4));
+  const __m128i hev_mask0 = _mm_cvtepu8_epi16(max_pq);
+  const __m128i hev_mask1 = _mm_cmpgt_epi16(hev_mask0, hev_thresh);
+  const __m128i hev_mask = _mm_packs_epi16(hev_mask1, hev_mask1);
+  return hev_mask;
+}
+
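+// AddShift3/AddShift1 emulate a signed per-byte shift (which SSE4.1 lacks):
+// _mm_unpacklo_epi8(c, c) duplicates each byte into a 16-bit lane with the
+// value in the high byte, so an arithmetic 16-bit shift by 8 + n followed by
+// a signed pack computes (int8_t)lane >> n for every byte.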
+inline __m128i AddShift3(const __m128i& a, const __m128i& b) {
+  const __m128i c = _mm_adds_epi8(a, b);
+  const __m128i d = _mm_unpacklo_epi8(c, c);
+  const __m128i e = _mm_srai_epi16(d, 11); /* >> 3 */
+  return _mm_packs_epi16(e, e);
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+  const __m128i c = _mm_adds_epi8(a, b);
+  const __m128i d = _mm_unpacklo_epi8(c, c);
+  const __m128i e = _mm_srai_epi16(d, 9); /* >> 1 */
+  return _mm_packs_epi16(e, e);
+}
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
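+// The filter-needed decision pairs the outer edge test (see
+// CheckOuterThreshF4) with an inner smoothness test,
+// max(|p1 - p0|, |q1 - q0|) <= inner_thresh. The saturating subtractions
+// leave zero bytes wherever a value is within threshold, so the final
+// compare against zero yields an all-ones mask for lanes to be filtered.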
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+                            const __m128i& qp1, const __m128i& qp0,
+                            const __m128i& outer_thresh,
+                            const __m128i& inner_thresh) {
+  const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i inner_mask = _mm_subs_epu8(
+      _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4)), inner_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+  const __m128i b = _mm_cmpeq_epi8(a, zero);
+  return b;
+}
+
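+// A scalar sketch of the narrow AV1 filter computed below, with samples
+// centered to int8_t by the 0x80 xor (illustrative only; the needs-filter
+// mask additionally zeroes the delta for untouched columns):
+//
+//   a  = hev ? Clip3(p1 - q1, -128, 127) : 0;
+//   a  = Clip3(a + 3 * (q0 - p0), -128, 127);
+//   f1 = Clip3(a + 4, -128, 127) >> 3;     // adjusts q0
+//   f2 = Clip3(a + 3, -128, 127) >> 3;     // adjusts p0
+//   q0' = q0 - f1;
+//   p0' = p0 + f2;
+//   if (!hev) {  // also nudge the outer pair
+//     a3 = (f1 + 1) >> 1;
+//     q1' = q1 - a3;
+//     p1' = p1 + a3;
+//   }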
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+                    __m128i* oqp0, const __m128i& mask, const __m128i& hev) {
+  const __m128i t80 = _mm_set1_epi8(static_cast<int8_t>(0x80));
+  const __m128i t1 = _mm_set1_epi8(0x1);
+  const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+  const __m128i qps1qps0 = _mm_xor_si128(qp1qp0, t80);
+  const __m128i ps1qs0 = _mm_shuffle_epi32(qps1qps0, 0x09);
+  const __m128i qs1ps0 = _mm_shuffle_epi32(qps1qps0, 0x0c);
+  const __m128i _hev = _mm_unpacklo_epi32(hev, hev);
+  const __m128i x = _mm_subs_epi8(ps1qs0, qs1ps0);
+  __m128i a = _mm_and_si128(_mm_srli_si128(x, 4), _hev);
+
+  a = _mm_adds_epi8(a, x);
+  a = _mm_adds_epi8(a, x);
+  a = _mm_adds_epi8(a, x);
+  a = _mm_and_si128(a, mask);
+  a = _mm_unpacklo_epi32(a, a);
+
+  const __m128i t4t3 = _mm_set_epi32(0x0, 0x0, 0x04040404, 0x03030303);
+  const __m128i a1a2 = AddShift3(a, t4t3);
+  const __m128i a1a1 = _mm_shuffle_epi32(a1a2, 0x55);
+  const __m128i a3a3 = _mm_andnot_si128(_hev, AddShift1(a1a1, t1));
+  // 1 1 1 1 -1 -1 -1 -1 1 1 1 1 -1 -1 -1 -1 (low byte first)
+  const __m128i adjust_sign_for_add =
+      _mm_unpacklo_epi32(t1, _mm_cmpeq_epi8(t1, t1));
+
+  const __m128i a3a3a1a2 = _mm_unpacklo_epi64(a1a2, a3a3);
+  const __m128i ma3a3ma1a2 = _mm_sign_epi8(a3a3a1a2, adjust_sign_for_add);
+
+  const __m128i b = _mm_adds_epi8(qps1qps0, ma3a3ma1a2);
+  const __m128i c = _mm_xor_si128(b, t80);
+
+  *oqp0 = c;
+  *oqp1 = _mm_srli_si128(c, 8);
+}
+
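+// The entry points below broadcast the scalar thresholds to all lanes, e.g.
+// _mm_shuffle_epi8(_mm_cvtsi32_si128(t), zero) replicates byte 0 of t across
+// the register, since an all-zero pshufb control selects byte 0 everywhere.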
+void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+                 int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh), 0);
+
+  const __m128i p1 = Load4(dst - 2 * stride);
+  const __m128i p0 = Load4(dst - 1 * stride);
+  const __m128i q0 = Load4(dst + 0 * stride);
+  const __m128i q1 = Load4(dst + 1 * stride);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  Store4(dst - 2 * stride, oqp1);
+  Store4(dst - 1 * stride, oqp0);
+  Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+  Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose4x4(const __m128i& x0, const __m128i& x1,
+                         const __m128i& x2, const __m128i& x3, __m128i* d0,
+                         __m128i* d1, __m128i* d2, __m128i* d3) {
+  // input
+  // x0   00 01 02 03 xx xx xx xx xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 xx xx xx xx xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 xx xx xx xx xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  // output
+  // d0   00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d1   01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d2   02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d3   03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  *d0 = _mm_unpacklo_epi16(w0, w1);
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d1 = _mm_srli_si128(*d0, 4);
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(*d0, 8);
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(*d0, 12);
+}
+
+void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+               int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  __m128i x0 = Load4(dst - 2 + 0 * stride);
+  __m128i x1 = Load4(dst - 2 + 1 * stride);
+  __m128i x2 = Load4(dst - 2 + 2 * stride);
+  __m128i x3 = Load4(dst - 2 + 3 * stride);
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  const __m128i d0 = _mm_unpacklo_epi16(w0, w1);
+  const __m128i qp1 = _mm_shuffle_epi32(d0, 0xc);
+  const __m128i qp0 = _mm_srli_si128(d0, 4);
+  const __m128i q1q0 = _mm_srli_si128(d0, 8);
+  const __m128i p1p0 = _mm_shuffle_epi32(d0, 0x1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i p1 = oqp1;
+  const __m128i p0 = oqp0;
+  const __m128i q0 = _mm_srli_si128(oqp0, 4);
+  const __m128i q1 = _mm_srli_si128(oqp1, 4);
+
+  Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+  Store4(dst - 2 + 0 * stride, x0);
+  Store4(dst - 2 + 1 * stride, x1);
+  Store4(dst - 2 + 2 * stride, x2);
+  Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+inline __m128i NeedsFilter6(const __m128i& q1q0, const __m128i& p1p0,
+                            const __m128i& qp2, const __m128i& qp1,
+                            const __m128i& qp0, const __m128i& outer_thresh,
+                            const __m128i& inner_thresh) {
+  const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+  const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+  const __m128i inner_mask = _mm_subs_epu8(
+      _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+  const __m128i b = _mm_cmpeq_epi8(a, zero);
+  return b;
+}
+
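+// A segment is "flat" when each sample in the span stays within flat_thresh
+// (1 at 8 bpp) of p0 on the p side and q0 on the q side; flat segments take
+// the stronger low-pass Filter6 instead of the narrow Filter4.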
+inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1,
+                       const __m128i& qp0, const __m128i& flat_thresh) {
+  const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+  const __m128i flat_mask = _mm_subs_epu8(
+      _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+  return a;
+}
+
+inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0,
+                    __m128i* oqp1, __m128i* oqp0) {
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+  const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+  const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+  const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+  const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+  __m128i f6_lo =
+      _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
+
+  f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo);
+
+  f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo),
+                        _mm_add_epi16(qp0_lo, pq0_lo));
+
+  // p2 * 3 + p1 * 2 + p0 * 2 + q0
+  // q2 * 3 + q1 * 2 + q0 * 2 + p0
+  *oqp1 = _mm_srli_epi16(f6_lo, 3);
+  *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+  // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1
+  // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1
+  f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo);
+  *oqp0 = _mm_srli_epi16(f6_lo, 3);
+  *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+                 int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  const __m128i p2 = Load4(dst - 3 * stride);
+  const __m128i p1 = Load4(dst - 2 * stride);
+  const __m128i p0 = Load4(dst - 1 * stride);
+  const __m128i q0 = Load4(dst + 0 * stride);
+  const __m128i q1 = Load4(dst + 1 * stride);
+  const __m128i q2 = Load4(dst + 2 * stride);
+  const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp1_f6;
+    __m128i oqp0_f6;
+
+    Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+  }
+
+  Store4(dst - 2 * stride, oqp1);
+  Store4(dst - 1 * stride, oqp0);
+  Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+  Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1,
+                              const __m128i& x2, const __m128i& x3, __m128i* d0,
+                              __m128i* d1, __m128i* d2, __m128i* d3,
+                              __m128i* d4, __m128i* d5, __m128i* d6,
+                              __m128i* d7) {
+  // input
+  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+  // output
+  // d0   00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d1   01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d2   02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d3   03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d4   04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d5   05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d6   06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d7   07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+  const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+
+  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d0 = ww0;
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d1 = _mm_srli_si128(ww0, 4);
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(ww0, 8);
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(ww0, 12);
+  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d4 = ww1;
+  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d5 = _mm_srli_si128(ww1, 4);
+  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d6 = _mm_srli_si128(ww1, 8);
+  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d7 = _mm_srli_si128(ww1, 12);
+}
+
+void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+               int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  __m128i x0 = LoadLo8(dst - 3 + 0 * stride);
+  __m128i x1 = LoadLo8(dst - 3 + 1 * stride);
+  __m128i x2 = LoadLo8(dst - 3 + 2 * stride);
+  __m128i x3 = LoadLo8(dst - 3 + 3 * stride);
+
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i z0, z1;  // not used
+
+  Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1);
+
+  const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp1_f6;
+    __m128i oqp0_f6;
+
+    Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+  }
+
+  p1 = oqp1;
+  p0 = oqp0;
+  q0 = _mm_srli_si128(oqp0, 4);
+  q1 = _mm_srli_si128(oqp1, 4);
+
+  Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+  Store4(dst - 2 + 0 * stride, x0);
+  Store4(dst - 2 + 1 * stride, x1);
+  Store4(dst - 2 + 2 * stride, x2);
+  Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+inline __m128i NeedsFilter8(const __m128i& q1q0, const __m128i& p1p0,
+                            const __m128i& qp3, const __m128i& qp2,
+                            const __m128i& qp1, const __m128i& qp0,
+                            const __m128i& outer_thresh,
+                            const __m128i& inner_thresh) {
+  const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+  const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq_a = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+  const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2);
+  const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq2);
+  const __m128i inner_mask = _mm_subs_epu8(
+      _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+  const __m128i b = _mm_cmpeq_epi8(a, zero);
+  return b;
+}
+
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+                       const __m128i& qp1, const __m128i& qp0,
+                       const __m128i& flat_thresh) {
+  const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq_a = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+  const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+  const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq0);
+  const __m128i flat_mask = _mm_subs_epu8(
+      _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+  return a;
+}
+
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+                    const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+                    __m128i* oqp0) {
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+  const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+  const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+  const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+  const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+  const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+  const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+  __m128i f8_lo =
+      _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+  f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+  f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+                        _mm_add_epi16(qp0_lo, pq0_lo));
+
+  // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+  // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+  *oqp2 = _mm_srli_epi16(f8_lo, 3);
+  *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+  // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+  // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+  f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+  *oqp1 = _mm_srli_epi16(f8_lo, 3);
+  *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+  // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+  // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+  f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+  *oqp0 = _mm_srli_epi16(f8_lo, 3);
+  *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+                 int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  const __m128i p3 = Load4(dst - 4 * stride);
+  const __m128i p2 = Load4(dst - 3 * stride);
+  const __m128i p1 = Load4(dst - 2 * stride);
+  const __m128i p0 = Load4(dst - 1 * stride);
+  const __m128i q0 = Load4(dst + 0 * stride);
+  const __m128i q1 = Load4(dst + 1 * stride);
+  const __m128i q2 = Load4(dst + 2 * stride);
+  const __m128i q3 = Load4(dst + 3 * stride);
+
+  const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+  const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+                                            v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+    Store4(dst - 3 * stride, oqp2_f8);
+    Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+  }
+
+  Store4(dst - 2 * stride, oqp1);
+  Store4(dst - 1 * stride, oqp0);
+  Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+  Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x8To8x4(const __m128i& x0, const __m128i& x1,
+                              const __m128i& x2, const __m128i& x3,
+                              const __m128i& x4, const __m128i& x5,
+                              const __m128i& x6, const __m128i& x7, __m128i* d0,
+                              __m128i* d1, __m128i* d2, __m128i* d3) {
+  // input
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  // output
+  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+  const __m128i w2 = _mm_unpacklo_epi8(x4, x5);
+  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+  const __m128i w3 = _mm_unpacklo_epi8(x6, x7);
+
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+
+  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  *d0 = _mm_unpacklo_epi32(w4, w5);
+  *d1 = _mm_srli_si128(*d0, 8);
+  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  *d2 = _mm_unpackhi_epi32(w4, w5);
+  *d3 = _mm_srli_si128(*d2, 8);
+}
+
+void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+               int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  __m128i x0 = LoadLo8(dst - 4 + 0 * stride);
+  __m128i x1 = LoadLo8(dst - 4 + 1 * stride);
+  __m128i x2 = LoadLo8(dst - 4 + 2 * stride);
+  __m128i x3 = LoadLo8(dst - 4 + 3 * stride);
+
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+  const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+  const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+                                            v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+    p2 = oqp2_f8;
+    q2 = _mm_srli_si128(oqp2_f8, 4);
+  }
+
+  p1 = oqp1;
+  p0 = oqp0;
+  q0 = _mm_srli_si128(oqp0, 4);
+  q1 = _mm_srli_si128(oqp1, 4);
+
+  Transpose8x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3);
+
+  StoreLo8(dst - 4 + 0 * stride, x0);
+  StoreLo8(dst - 4 + 1 * stride, x1);
+  StoreLo8(dst - 4 + 2 * stride, x2);
+  StoreLo8(dst - 4 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
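+// Filter14 seeds its accumulator with p6 * 7 + 8 (the (qp6 << 3) - qp6 term
+// below, plus rounding) and then derives each successive output by sliding
+// the 13-tap window with FilterAdd2Sub2, so every extra output costs two
+// adds and two subtracts rather than a full re-sum.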
+inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4,
+                     const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+                     const __m128i& qp0, __m128i* oqp5, __m128i* oqp4,
+                     __m128i* oqp3, __m128i* oqp2, __m128i* oqp1,
+                     __m128i* oqp0) {
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i qp6_lo = _mm_cvtepu8_epi16(qp6);
+  const __m128i qp5_lo = _mm_cvtepu8_epi16(qp5);
+  const __m128i qp4_lo = _mm_cvtepu8_epi16(qp4);
+  const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+  const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+  const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+  const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+  const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e);
+  const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e);
+  const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e);
+  const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+  const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+  const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+  __m128i f14_lo =
+      _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo));
+
+  f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo),
+                         _mm_add_epi16(qp5_lo, qp4_lo));
+
+  f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo),
+                         _mm_add_epi16(qp3_lo, qp2_lo));
+
+  f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo),
+                         _mm_add_epi16(qp0_lo, pq0_lo));
+
+  // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0
+  // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0
+  *oqp5 = _mm_srli_epi16(f14_lo, 4);
+  *oqp5 = _mm_packus_epi16(*oqp5, *oqp5);
+
+  // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1
+  // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1
+  f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo);
+  *oqp4 = _mm_srli_epi16(f14_lo, 4);
+  *oqp4 = _mm_packus_epi16(*oqp4, *oqp4);
+
+  // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2
+  // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2
+  f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo);
+  *oqp3 = _mm_srli_epi16(f14_lo, 4);
+  *oqp3 = _mm_packus_epi16(*oqp3, *oqp3);
+
+  // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3
+  // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3
+  f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo);
+  *oqp2 = _mm_srli_epi16(f14_lo, 4);
+  *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+  // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4
+  // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4
+  f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo);
+  *oqp1 = _mm_srli_epi16(f14_lo, 4);
+  *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+  // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5
+  // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5
+  f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo);
+  *oqp0 = _mm_srli_epi16(f14_lo, 4);
+  *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+                  int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  const __m128i p3 = Load4(dst - 4 * stride);
+  const __m128i p2 = Load4(dst - 3 * stride);
+  const __m128i p1 = Load4(dst - 2 * stride);
+  const __m128i p0 = Load4(dst - 1 * stride);
+  const __m128i q0 = Load4(dst + 0 * stride);
+  const __m128i q1 = Load4(dst + 1 * stride);
+  const __m128i q2 = Load4(dst + 2 * stride);
+  const __m128i q3 = Load4(dst + 3 * stride);
+
+  const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+  const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+                                            v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    const __m128i p6 = Load4(dst - 7 * stride);
+    const __m128i p5 = Load4(dst - 6 * stride);
+    const __m128i p4 = Load4(dst - 5 * stride);
+    const __m128i q4 = Load4(dst + 4 * stride);
+    const __m128i q5 = Load4(dst + 5 * stride);
+    const __m128i q6 = Load4(dst + 6 * stride);
+    const __m128i qp6 = _mm_unpacklo_epi32(p6, q6);
+    const __m128i qp5 = _mm_unpacklo_epi32(p5, q5);
+    const __m128i qp4 = _mm_unpacklo_epi32(p4, q4);
+
+    const __m128i v_isflatouter4_mask =
+        IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+    const __m128i v_flat4_mask =
+        _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+    if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
+      __m128i oqp5_f14;
+      __m128i oqp4_f14;
+      __m128i oqp3_f14;
+      __m128i oqp2_f14;
+      __m128i oqp1_f14;
+      __m128i oqp0_f14;
+
+      Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+               &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+      oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+      oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+      oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+      oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+      oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+      oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+
+      Store4(dst - 6 * stride, oqp5_f14);
+      Store4(dst - 5 * stride, oqp4_f14);
+      Store4(dst - 4 * stride, oqp3_f14);
+      Store4(dst + 3 * stride, _mm_srli_si128(oqp3_f14, 4));
+      Store4(dst + 4 * stride, _mm_srli_si128(oqp4_f14, 4));
+      Store4(dst + 5 * stride, _mm_srli_si128(oqp5_f14, 4));
+    }
+
+    Store4(dst - 3 * stride, oqp2_f8);
+    Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+  }
+
+  Store4(dst - 2 * stride, oqp1);
+  Store4(dst - 1 * stride, oqp0);
+  Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+  Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+// Each of the two 8x4 blocks of input data (p7-p0 and q0-q7) is transposed
+// to 4x8, then unpacked into the corresponding qp register (qp7-qp0).
+//
+// p7 p6 p5 p4 p3 p2 p1 p0  q0 q1 q2 q3 q4 q5 q6 q7
+//
+// 00 01 02 03 04 05 06 07  08 09 0a 0b 0c 0d 0e 0f
+// 10 11 12 13 14 15 16 17  18 19 1a 1b 1c 1d 1e 1f
+// 20 21 22 23 24 25 26 27  28 29 2a 2b 2c 2d 2e 2f
+// 30 31 32 33 34 35 36 37  38 39 3a 3b 3c 3d 3e 3f
+
+inline void DualTranspose8x4To4x8(const __m128i& x0, const __m128i& x1,
+                                  const __m128i& x2, const __m128i& x3,
+                                  __m128i* q0p0, __m128i* q1p1, __m128i* q2p2,
+                                  __m128i* q3p3, __m128i* q4p4, __m128i* q5p5,
+                                  __m128i* q6p6, __m128i* q7p7) {
+  // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+  const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+  // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+  const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+  // 08 18 09 19 0a 1a 0b 1b  0c 1c 0d 1d 0e 1e 0f 1f
+  const __m128i w2 = _mm_unpackhi_epi8(x0, x1);
+  // 28 38 29 39 2a 3a 2b 3b  2c 3c 2d 3d 2e 3e 2f 3f
+  const __m128i w3 = _mm_unpackhi_epi8(x2, x3);
+  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+  // 08 18 28 38 09 19 29 39  0a 1a 2a 3a 0b 1b 2b 3b
+  const __m128i ww2 = _mm_unpacklo_epi16(w2, w3);
+  // 0c 1c 2c 3c 0d 1d 2d 3d  0e 1e 2e 3e 0f 1f 2f 3f
+  const __m128i ww3 = _mm_unpackhi_epi16(w2, w3);
+  // 00 10 20 30  0f 1f 2f 3f  xx xx xx xx xx xx xx xx
+  *q7p7 = _mm_unpacklo_epi32(ww0, _mm_srli_si128(ww3, 12));
+  // 01 11 21 31  0e 1e 2e 3e  xx xx xx xx xx xx xx xx
+  *q6p6 = _mm_unpackhi_epi32(_mm_slli_si128(ww0, 4), ww3);
+  // 02 12 22 32  0d 1d 2d 3d  xx xx xx xx xx xx xx xx
+  *q5p5 = _mm_unpackhi_epi32(ww0, _mm_slli_si128(ww3, 4));
+  // 03 13 23 33  0c 1c 2c 3c  xx xx xx xx xx xx xx xx
+  *q4p4 = _mm_unpacklo_epi32(_mm_srli_si128(ww0, 12), ww3);
+  // 04 14 24 34  0b 1b 2b 3b  xx xx xx xx xx xx xx xx
+  *q3p3 = _mm_unpacklo_epi32(ww1, _mm_srli_si128(ww2, 12));
+  // 05 15 25 35  0a 1a 2a 3a  xx xx xx xx xx xx xx xx
+  *q2p2 = _mm_unpackhi_epi32(_mm_slli_si128(ww1, 4), ww2);
+  // 06 16 26 36  09 19 29 39  xx xx xx xx xx xx xx xx
+  *q1p1 = _mm_unpackhi_epi32(ww1, _mm_slli_si128(ww2, 4));
+  // 07 17 27 37  08 18 28 38  xx xx xx xx xx xx xx xx
+  *q0p0 = _mm_unpacklo_epi32(_mm_srli_si128(ww1, 12), ww2);
+}
+
+inline void DualTranspose4x8To8x4(const __m128i& qp7, const __m128i& qp6,
+                                  const __m128i& qp5, const __m128i& qp4,
+                                  const __m128i& qp3, const __m128i& qp2,
+                                  const __m128i& qp1, const __m128i& qp0,
+                                  __m128i* x0, __m128i* x1, __m128i* x2,
+                                  __m128i* x3) {
+  // qp7: 00 10 20 30  0f 1f 2f 3f  xx xx xx xx xx xx xx xx
+  // qp6: 01 11 21 31  0e 1e 2e 3e  xx xx xx xx xx xx xx xx
+  // qp5: 02 12 22 32  0d 1d 2d 3d  xx xx xx xx xx xx xx xx
+  // qp4: 03 13 23 33  0c 1c 2c 3c  xx xx xx xx xx xx xx xx
+  // qp3: 04 14 24 34  0b 1b 2b 3b  xx xx xx xx xx xx xx xx
+  // qp2: 05 15 25 35  0a 1a 2a 3a  xx xx xx xx xx xx xx xx
+  // qp1: 06 16 26 36  09 19 29 39  xx xx xx xx xx xx xx xx
+  // qp0: 07 17 27 37  08 18 28 38  xx xx xx xx xx xx xx xx
+
+  // 00 01 10 11 20 21 30 31  0f 0e 1f 1e 2f 2e 3f 3e
+  const __m128i w0 = _mm_unpacklo_epi8(qp7, qp6);
+  // 02 03 12 13 22 23 32 33  xx xx xx xx xx xx xx xx
+  const __m128i w1 = _mm_unpacklo_epi8(qp5, qp4);
+  // 04 05 14 15 24 25 34 35  xx xx xx xx xx xx xx xx
+  const __m128i w2 = _mm_unpacklo_epi8(qp3, qp2);
+  // 06 07 16 17 26 27 36 37  xx xx xx xx xx xx xx xx
+  const __m128i w3 = _mm_unpacklo_epi8(qp1, qp0);
+  // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+  const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+  // 04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37
+  const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+  const __m128i d0 = _mm_unpacklo_epi32(w4, w5);
+  // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+  const __m128i d2 = _mm_unpackhi_epi32(w4, w5);
+  // xx xx xx xx xx xx xx xx 08 09 18 19 28 29 38 39
+  const __m128i w10 = _mm_unpacklo_epi8(qp0, qp1);
+  // xx xx xx xx xx xx xx xx 0a 0b 1a 1b 2a 2b 3a 3b
+  const __m128i w11 = _mm_unpacklo_epi8(qp2, qp3);
+  // xx xx xx xx xx xx xx xx 0c 0d 1c 1d 2c 2d 3c 3d
+  const __m128i w12 = _mm_unpacklo_epi8(qp4, qp5);
+  // xx xx xx xx xx xx xx xx 0e 0f 1e 1f 2e 2f 3e 3f
+  const __m128i w13 = _mm_unpacklo_epi8(qp6, qp7);
+  // 08 09 0a 0b 18 19 1a 1b 28 29 2a 2b 38 39 3a 3b
+  const __m128i w14 = _mm_unpackhi_epi16(w10, w11);
+  // 0c 0d 0e 0f 1c 1d 1e 1f 2c 2d 2e 2f 3c 3d 3e 3f
+  const __m128i w15 = _mm_unpackhi_epi16(w12, w13);
+  // 08 09 0a 0b 0c 0d 0e 0f 18 19 1a 1b 1c 1d 1e 1f
+  const __m128i d1 = _mm_unpacklo_epi32(w14, w15);
+  // 28 29 2a 2b 2c 2d 2e 2f 38 39 3a 3b 3c 3d 3e 3f
+  const __m128i d3 = _mm_unpackhi_epi32(w14, w15);
+
+  // p7 p6 p5 p4 p3 p2 p1 p0  q0 q1 q2 q3 q4 q5 q6 q7
+  //
+  // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+  *x0 = _mm_unpacklo_epi64(d0, d1);
+  // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+  *x1 = _mm_unpackhi_epi64(d0, d1);
+  // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+  *x2 = _mm_unpacklo_epi64(d2, d3);
+  // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+  *x3 = _mm_unpackhi_epi64(d2, d3);
+}
+
+void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+                int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride);
+  __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride);
+  __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride);
+  __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride);
+
+  __m128i qp7, qp6, qp5, qp4, qp3, qp2, qp1, qp0;
+
+  DualTranspose8x4To4x8(x0, x1, x2, x3, &qp0, &qp1, &qp2, &qp3, &qp4, &qp5,
+                        &qp6, &qp7);
+
+  const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+  const __m128i q1q0 = _mm_shuffle_epi32(qp1qp0, 0x0d);
+  const __m128i p1p0 = _mm_shuffle_epi32(qp1qp0, 0x08);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+                                            v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    const __m128i v_isflatouter4_mask =
+        IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+    const __m128i v_flat4_mask =
+        _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+    if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
+      __m128i oqp5_f14;
+      __m128i oqp4_f14;
+      __m128i oqp3_f14;
+      __m128i oqp2_f14;
+      __m128i oqp1_f14;
+      __m128i oqp0_f14;
+
+      Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+               &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+      oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+      oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+      oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+      oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+      oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+      oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+      qp3 = oqp3_f14;
+      qp4 = oqp4_f14;
+      qp5 = oqp5_f14;
+    }
+    qp2 = oqp2_f8;
+  }
+
+  DualTranspose4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1, &x2,
+                        &x3);
+
+  StoreUnaligned16(dst - 8 + 0 * stride, x0);
+  StoreUnaligned16(dst - 8 + 1 * stride, x1);
+  StoreUnaligned16(dst - 8 + 2 * stride, x2);
+  StoreUnaligned16(dst - 8 + 3 * stride, x3);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = Horizontal4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = Horizontal6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = Horizontal8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Horizontal14;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical)
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical)
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical)
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical)
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = Vertical14;
+#endif
+}
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+namespace high_bitdepth {
+namespace {
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth>
+struct LoopFilterFuncs_SSE4_1 {
+  LoopFilterFuncs_SSE4_1() = delete;
+
+  static constexpr int kThreshShift = bitdepth - 8;
+
+  static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+                         int inner_thresh, int hev_thresh);
+  static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+                           int inner_thresh, int hev_thresh);
+};
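+// kThreshShift accounts for the loop filter thresholds being specified in the
+// 8-bit domain: each threshold is scaled by bitdepth - 8 before being
+// broadcast. The stride passed to these functions is in bytes and is halved
+// internally to step over uint16_t samples.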
+
+inline __m128i Clamp(const __m128i& min, const __m128i& max,
+                     const __m128i& val) {
+  const __m128i a = _mm_min_epi16(val, max);
+  const __m128i b = _mm_max_epi16(a, min);
+  return b;
+}
+
+inline __m128i AddShift3(const __m128i& a, const __m128i& b,
+                         const __m128i& vmin, const __m128i& vmax) {
+  const __m128i c = _mm_adds_epi16(a, b);
+  const __m128i d = Clamp(vmin, vmax, c);
+  const __m128i e = _mm_srai_epi16(d, 3); /* >> 3 */
+  return e;
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+  const __m128i c = _mm_adds_epi16(a, b);
+  const __m128i e = _mm_srai_epi16(c, 1); /* >> 1 */
+  return e;
+}
+
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+}
+
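+// High edge variance: per column this computes
+// max(|p1 - p0|, |q1 - q0|) > hev_thresh. The p and q differences live in
+// opposite 64-bit halves of the packed register, so shifting down by 8 bytes
+// and taking the max folds both sides into a single comparison.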
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+                   const __m128i& hev_thresh) {
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq =
+      _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+  const __m128i hev_mask = _mm_cmpgt_epi16(max_pq, hev_thresh);
+  return hev_mask;
+}
+
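+// Evaluates the outer threshold with a saturating trick: _mm_subs_epu16
+// clamps at zero, so a result lane is zero exactly when the left-hand side is
+// <= outer_thresh. Callers then compare against zero with _mm_cmpeq_epi16 to
+// turn "within threshold" into an all-ones lane mask, avoiding a signed
+// comparison.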
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+                                  const __m128i& outer_thresh) {
+  // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+  const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+  const __m128i a = _mm_adds_epu16(abs_pmq, abs_pmq);
+  const __m128i b = _mm_srli_epi16(abs_pmq, 1);
+  const __m128i c = _mm_adds_epu16(a, _mm_srli_si128(b, 8));
+  return _mm_subs_epu16(c, outer_thresh);
+}
+
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+                            const __m128i& qp1, const __m128i& qp0,
+                            const __m128i& outer_thresh,
+                            const __m128i& inner_thresh) {
+  const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_abs_qp1mqp =
+      _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+  const __m128i inner_mask = _mm_subs_epu16(max_abs_qp1mqp, inner_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+  const __m128i b = _mm_cmpeq_epi16(a, zero);
+  return b;
+}
+
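+// A scalar sketch of the 4-tap filter below, per column, working in the
+// signed domain s = x - (1 << (bitdepth - 1)):
+//   a  = clamp(p1s - q1s) & hev
+//   a  = clamp(a + 3 * (q0s - p0s)) & mask
+//   a1 = clamp(a + 4) >> 3
+//   a2 = clamp(a + 3) >> 3
+//   a3 = ((a1 + 1) >> 1) & ~hev
+//   p1s += a3;  p0s += a2;  q0s -= a1;  q1s -= a3
+// with a final clamp before converting back to the unsigned domain.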
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+                    __m128i* oqp0, const __m128i& mask, const __m128i& hev,
+                    int bitdepth) {
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  const __m128i t80 = _mm_set1_epi16(static_cast<int16_t>(1 << (bitdepth - 1)));
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  const __m128i vmin = _mm_subs_epi16(_mm_setzero_si128(), t80);
+  const __m128i vmax = _mm_subs_epi16(t80, t1);
+  const __m128i ps1 = _mm_subs_epi16(qp1, t80);
+  const __m128i ps0 = _mm_subs_epi16(qp0, t80);
+  const __m128i qs0 = _mm_srli_si128(ps0, 8);
+  const __m128i qs1 = _mm_srli_si128(ps1, 8);
+
+  __m128i a = _mm_subs_epi16(ps1, qs1);
+  a = _mm_and_si128(Clamp(vmin, vmax, a), hev);
+
+  const __m128i x = _mm_subs_epi16(qs0, ps0);
+  a = _mm_adds_epi16(a, x);
+  a = _mm_adds_epi16(a, x);
+  a = _mm_adds_epi16(a, x);
+  a = _mm_and_si128(Clamp(vmin, vmax, a), mask);
+
+  const __m128i a1 = AddShift3(a, t4, vmin, vmax);
+  const __m128i a2 = AddShift3(a, t3, vmin, vmax);
+  const __m128i a3 = _mm_andnot_si128(hev, AddShift1(a1, t1));
+
+  const __m128i ops1 = _mm_adds_epi16(ps1, a3);
+  const __m128i ops0 = _mm_adds_epi16(ps0, a2);
+  const __m128i oqs0 = _mm_subs_epi16(qs0, a1);
+  const __m128i oqs1 = _mm_subs_epi16(qs1, a3);
+
+  __m128i oqps1 = _mm_unpacklo_epi64(ops1, oqs1);
+  __m128i oqps0 = _mm_unpacklo_epi64(ops0, oqs0);
+
+  oqps1 = Clamp(vmin, vmax, oqps1);
+  oqps0 = Clamp(vmin, vmax, oqps0);
+
+  *oqp1 = _mm_adds_epi16(oqps1, t80);
+  *oqp0 = _mm_adds_epi16(oqps0, t80);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal4(void* dest,
+                                                   ptrdiff_t stride8,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+  const __m128i p1 = LoadLo8(dst - 2 * stride);
+  const __m128i p0 = LoadLo8(dst - 1 * stride);
+  const __m128i qp0 = LoadHi8(p0, dst + 0 * stride);
+  const __m128i qp1 = LoadHi8(p1, dst + 1 * stride);
+  const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1);
+  const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1);
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+  StoreLo8(dst - 2 * stride, oqp1);
+  StoreLo8(dst - 1 * stride, oqp0);
+  StoreHi8(dst + 0 * stride, oqp0);
+  StoreHi8(dst + 1 * stride, oqp1);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical4(void* dest, ptrdiff_t stride8,
+                                                 int outer_thresh,
+                                                 int inner_thresh,
+                                                 int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+  const __m128i x0 = LoadLo8(dst - 2 + 0 * stride);
+  const __m128i x1 = LoadLo8(dst - 2 + 1 * stride);
+  const __m128i x2 = LoadLo8(dst - 2 + 2 * stride);
+  const __m128i x3 = LoadLo8(dst - 2 + 3 * stride);
+  // 00 10 01 11 02 12 03 13
+  const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+  // 20 30 21 31 22 32 23 33
+  const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+  // 00 10 20 30 01 11 21 31   p0p1
+  const __m128i a = _mm_unpacklo_epi32(w0, w1);
+  const __m128i p1p0 = _mm_shuffle_epi32(a, 0x4e);
+  // 02 12 22 32 03 13 23 33   q1q0
+  const __m128i q1q0 = _mm_unpackhi_epi32(w0, w1);
+  const __m128i qp1 = _mm_unpackhi_epi64(p1p0, q1q0);
+  const __m128i qp0 = _mm_unpacklo_epi64(p1p0, q1q0);
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+  // 00 10 01 11 02 12 03 13
+  const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0);
+  // 20 30 21 31 22 32 23 33
+  const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1);
+  // 00 10 20 30 01 11 21 31
+  const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3);
+  // 02 12 22 32 03 13 23 33
+  const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3);
+
+  StoreLo8(dst - 2 + 0 * stride, op0p1);
+  StoreHi8(dst - 2 + 1 * stride, op0p1);
+  StoreLo8(dst - 2 + 2 * stride, oq1q0);
+  StoreHi8(dst - 2 + 3 * stride, oq1q0);
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+inline __m128i CheckOuterThreshF6(const __m128i& qp1, const __m128i& qp0,
+                                  const __m128i& outer_thresh) {
+  // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+  const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1);
+  const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1);
+  return CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+}
+
+inline __m128i NeedsFilter6(const __m128i& qp2, const __m128i& qp1,
+                            const __m128i& qp0, const __m128i& outer_thresh,
+                            const __m128i& inner_thresh) {
+  const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh);
+  const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0);
+  const __m128i inner_mask = _mm_subs_epu16(
+      _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+  const __m128i b = _mm_cmpeq_epi16(a, zero);
+  return b;
+}
+
+inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1,
+                       const __m128i& qp0, const __m128i& flat_thresh) {
+  const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0);
+  const __m128i flat_mask = _mm_subs_epu16(
+      _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_cmpeq_epi16(flat_mask, zero);
+  return a;
+}
+
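+// FilterAdd2Sub2(f, a, b, c, d) computes f + a + b - c - d: each new output
+// slides the running sum by adding the two incoming taps and removing the two
+// outgoing ones, so an extra output costs four adds instead of a full
+// recomputation. The 5-tap weights sum to 8, hence the >> 3 below.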
+inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0,
+                    __m128i* oqp1, __m128i* oqp0) {
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i qp2_lo = qp2;
+  const __m128i qp1_lo = qp1;
+  const __m128i qp0_lo = qp0;
+  const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+  const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+  __m128i f6_lo =
+      _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
+
+  f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo);
+
+  f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo),
+                        _mm_add_epi16(qp0_lo, pq0_lo));
+
+  // p2 * 3 + p1 * 2 + p0 * 2 + q0
+  // q2 * 3 + q1 * 2 + q0 * 2 + p0
+  *oqp1 = _mm_srli_epi16(f6_lo, 3);
+
+  // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1
+  // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1
+  f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo);
+  *oqp0 = _mm_srli_epi16(f6_lo, 3);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal6(void* dest,
+                                                   ptrdiff_t stride8,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_flat_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+  const __m128i p2 = LoadLo8(dst - 3 * stride);
+  const __m128i p1 = LoadLo8(dst - 2 * stride);
+  const __m128i p0 = LoadLo8(dst - 1 * stride);
+  const __m128i q0 = LoadLo8(dst + 0 * stride);
+  const __m128i q1 = LoadLo8(dst + 1 * stride);
+  const __m128i q2 = LoadLo8(dst + 2 * stride);
+
+  const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+  const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
+  const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp1_f6;
+    __m128i oqp0_f6;
+
+    Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+  }
+
+  StoreLo8(dst - 2 * stride, oqp1);
+  StoreLo8(dst - 1 * stride, oqp0);
+  StoreHi8(dst + 0 * stride, oqp0);
+  StoreHi8(dst + 1 * stride, oqp1);
+}
+
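+// The transpose is built from two unpack stages: 16-bit unpacks interleave
+// row pairs, then 32-bit unpacks gather four-row columns. Each output
+// register holds one column in its low 64 bits; the upper halves are
+// don't-cares, which suffices because callers immediately repack the results
+// with _mm_unpacklo_epi64.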
+inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1,
+                              const __m128i& x2, const __m128i& x3, __m128i* d0,
+                              __m128i* d1, __m128i* d2, __m128i* d3,
+                              __m128i* d4, __m128i* d5, __m128i* d6,
+                              __m128i* d7) {
+  // input
+  // x0   00 01 02 03 04 05 06 07
+  // x1   10 11 12 13 14 15 16 17
+  // x2   20 21 22 23 24 25 26 27
+  // x3   30 31 32 33 34 35 36 37
+  // output
+  // 00 10 20 30 xx xx xx xx
+  // 01 11 21 31 xx xx xx xx
+  // 02 12 22 32 xx xx xx xx
+  // 03 13 23 33 xx xx xx xx
+  // 04 14 24 34 xx xx xx xx
+  // 05 15 25 35 xx xx xx xx
+  // 06 16 26 36 xx xx xx xx
+  // 07 17 27 37 xx xx xx xx
+
+  // 00 10 01 11 02 12 03 13
+  const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+  // 20 30 21 31 22 32 23 33
+  const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+  // 04 14 05 15 06 16 07 17
+  const __m128i w2 = _mm_unpackhi_epi16(x0, x1);
+  // 24 34 25 35 26 36 27 37
+  const __m128i w3 = _mm_unpackhi_epi16(x2, x3);
+
+  // 00 10 20 30 01 11 21 31
+  const __m128i ww0 = _mm_unpacklo_epi32(w0, w1);
+  // 04 14 24 34 05 15 25 35
+  const __m128i ww1 = _mm_unpacklo_epi32(w2, w3);
+  // 02 12 22 32 03 13 23 33
+  const __m128i ww2 = _mm_unpackhi_epi32(w0, w1);
+  // 06 16 26 36 07 17 27 37
+  const __m128i ww3 = _mm_unpackhi_epi32(w2, w3);
+
+  // 00 10 20 30 xx xx xx xx
+  *d0 = ww0;
+  // 01 11 21 31 xx xx xx xx
+  *d1 = _mm_srli_si128(ww0, 8);
+  // 02 12 22 32 xx xx xx xx
+  *d2 = ww2;
+  // 03 13 23 33 xx xx xx xx
+  *d3 = _mm_srli_si128(ww2, 8);
+  // 04 14 24 34 xx xx xx xx
+  *d4 = ww1;
+  // 05 15 25 35 xx xx xx xx
+  *d5 = _mm_srli_si128(ww1, 8);
+  // 06 16 26 36 xx xx xx xx
+  *d6 = ww3;
+  // 07 17 27 37 xx xx xx xx
+  *d7 = _mm_srli_si128(ww3, 8);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical6(void* dest, ptrdiff_t stride8,
+                                                 int outer_thresh,
+                                                 int inner_thresh,
+                                                 int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_flat_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+  __m128i x0 = LoadUnaligned16(dst - 3 + 0 * stride);
+  __m128i x1 = LoadUnaligned16(dst - 3 + 1 * stride);
+  __m128i x2 = LoadUnaligned16(dst - 3 + 2 * stride);
+  __m128i x3 = LoadUnaligned16(dst - 3 + 3 * stride);
+
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i z0, z1;  // not used
+
+  Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1);
+
+  const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+  const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
+  const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp1_f6;
+    __m128i oqp0_f6;
+
+    Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+  }
+
+  // 00 10 01 11 02 12 03 13
+  const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0);
+  // 20 30 21 31 22 32 23 33
+  const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1);
+  // 00 10 20 30 01 11 21 31
+  const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3);
+  // 02 12 22 32 03 13 23 33
+  const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3);
+
+  StoreLo8(dst - 2 + 0 * stride, op0p1);
+  StoreHi8(dst - 2 + 1 * stride, op0p1);
+  StoreLo8(dst - 2 + 2 * stride, oq1q0);
+  StoreHi8(dst - 2 + 3 * stride, oq1q0);
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+inline __m128i NeedsFilter8(const __m128i& qp3, const __m128i& qp2,
+                            const __m128i& qp1, const __m128i& qp0,
+                            const __m128i& outer_thresh,
+                            const __m128i& inner_thresh) {
+  const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh);
+  const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq_a = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0);
+  const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2);
+  const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq2);
+  const __m128i inner_mask = _mm_subs_epu16(
+      _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+  const __m128i b = _mm_cmpeq_epi16(a, zero);
+  return b;
+}
+
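+// A segment is "flat" when every sample in the filter footprint is within
+// 1 << (bitdepth - 8) of the nearest edge sample (p0 or q0). The wide filters
+// are applied only on flat segments, so the strong smoothing never crosses a
+// genuine edge.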
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+                       const __m128i& qp1, const __m128i& qp0,
+                       const __m128i& flat_thresh) {
+  const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq_a = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0);
+  const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+  const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq0);
+  const __m128i flat_mask = _mm_subs_epu16(
+      _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_cmpeq_epi16(flat_mask, zero);
+  return a;
+}
+
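+// As in Filter6, the first 7-tap sum (weights totalling 8) is computed
+// directly and the remaining outputs are derived from it with FilterAdd2Sub2,
+// each update shifting the window one sample toward the edge.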
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+                    const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+                    __m128i* oqp0) {
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i qp3_lo = qp3;
+  const __m128i qp2_lo = qp2;
+  const __m128i qp1_lo = qp1;
+  const __m128i qp0_lo = qp0;
+  const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+  const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+  const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+  __m128i f8_lo =
+      _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+  f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+  f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+                        _mm_add_epi16(qp0_lo, pq0_lo));
+
+  // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+  // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+  *oqp2 = _mm_srli_epi16(f8_lo, 3);
+
+  // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+  // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+  f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+  *oqp1 = _mm_srli_epi16(f8_lo, 3);
+
+  // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+  // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+  f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+  *oqp0 = _mm_srli_epi16(f8_lo, 3);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal8(void* dest,
+                                                   ptrdiff_t stride8,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_flat_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+  const __m128i p3 = LoadLo8(dst - 4 * stride);
+  const __m128i p2 = LoadLo8(dst - 3 * stride);
+  const __m128i p1 = LoadLo8(dst - 2 * stride);
+  const __m128i p0 = LoadLo8(dst - 1 * stride);
+  const __m128i q0 = LoadLo8(dst + 0 * stride);
+  const __m128i q1 = LoadLo8(dst + 1 * stride);
+  const __m128i q2 = LoadLo8(dst + 2 * stride);
+  const __m128i q3 = LoadLo8(dst + 3 * stride);
+  const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+  const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+  const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+    StoreLo8(dst - 3 * stride, oqp2_f8);
+    StoreHi8(dst + 2 * stride, oqp2_f8);
+  }
+
+  StoreLo8(dst - 2 * stride, oqp1);
+  StoreLo8(dst - 1 * stride, oqp0);
+  StoreHi8(dst + 0 * stride, oqp0);
+  StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void TransposeLower4x8To8x4(const __m128i& x0, const __m128i& x1,
+                                   const __m128i& x2, const __m128i& x3,
+                                   const __m128i& x4, const __m128i& x5,
+                                   const __m128i& x6, const __m128i& x7,
+                                   __m128i* d0, __m128i* d1, __m128i* d2,
+                                   __m128i* d3) {
+  // input
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  // output
+  // d0 00 10 20 30 40 50 60 70
+  // d1 01 11 21 31 41 51 61 71
+  // d2 02 12 22 32 42 52 62 72
+  // d3 03 13 23 33 43 53 63 73
+
+  // 00 10 01 11 02 12 03 13
+  const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+  // 20 30 21 31 22 32 23 33
+  const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+  // 40 50 41 51 42 52 43 53
+  const __m128i w2 = _mm_unpacklo_epi16(x4, x5);
+  // 60 70 61 71 62 72 63 73
+  const __m128i w3 = _mm_unpacklo_epi16(x6, x7);
+
+  // 00 10 20 30 01 11 21 31
+  const __m128i w4 = _mm_unpacklo_epi32(w0, w1);
+  // 40 50 60 70 41 51 61 71
+  const __m128i w5 = _mm_unpacklo_epi32(w2, w3);
+  // 02 12 22 32 03 13 23 33
+  const __m128i w6 = _mm_unpackhi_epi32(w0, w1);
+  // 42 52 62 72 43 53 63 73
+  const __m128i w7 = _mm_unpackhi_epi32(w2, w3);
+
+  // 00 10 20 30 40 50 60 70
+  *d0 = _mm_unpacklo_epi64(w4, w5);
+  // 01 11 21 31 41 51 61 71
+  *d1 = _mm_unpackhi_epi64(w4, w5);
+  // 02 12 22 32 42 52 62 72
+  *d2 = _mm_unpacklo_epi64(w6, w7);
+  // 03 13 23 33 43 53 63 73
+  *d3 = _mm_unpackhi_epi64(w6, w7);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical8(void* dest, ptrdiff_t stride8,
+                                                 int outer_thresh,
+                                                 int inner_thresh,
+                                                 int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_flat_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+  __m128i x0 = LoadUnaligned16(dst - 4 + 0 * stride);
+  __m128i x1 = LoadUnaligned16(dst - 4 + 1 * stride);
+  __m128i x2 = LoadUnaligned16(dst - 4 + 2 * stride);
+  __m128i x3 = LoadUnaligned16(dst - 4 + 3 * stride);
+
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+  const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+  const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+  const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+    p2 = oqp2_f8;
+    q2 = _mm_srli_si128(oqp2_f8, 8);
+  }
+
+  p1 = oqp1;
+  p0 = oqp0;
+  q0 = _mm_srli_si128(oqp0, 8);
+  q1 = _mm_srli_si128(oqp1, 8);
+
+  TransposeLower4x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3);
+
+  StoreUnaligned16(dst - 4 + 0 * stride, x0);
+  StoreUnaligned16(dst - 4 + 1 * stride, x1);
+  StoreUnaligned16(dst - 4 + 2 * stride, x2);
+  StoreUnaligned16(dst - 4 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
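+// The 13-tap sum is seeded with 7 * p6 + 8, computed as (p6 << 3) - p6 plus
+// the rounding constant. The weights total 16, hence the >> 4; five
+// FilterAdd2Sub2 updates then walk the window from oqp5 down to oqp0.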
+inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4,
+                     const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+                     const __m128i& qp0, __m128i* oqp5, __m128i* oqp4,
+                     __m128i* oqp3, __m128i* oqp2, __m128i* oqp1,
+                     __m128i* oqp0) {
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i qp6_lo = qp6;
+  const __m128i qp5_lo = qp5;
+  const __m128i qp4_lo = qp4;
+  const __m128i qp3_lo = qp3;
+  const __m128i qp2_lo = qp2;
+  const __m128i qp1_lo = qp1;
+  const __m128i qp0_lo = qp0;
+  const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e);
+  const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e);
+  const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e);
+  const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+  const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+  const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+  __m128i f14_lo =
+      _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo));
+
+  f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo),
+                         _mm_add_epi16(qp5_lo, qp4_lo));
+
+  f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo),
+                         _mm_add_epi16(qp3_lo, qp2_lo));
+
+  f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo),
+                         _mm_add_epi16(qp0_lo, pq0_lo));
+
+  // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0
+  // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0
+  *oqp5 = _mm_srli_epi16(f14_lo, 4);
+
+  // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1
+  // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1
+  f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo);
+  *oqp4 = _mm_srli_epi16(f14_lo, 4);
+
+  // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2
+  // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2
+  f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo);
+  *oqp3 = _mm_srli_epi16(f14_lo, 4);
+
+  // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3
+  // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3
+  f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo);
+  *oqp2 = _mm_srli_epi16(f14_lo, 4);
+
+  // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4
+  // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4
+  f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo);
+  *oqp1 = _mm_srli_epi16(f14_lo, 4);
+
+  // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5
+  // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5
+  f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo);
+  *oqp0 = _mm_srli_epi16(f14_lo, 4);
+}
+
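+// Horizontal14 defers loading p6..p4 and q4..q6 until the narrow-filter mask
+// shows that at least one column needs filtering, keeping the common
+// nothing-to-filter path free of the six extra loads.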
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest,
+                                                    ptrdiff_t stride8,
+                                                    int outer_thresh,
+                                                    int inner_thresh,
+                                                    int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_flat_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+  const __m128i p3 = LoadLo8(dst - 4 * stride);
+  const __m128i p2 = LoadLo8(dst - 3 * stride);
+  const __m128i p1 = LoadLo8(dst - 2 * stride);
+  const __m128i p0 = LoadLo8(dst - 1 * stride);
+  const __m128i q0 = LoadLo8(dst + 0 * stride);
+  const __m128i q1 = LoadLo8(dst + 1 * stride);
+  const __m128i q2 = LoadLo8(dst + 2 * stride);
+  const __m128i q3 = LoadLo8(dst + 3 * stride);
+  const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+  const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+  const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    const __m128i p6 = LoadLo8(dst - 7 * stride);
+    const __m128i p5 = LoadLo8(dst - 6 * stride);
+    const __m128i p4 = LoadLo8(dst - 5 * stride);
+    const __m128i q4 = LoadLo8(dst + 4 * stride);
+    const __m128i q5 = LoadLo8(dst + 5 * stride);
+    const __m128i q6 = LoadLo8(dst + 6 * stride);
+    const __m128i qp6 = _mm_unpacklo_epi64(p6, q6);
+    const __m128i qp5 = _mm_unpacklo_epi64(p5, q5);
+    const __m128i qp4 = _mm_unpacklo_epi64(p4, q4);
+
+    const __m128i v_isflatouter4_mask =
+        IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+    const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
+    const __m128i v_flat4_mask =
+        _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo);
+
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+    if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
+      __m128i oqp5_f14;
+      __m128i oqp4_f14;
+      __m128i oqp3_f14;
+      __m128i oqp2_f14;
+      __m128i oqp1_f14;
+      __m128i oqp0_f14;
+
+      Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+               &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+      oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+      oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+      oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+      oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+      oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+      oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+
+      StoreLo8(dst - 6 * stride, oqp5_f14);
+      StoreLo8(dst - 5 * stride, oqp4_f14);
+      StoreLo8(dst - 4 * stride, oqp3_f14);
+
+      StoreHi8(dst + 3 * stride, oqp3_f14);
+      StoreHi8(dst + 4 * stride, oqp4_f14);
+      StoreHi8(dst + 5 * stride, oqp5_f14);
+    }
+
+    StoreLo8(dst - 3 * stride, oqp2_f8);
+    StoreHi8(dst + 2 * stride, oqp2_f8);
+  }
+
+  StoreLo8(dst - 2 * stride, oqp1);
+  StoreLo8(dst - 1 * stride, oqp0);
+  StoreHi8(dst + 0 * stride, oqp0);
+  StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void TransposeUpper4x8To8x4(const __m128i& x0, const __m128i& x1,
+                                   const __m128i& x2, const __m128i& x3,
+                                   const __m128i& x4, const __m128i& x5,
+                                   const __m128i& x6, const __m128i& x7,
+                                   __m128i* d0, __m128i* d1, __m128i* d2,
+                                   __m128i* d3) {
+  // input
+  // x0 xx xx xx xx 00 01 02 03
+  // x1 xx xx xx xx 10 11 12 13
+  // x2 xx xx xx xx 20 21 22 23
+  // x3 xx xx xx xx 30 31 32 33
+  // x4 xx xx xx xx 40 41 42 43
+  // x5 xx xx xx xx 50 51 52 53
+  // x6 xx xx xx xx 60 61 62 63
+  // x7 xx xx xx xx 70 71 72 73
+  // output
+  // d0 00 10 20 30 40 50 60 70
+  // d1 01 11 21 31 41 51 61 71
+  // d2 02 12 22 32 42 52 62 72
+  // d3 03 13 23 33 43 53 63 73
+
+  // 00 10 01 11 02 12 03 13
+  const __m128i w0 = _mm_unpackhi_epi16(x0, x1);
+  // 20 30 21 31 22 32 23 33
+  const __m128i w1 = _mm_unpackhi_epi16(x2, x3);
+  // 40 50 41 51 42 52 43 53
+  const __m128i w2 = _mm_unpackhi_epi16(x4, x5);
+  // 60 70 61 71 62 72 63 73
+  const __m128i w3 = _mm_unpackhi_epi16(x6, x7);
+
+  // 00 10 20 30 01 11 21 31
+  const __m128i w4 = _mm_unpacklo_epi32(w0, w1);
+  // 40 50 60 70 41 51 61 71
+  const __m128i w5 = _mm_unpacklo_epi32(w2, w3);
+  // 02 12 22 32 03 13 23 33
+  const __m128i w6 = _mm_unpackhi_epi32(w0, w1);
+  // 42 52 62 72 43 53 63 73
+  const __m128i w7 = _mm_unpackhi_epi32(w2, w3);
+
+  // 00 10 20 30 40 50 60 70
+  *d0 = _mm_unpacklo_epi64(w4, w5);
+  // 01 11 21 31 41 51 61 71
+  *d1 = _mm_unpackhi_epi64(w4, w5);
+  // 02 12 22 32 42 52 62 72
+  *d2 = _mm_unpacklo_epi64(w6, w7);
+  // 03 13 23 33 43 53 63 73
+  *d3 = _mm_unpackhi_epi64(w6, w7);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8,
+                                                  int outer_thresh,
+                                                  int inner_thresh,
+                                                  int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_flat_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+  // p7 p6 p5 p4 p3 p2 p1 p0  q0 q1 q2 q3 q4 q5 q6 q7
+  //
+  // 00 01 02 03 04 05 06 07  08 09 0a 0b 0c 0d 0e 0f
+  // 10 11 12 13 14 15 16 17  18 19 1a 1b 1c 1d 1e 1f
+  // 20 21 22 23 24 25 26 27  28 29 2a 2b 2c 2d 2e 2f
+  // 30 31 32 33 34 35 36 37  38 39 3a 3b 3c 3d 3e 3f
+
+  __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride);
+  __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride);
+  __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride);
+  __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride);
+
+  __m128i p7, p6, p5, p4, p3, p2, p1, p0;
+  __m128i q7, q6, q5, q4, q3, q2, q1, q0;
+
+  Transpose8x4To4x8(x0, x1, x2, x3, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0);
+
+  x0 = LoadUnaligned16(dst - 8 + 8 + 0 * stride);
+  x1 = LoadUnaligned16(dst - 8 + 8 + 1 * stride);
+  x2 = LoadUnaligned16(dst - 8 + 8 + 2 * stride);
+  x3 = LoadUnaligned16(dst - 8 + 8 + 3 * stride);
+
+  Transpose8x4To4x8(x0, x1, x2, x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+
+  __m128i qp7 = _mm_unpacklo_epi64(p7, q7);
+  __m128i qp6 = _mm_unpacklo_epi64(p6, q6);
+  __m128i qp5 = _mm_unpacklo_epi64(p5, q5);
+  __m128i qp4 = _mm_unpacklo_epi64(p4, q4);
+  __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+  __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+  __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+  __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+  const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    const __m128i v_isflatouter4_mask =
+        IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+    const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
+    const __m128i v_flat4_mask =
+        _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo);
+
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+    if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
+      __m128i oqp5_f14;
+      __m128i oqp4_f14;
+      __m128i oqp3_f14;
+      __m128i oqp2_f14;
+      __m128i oqp1_f14;
+      __m128i oqp0_f14;
+
+      Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+               &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+      oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+      oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+      oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+      oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+      oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+      oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+      qp3 = oqp3_f14;
+      qp4 = oqp4_f14;
+      qp5 = oqp5_f14;
+    }
+    qp2 = oqp2_f8;
+  }
+
+  TransposeLower4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1,
+                         &x2, &x3);
+
+  StoreUnaligned16(dst - 8 + 0 * stride, x0);
+  StoreUnaligned16(dst - 8 + 1 * stride, x1);
+  StoreUnaligned16(dst - 8 + 2 * stride, x2);
+  StoreUnaligned16(dst - 8 + 3 * stride, x3);
+
+  TransposeUpper4x8To8x4(oqp0, oqp1, qp2, qp3, qp4, qp5, qp6, qp7, &x0, &x1,
+                         &x2, &x3);
+
+  StoreUnaligned16(dst - 8 + 8 + 0 * stride, x0);
+  StoreUnaligned16(dst - 8 + 8 + 1 * stride, x1);
+  StoreUnaligned16(dst - 8 + 8 + 2 * stride, x2);
+  StoreUnaligned16(dst - 8 + 8 + 3 * stride, x3);
+}
+
+using Defs10bpp = LoopFilterFuncs_SSE4_1<kBitdepth10>;
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal4;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal6;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal8;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal14;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical)
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical4;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical)
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical6;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical)
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical8;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical)
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical14;
+#endif
+}
+#endif
+}  // namespace
+}  // namespace high_bitdepth
+
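+// Populates the loop filter entries of the SSE4.1 DSP tables. Each entry is
+// compiled in only when the matching LIBGAV1_Dsp*bpp_* define (see
+// loop_filter_sse4.h) resolves to LIBGAV1_CPU_SSE4_1, so a baseline or
+// higher-ISA implementation keeps precedence.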
+void LoopFilterInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/loop_filter_sse4.h b/src/dsp/x86/loop_filter_sse4.h
new file mode 100644 (file)
index 0000000..4795d8b
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters; see the defines below for specifics. This
+// function is not thread-safe.
+void LoopFilterInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If SSE4.1 is targeted and the baseline has not already been claimed by a
+// higher level of optimization, signal that the SSE4.1 implementation should
+// be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
diff --git a/src/dsp/x86/loop_restoration_10bit_avx2.cc b/src/dsp/x86/loop_restoration_10bit_avx2.cc
new file mode 100644 (file)
index 0000000..daf5c42
--- /dev/null
@@ -0,0 +1,3163 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
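+// Keeps the horizontal results in int16_t, clamped to
+// [-offset, limit - offset]. Assuming libgav1's usual constants
+// (kWienerFilterBits = 7, kInterRoundBitsHorizontal = 3), that range is
+// [-8192, 24575], a bound the vertical pass depends on to keep its 16-bit
+// multiply-adds from overflowing.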
+inline void WienerHorizontalClip(const __m256i s[2],
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
+  const __m256i offsets = _mm256_set1_epi16(-offset);
+  const __m256i limits = _mm256_set1_epi16(limit - offset);
+  const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+  const __m256i sum0 = _mm256_add_epi32(s[0], round);
+  const __m256i sum1 = _mm256_add_epi32(s[1], round);
+  const __m256i rounded_sum0 =
+      _mm256_srai_epi32(sum0, kInterRoundBitsHorizontal);
+  const __m256i rounded_sum1 =
+      _mm256_srai_epi32(sum1, kInterRoundBitsHorizontal);
+  const __m256i rounded_sum = _mm256_packs_epi32(rounded_sum0, rounded_sum1);
+  const __m256i d0 = _mm256_max_epi16(rounded_sum, offsets);
+  const __m256i d1 = _mm256_min_epi16(d0, limits);
+  StoreAligned32(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7Kernel(const __m256i s[7],
+                                       const __m256i filter[2],
+                                       int16_t* const wiener_buffer) {
+  const __m256i s06 = _mm256_add_epi16(s[0], s[6]);
+  const __m256i s15 = _mm256_add_epi16(s[1], s[5]);
+  const __m256i s24 = _mm256_add_epi16(s[2], s[4]);
+  const __m256i ss0 = _mm256_unpacklo_epi16(s06, s15);
+  const __m256i ss1 = _mm256_unpackhi_epi16(s06, s15);
+  const __m256i ss2 = _mm256_unpacklo_epi16(s24, s[3]);
+  const __m256i ss3 = _mm256_unpackhi_epi16(s24, s[3]);
+  __m256i madds[4];
+  madds[0] = _mm256_madd_epi16(ss0, filter[0]);
+  madds[1] = _mm256_madd_epi16(ss1, filter[0]);
+  madds[2] = _mm256_madd_epi16(ss2, filter[1]);
+  madds[3] = _mm256_madd_epi16(ss3, filter[1]);
+  madds[0] = _mm256_add_epi32(madds[0], madds[2]);
+  madds[1] = _mm256_add_epi32(madds[1], madds[3]);
+  WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[5], const __m256i filter,
+                                       int16_t* const wiener_buffer) {
+  const __m256i s04 = _mm256_add_epi16(s[0], s[4]);
+  const __m256i s13 = _mm256_add_epi16(s[1], s[3]);
+  const __m256i s2d = _mm256_add_epi16(s[2], s[2]);
+  const __m256i s0m = _mm256_sub_epi16(s04, s2d);
+  const __m256i s1m = _mm256_sub_epi16(s13, s2d);
+  const __m256i ss0 = _mm256_unpacklo_epi16(s0m, s1m);
+  const __m256i ss1 = _mm256_unpackhi_epi16(s0m, s1m);
+  __m256i madds[2];
+  madds[0] = _mm256_madd_epi16(ss0, filter);
+  madds[1] = _mm256_madd_epi16(ss1, filter);
+  const __m256i s2_lo = _mm256_unpacklo_epi16(s[2], _mm256_setzero_si256());
+  const __m256i s2_hi = _mm256_unpackhi_epi16(s[2], _mm256_setzero_si256());
+  const __m256i s2x128_lo = _mm256_slli_epi32(s2_lo, 7);
+  const __m256i s2x128_hi = _mm256_slli_epi32(s2_hi, 7);
+  madds[0] = _mm256_add_epi32(madds[0], s2x128_lo);
+  madds[1] = _mm256_add_epi32(madds[1], s2x128_hi);
+  WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[3], const __m256i filter,
+                                       int16_t* const wiener_buffer) {
+  const __m256i s02 = _mm256_add_epi16(s[0], s[2]);
+  const __m256i ss0 = _mm256_unpacklo_epi16(s02, s[1]);
+  const __m256i ss1 = _mm256_unpackhi_epi16(s02, s[1]);
+  __m256i madds[2];
+  madds[0] = _mm256_madd_epi16(ss0, filter);
+  madds[1] = _mm256_madd_epi16(ss1, filter);
+  WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i* const coefficients,
+                                 int16_t** const wiener_buffer) {
+  __m256i filter[2];
+  filter[0] = _mm256_shuffle_epi32(*coefficients, 0x0);
+  filter[1] = _mm256_shuffle_epi32(*coefficients, 0x55);
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i s[7];
+      s[0] = LoadUnaligned32(src + x + 0);
+      s[1] = LoadUnaligned32(src + x + 1);
+      s[2] = LoadUnaligned32(src + x + 2);
+      s[3] = LoadUnaligned32(src + x + 3);
+      s[4] = LoadUnaligned32(src + x + 4);
+      s[5] = LoadUnaligned32(src + x + 5);
+      s[6] = LoadUnaligned32(src + x + 6);
+      WienerHorizontalTap7Kernel(s, filter, *wiener_buffer + x);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i* const coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m256i filter =
+      _mm256_shuffle_epi8(*coefficients, _mm256_set1_epi32(0x05040302));
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i s[5];
+      s[0] = LoadUnaligned32(src + x + 0);
+      s[1] = LoadUnaligned32(src + x + 1);
+      s[2] = LoadUnaligned32(src + x + 2);
+      s[3] = LoadUnaligned32(src + x + 3);
+      s[4] = LoadUnaligned32(src + x + 4);
+      WienerHorizontalTap5Kernel(s, filter, *wiener_buffer + x);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i* const coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m256i filter = _mm256_shuffle_epi32(*coefficients, 0x55);
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i s[3];
+      s[0] = LoadUnaligned32(src + x + 0);
+      s[1] = LoadUnaligned32(src + x + 1);
+      s[2] = LoadUnaligned32(src + x + 2);
+      WienerHorizontalTap3Kernel(s, filter, *wiener_buffer + x);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      const __m256i s0 = LoadUnaligned32(src + x);
+      const __m256i d0 = _mm256_slli_epi16(s0, 4);
+      StoreAligned32(*wiener_buffer + x, d0);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline __m256i WienerVertical7(const __m256i a[4], const __m256i filter[4]) {
+  const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+  const __m256i madd3 = _mm256_madd_epi16(a[3], filter[3]);
+  const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+  const __m256i madd23 = _mm256_add_epi32(madd2, madd3);
+  const __m256i sum = _mm256_add_epi32(madd01, madd23);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[3], const __m256i filter[3]) {
+  const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+  const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+  const __m256i sum = _mm256_add_epi32(madd01, madd2);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a[2], const __m256i filter[2]) {
+  const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i sum = _mm256_add_epi32(madd0, madd1);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalClip(const __m256i s[2]) {
+  const __m256i d = _mm256_packus_epi32(s[0], s[1]);
+  return _mm256_min_epu16(d, _mm256_set1_epi16(1023));
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+                                     const __m256i filter[2]) {
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m256i b[4], c[2];
+  b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm256_unpacklo_epi16(a[4], a[5]);
+  b[3] = _mm256_unpacklo_epi16(a[6], round);
+  c[0] = WienerVertical7(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm256_unpackhi_epi16(a[4], a[5]);
+  b[3] = _mm256_unpackhi_epi16(a[6], round);
+  c[1] = WienerVertical7(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+                                     const __m256i filter[3]) {
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m256i b[3], c[2];
+  b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm256_unpacklo_epi16(a[4], round);
+  c[0] = WienerVertical5(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm256_unpackhi_epi16(a[4], round);
+  c[1] = WienerVertical5(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3],
+                                     const __m256i filter[2]) {
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m256i b[2], c[2];
+  b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm256_unpacklo_epi16(a[2], round);
+  c[0] = WienerVertical3(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm256_unpackhi_epi16(a[2], round);
+  c[1] = WienerVertical3(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[2], __m256i a[7]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+  a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+  a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+  return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[3], __m256i a[5]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+  return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[2], __m256i a[3]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[2], __m256i d[2]) {
+  __m256i a[8];
+  d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+  d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[3], __m256i d[2]) {
+  __m256i a[6];
+  d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+  d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[2], __m256i d[2]) {
+  __m256i a[4];
+  d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+  __m256i filter[4];
+  filter[0] = _mm256_shuffle_epi32(c, 0x0);
+  filter[1] = _mm256_shuffle_epi32(c, 0x55);
+  filter[2] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
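+  // filter[3] packs (coefficients[0], 1) pairs so that the madd against the
+  // (a[6], round) pairs built in WienerVerticalFilter7() computes
+  // a6 * c0 + round, folding the rounding offset into the multiply-add.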
+  filter[3] =
+      _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2];
+      WienerVerticalTap7Kernel2(wiener_buffer + x, width, filter, d);
+      StoreUnaligned32(dst + x, d[0]);
+      StoreUnaligned32(dst + dst_stride + x, d[1]);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[7];
+      const __m256i d =
+          WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+      StoreUnaligned32(dst + x, d);
+      x += 16;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[3], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+  __m256i filter[3];
+  filter[0] = _mm256_shuffle_epi32(c, 0x0);
+  filter[1] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+  filter[2] =
+      _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2];
+      WienerVerticalTap5Kernel2(wiener_buffer + x, width, filter, d);
+      StoreUnaligned32(dst + x, d[0]);
+      StoreUnaligned32(dst + dst_stride + x, d[1]);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[5];
+      const __m256i d =
+          WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+      StoreUnaligned32(dst + x, d);
+      x += 16;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[2], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  __m256i filter[2];
+  filter[0] =
+      _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+  filter[1] =
+      _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2][2];
+      WienerVerticalTap3Kernel2(wiener_buffer + x, width, filter, d[0]);
+      StoreUnaligned32(dst + x, d[0][0]);
+      StoreUnaligned32(dst + dst_stride + x, d[0][1]);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[3];
+      const __m256i d =
+          WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+      StoreUnaligned32(dst + x, d);
+      x += 16;
+    } while (x < width);
+  }
+}
+
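+// WienerHorizontalTap1() stores pixels shifted left by 4, so a rounding
+// shift right by 4 and a clamp to the 10-bit range [0, 1023] suffice here.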
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint16_t* const dst) {
+  const __m256i a = LoadAligned32(wiener_buffer);
+  const __m256i b = _mm256_add_epi16(a, _mm256_set1_epi16(8));
+  const __m256i c = _mm256_srai_epi16(b, 4);
+  const __m256i d = _mm256_max_epi16(c, _mm256_setzero_si256());
+  const __m256i e = _mm256_min_epi16(d, _mm256_set1_epi16(1023));
+  StoreUnaligned32(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint16_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      x += 16;
+    } while (x < width);
+  }
+}
+
+void WienerFilter_AVX2(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // Horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  const __m128i c =
+      LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+  const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c);
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, &coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, &coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, &coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // Vertical filtering.
+  // Over-writes up to 15 values.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  auto* dst = static_cast<uint16_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical + 1, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical + 2, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// SIMD loads over-read (number of pixels held in a SIMD register) -
+// (width % 8) - 2 * padding pixels, where padding is 3 for Pass 1 and 2 for
+// Pass 2. A SIMD register holds 16 bytes for SSE4.1 and 32 bytes for AVX2.
+constexpr int kOverreadInBytesPass1_128 = 4;
+constexpr int kOverreadInBytesPass2_128 = 8;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m256i dst[2]) {
+  dst[0] = LoadAligned32(src[0] + x);
+  dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[2]) {
+  dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+  dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+  dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m256i dst[3]) {
+  dst[0] = LoadAligned32(src[0] + x);
+  dst[1] = LoadAligned32(src[1] + x);
+  dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[3]) {
+  dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+  dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+  dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+  dst[0] = LoadAligned16(src + 0);
+  dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m256i dst[2][2]) {
+  LoadAligned64(src[0] + x, dst[0]);
+  LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[2][2]) {
+  LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+  LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+  LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m256i dst[3][2]) {
+  LoadAligned64(src[0] + x, dst[0]);
+  LoadAligned64(src[1] + x, dst[1]);
+  LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[3][2]) {
+  LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+  LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+  LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 4, src[1]);
+}
+
+// The AVX2 ymm register holds ma[0], ma[1], ..., ma[7], and ma[16], ma[17],
+// ..., ma[23].
+// There is an 8 pixel gap between the first half and the second half.
+constexpr int kMaStoreOffset = 8;
+
+inline void StoreAligned32_ma(uint16_t* dst, const __m256i v) {
+  StoreAligned16(dst + 0 * 8, _mm256_extracti128_si256(v, 0));
+  StoreAligned16(dst + 2 * 8, _mm256_extracti128_si256(v, 1));
+}
+
+inline void StoreAligned64_ma(uint16_t* dst, const __m256i v[2]) {
+  // The next 4 lines are much faster than:
+  // StoreAligned32(dst + 0, _mm256_permute2x128_si256(v[0], v[1], 0x20));
+  // StoreAligned32(dst + 16, _mm256_permute2x128_si256(v[0], v[1], 0x31));
+  StoreAligned16(dst + 0 * 8, _mm256_extracti128_si256(v[0], 0));
+  StoreAligned16(dst + 1 * 8, _mm256_extracti128_si256(v[1], 0));
+  StoreAligned16(dst + 2 * 8, _mm256_extracti128_si256(v[0], 1));
+  StoreAligned16(dst + 3 * 8, _mm256_extracti128_si256(v[1], 1));
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers generate very inefficient code for them, which
+// can make the whole decoder 15% slower.
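+
+// The Vaddl/Vaddw/Vmull/Vrshr helpers below mirror the NEON instructions of
+// the same names, operating independently within each 128-bit lane.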
+
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+  const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, s1);
+}
+
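+// Rounded right shift: (src0 + (1 << (src1 - 1))) >> src1.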
+inline __m128i VrshrU16(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
+  return _mm_srli_epi16(sum, src1);
+}
+
+inline __m256i VrshrU16(const __m256i src0, const int src1) {
+  const __m256i sum =
+      _mm256_add_epi16(src0, _mm256_set1_epi16(1 << (src1 - 1)));
+  return _mm256_srli_epi16(sum, src1);
+}
+
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+  const __m256i sum =
+      _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+  return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+  const __m256i sum =
+      _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+  return _mm256_srli_epi32(sum, src1);
+}
+
+inline void Square(const __m128i src, __m128i dst[2]) {
+  const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
+  dst[0] = _mm_madd_epi16(s0, s0);
+  dst[1] = _mm_madd_epi16(s1, s1);
+}
+
+inline void Square(const __m256i src, __m256i dst[2]) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi16(src, _mm256_setzero_si256());
+  dst[0] = _mm256_madd_epi16(s0, s0);
+  dst[1] = _mm256_madd_epi16(s1, s1);
+}
+
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+  dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+  dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+  dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare3_32(const __m256i src[2], __m256i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm256_alignr_epi8(src[1], src[0], 4);
+  dst[2] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_32(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
+  dst[4] = src[1];
+}
+
+inline void Prepare5_32(const __m256i src[2], __m256i dst[5]) {
+  Prepare3_32(src, dst);
+  dst[3] = _mm256_alignr_epi8(src[1], src[0], 12);
+  dst[4] = src[1];
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi16(src0, src1);
+  return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+                       const __m256i src2) {
+  const __m256i sum = _mm256_add_epi16(src0, src1);
+  return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi32(src0, src1);
+  return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+                       const __m256i src2) {
+  const __m256i sum = _mm256_add_epi32(src0, src1);
+  return _mm256_add_epi32(sum, src2);
+}
+
+inline __m128i Sum3_32(const __m128i src[3]) {
+  return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_32(const __m256i src[3]) {
+  return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+  const __m256i sum = VaddlLo8(src[0], src[1]);
+  return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+  const __m256i sum = VaddlHi8(src[0], src[1]);
+  return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+  const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+  const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+  const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+  const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+  const __m256i sum = _mm256_add_epi16(sum01, sum23);
+  return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+                       const __m128i* const src2, const __m128i* const src3,
+                       const __m128i* const src4) {
+  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+  const __m128i sum = _mm_add_epi32(sum01, sum23);
+  return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+                       const __m256i* const src2, const __m256i* const src3,
+                       const __m256i* const src4) {
+  const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+  const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+  const __m256i sum = _mm256_add_epi32(sum01, sum23);
+  return _mm256_add_epi32(sum, *src4);
+}
+
+inline __m128i Sum5_32(const __m128i src[5]) {
+  return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline __m256i Sum5_32(const __m256i src[5]) {
+  return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum3Horizontal16(const __m128i src[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  return Sum3_16(s);
+}
+
+inline __m256i Sum3Horizontal16(const uint16_t* const src,
+                                const ptrdiff_t over_read_in_bytes) {
+  __m256i s[3];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+  return Sum3_16(s);
+}
+
+inline __m128i Sum5Horizontal16(const __m128i src[2]) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  return Sum5_16(s);
+}
+
+inline __m256i Sum5Horizontal16(const uint16_t* const src,
+                                const ptrdiff_t over_read_in_bytes) {
+  __m256i s[5];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+  s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6);
+  s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8);
+  return Sum5_16(s);
+}
+
+inline void SumHorizontal16(const uint16_t* const src,
+                            const ptrdiff_t over_read_in_bytes,
+                            __m256i* const row3, __m256i* const row5) {
+  __m256i s[5];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+  s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6);
+  s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8);
+  const __m256i sum04 = _mm256_add_epi16(s[0], s[4]);
+  *row3 = Sum3_16(s + 1);
+  *row5 = _mm256_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const uint16_t* const src,
+                            const ptrdiff_t over_read_in_bytes,
+                            __m256i* const row3_0, __m256i* const row3_1,
+                            __m256i* const row5_0, __m256i* const row5_1) {
+  SumHorizontal16(src + 0, over_read_in_bytes + 0, row3_0, row5_0);
+  SumHorizontal16(src + 16, over_read_in_bytes + 32, row3_1, row5_1);
+}
+
+inline void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
+                            __m128i* const row_sq5) {
+  const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
+  *row_sq3 = Sum3_32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m256i src[5], __m256i* const row_sq3,
+                            __m256i* const row_sq5) {
+  const __m256i sum04 = _mm256_add_epi32(src[0], src[4]);
+  *row_sq3 = Sum3_32(src + 1);
+  *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
+                            __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+                            __m128i* const row_sq5_1) {
+  __m128i s[5];
+  Prepare5_32(src + 0, s);
+  SumHorizontal32(s, row_sq3_0, row_sq5_0);
+  Prepare5_32(src + 1, s);
+  SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal32(const __m256i src[3], __m256i* const row_sq3_0,
+                            __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+                            __m256i* const row_sq5_1) {
+  __m256i s[5];
+  Prepare5_32(src + 0, s);
+  SumHorizontal32(s, row_sq3_0, row_sq5_0);
+  Prepare5_32(src + 1, s);
+  SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum3_32(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum3_32(s);
+}
+
+inline void Sum3Horizontal32(const __m256i src[3], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum3_32(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum3_32(s);
+}
+
+inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[5];
+  Prepare5_32(src + 0, s);
+  dst[0] = Sum5_32(s);
+  Prepare5_32(src + 1, s);
+  dst[1] = Sum5_32(s);
+}
+
+inline void Sum5Horizontal32(const __m256i src[3], __m256i dst[2]) {
+  __m256i s[5];
+  Prepare5_32(src + 0, s);
+  dst[0] = Sum5_32(s);
+  Prepare5_32(src + 1, s);
+  dst[1] = Sum5_32(s);
+}
+
+inline void SumHorizontal16(const __m128i src[2], __m128i* const row3,
+                            __m128i* const row5) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
+  *row3 = Sum3_16(s + 1);
+  *row5 = _mm_add_epi16(sum04, *row3);
+}
+
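+// Sum343 computes the weighted sum 3 * a + 4 * b + 3 * c as
+// 3 * (a + b + c) + b.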
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+  const __m256i sum = Sum3WLo16(ma3);
+  const __m256i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+  const __m256i sum = Sum3WHi16(ma3);
+  const __m256i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343(const __m256i src[3]) {
+  const __m256i sum = Sum3_32(src);
+  const __m256i sum3 = Sum3_32(sum, sum, sum);
+  return _mm256_add_epi32(sum3, src[1]);
+}
+
+inline void Sum343(const __m256i src[3], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum343(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum343(s);
+}
+
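+// Sum565 computes the weighted sum 5 * a + 6 * b + 5 * c as
+// 5 * (a + b + c) + b.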
+inline __m256i Sum565Lo(const __m256i src[3]) {
+  const __m256i sum = Sum3WLo16(src);
+  const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+  const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+  const __m256i sum = Sum3WHi16(src);
+  const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+  const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565(const __m256i src[3]) {
+  const __m256i sum = Sum3_32(src);
+  const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+  const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+  return _mm256_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m256i src[3], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum565(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum565(s);
+}
+
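+// Computes two rows of 3x3 and 5x5 box sums and squared box sums: the first
+// 8 positions with 128-bit code, then 32 positions per AVX2 iteration.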
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  const ptrdiff_t overread_in_bytes_128 =
+      kOverreadInBytesPass1_128 - sizeof(*src) * width;
+  const ptrdiff_t overread_in_bytes_256 =
+      kOverreadInBytesPass1_256 - sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s0[2], sq_128[4], s3, s5, sq3[2], sq5[2];
+    __m256i sq[8];
+    s0[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0);
+    s0[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+    Square(s0[0], sq_128 + 0);
+    Square(s0[1], sq_128 + 2);
+    SumHorizontal16(s0, &s3, &s5);
+    StoreAligned16(sum3, s3);
+    StoreAligned16(sum5, s5);
+    SumHorizontal32(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+    StoreAligned32U32(square_sum3, sq3);
+    StoreAligned32U32(square_sum5, sq5);
+    src += 8;
+    sum3 += 8;
+    sum5 += 8;
+    square_sum3 += 8;
+    square_sum5 += 8;
+    sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+    sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m256i s[2], row3[2], row5[2], row_sq3[2], row_sq5[2];
+      s[0] = LoadUnaligned32Msan(
+          src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+      s[1] = LoadUnaligned32Msan(
+          src + 24,
+          overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+      Square(s[0], sq + 2);
+      Square(s[1], sq + 6);
+      sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+      sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+      sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+      sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+      SumHorizontal16(
+          src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8),
+          &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned64(sum3, row3);
+      StoreAligned64(sum5, row5);
+      SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned64(square_sum3 + 0, row_sq3);
+      StoreAligned64(square_sum5 + 0, row_sq5);
+      SumHorizontal32(sq + 4, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned64(square_sum3 + 16, row_sq3);
+      StoreAligned64(square_sum5 + 16, row_sq5);
+      sq[0] = sq[6];
+      sq[1] = sq[7];
+      src += 32;
+      sum3 += 32;
+      sum5 += 32;
+      square_sum3 += 32;
+      square_sum5 += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sum3 += sum_stride - sum_width - 8;
+    sum5 += sum_stride - sum_width - 8;
+    square_sum3 += sum_stride - sum_width - 8;
+    square_sum5 += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  int overread_in_bytes_128, overread_in_bytes_256;
+  if (size == 3) {
+    overread_in_bytes_128 = kOverreadInBytesPass2_128;
+    overread_in_bytes_256 = kOverreadInBytesPass2_256;
+  } else {
+    overread_in_bytes_128 = kOverreadInBytesPass1_128;
+    overread_in_bytes_256 = kOverreadInBytesPass1_256;
+  }
+  overread_in_bytes_128 -= sizeof(*src) * width;
+  overread_in_bytes_256 -= sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s_128[2], ss, sq_128[4], sqs[2];
+    __m256i sq[8];
+    s_128[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128);
+    s_128[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+    Square(s_128[0], sq_128 + 0);
+    Square(s_128[1], sq_128 + 2);
+    if (size == 3) {
+      ss = Sum3Horizontal16(s_128);
+      Sum3Horizontal32(sq_128, sqs);
+    } else {
+      ss = Sum5Horizontal16(s_128);
+      Sum5Horizontal32(sq_128, sqs);
+    }
+    StoreAligned16(sums, ss);
+    StoreAligned32U32(square_sums, sqs);
+    src += 8;
+    sums += 8;
+    square_sums += 8;
+    sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+    sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m256i s[2], row[2], row_sq[4];
+      s[0] = LoadUnaligned32Msan(
+          src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+      s[1] = LoadUnaligned32Msan(
+          src + 24,
+          overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+      Square(s[0], sq + 2);
+      Square(s[1], sq + 6);
+      sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+      sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+      sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+      sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+      if (size == 3) {
+        row[0] = Sum3Horizontal16(
+            src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+        row[1] =
+            Sum3Horizontal16(src + 16, overread_in_bytes_256 +
+                                           sizeof(*src) * (sum_width - x + 24));
+        Sum3Horizontal32(sq + 0, row_sq + 0);
+        Sum3Horizontal32(sq + 4, row_sq + 2);
+      } else {
+        row[0] = Sum5Horizontal16(
+            src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+        row[1] =
+            Sum5Horizontal16(src + 16, overread_in_bytes_256 +
+                                           sizeof(*src) * (sum_width - x + 24));
+        Sum5Horizontal32(sq + 0, row_sq + 0);
+        Sum5Horizontal32(sq + 4, row_sq + 2);
+      }
+      StoreAligned64(sums, row);
+      StoreAligned64(square_sums + 0, row_sq + 0);
+      StoreAligned64(square_sums + 16, row_sq + 2);
+      sq[0] = sq[6];
+      sq[1] = sq[7];
+      src += 32;
+      sums += 32;
+      square_sums += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sums += sum_stride - sum_width - 8;
+    square_sums += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
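+  // n == 9:  sum_sq * 9  == sum_sq + (sum_sq << 3)
+  // n == 25: sum_sq * 25 == sum_sq + (sum_sq << 3) + (sum_sq << 4)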
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
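+  // 10-bit box sums are roughly 4x larger than in the 8-bit path, so |sum|
+  // is pre-scaled by 1/4 and |sum_sq| by 1/16 here to keep the intermediate
+  // products of the kernel above in range.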
+  const __m128i b = VrshrU16(sum, 2);
+  const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m256i dxd = _mm256_madd_epi16(sum, sum);
+  // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+  __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+  const __m256i sub = _mm256_sub_epi32(axn, dxd);
+  const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+  const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m256i b = VrshrU16(sum, 2);
+  const __m256i sum_lo = _mm256_unpacklo_epi16(b, _mm256_setzero_si256());
+  const __m256i sum_hi = _mm256_unpackhi_epi16(b, _mm256_setzero_si256());
+  const __m256i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+  const __m256i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+  return _mm256_packus_epi32(z0, z1);
+}
+
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
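+  // The 16-bit |ma| lanes are zero-extended bytes, so _mm_maddubs_epi16()
+  // with (41, 0) byte pairs computes ma * 41 without overflow; the missing
+  // factor of 4 in one_over_n is folded into the smaller final shift.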
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB5(const __m256i sum, const __m256i ma, __m256i b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m256i m =
+      _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+  const __m256i m0 = VmullLo16(m, sum);
+  const __m256i m1 = VmullHi16(m, sum);
+  b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+  b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateB3(const __m256i sum, const __m256i ma, __m256i b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m256i m0 = VmullLo16(ma, sum);
+  const __m256i m1 = VmullHi16(ma, sum);
+  const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+  const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+  b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+  b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // The values are not actually stored and reloaded: the compiler keeps them
+  // in a 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+  // b = ma * |sum| * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  if (n == 9) {
+    CalculateB3(sum, maq, b);
+  } else {
+    CalculateB5(sum, maq, b);
+  }
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
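+// _mm256_shuffle_epi8() shuffles within each 128-bit lane, so each 16-byte
+// row is duplicated to let both lanes use the same shuffle indices.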
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5};
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction, or from the sign bit of the index.
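+// For example, with index 20 the comparison yields 0xff, the OR keeps the
+// most significant bit set, and the shuffle then writes 0 for that lane.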
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+  __m128i mask;
+  mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+  mask = _mm_or_si128(mask, index);
+  return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+  __m256i mask;
+  mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+  mask = _mm256_or_si256(mask, index);
+  return _mm256_shuffle_epi8(table, mask);
+}
+
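+// The comparison yields -1 (all ones) in lanes where index > threshold, so
+// adding it to |value| decrements by 1 in exactly those lanes.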
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+                           const int threshold) {
+  const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+  const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+  return _mm_add_epi8(value, offset);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+                           const int threshold) {
+  const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+  const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+  return _mm256_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i* const ma, __m128i b0[2],
+                                  __m128i b1[2]) {
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+  const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+  const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+  const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+  __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // The three shuffles below produce 0 for elements whose indices are 48 or
+  // larger; those elements are computed afterwards.
+  // Get shuffle results for indices in range [0, 15].
+  *ma = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  *ma = _mm_or_si128(*ma, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res2 = ShuffleIndex(c2, idx);
+  *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, the table values change
+  // only rarely as the index increases, so we compute them with comparison
+  // and arithmetic operations instead.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[0], maq0, b0);
+  const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[1], maq1, b1);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+                                  __m256i ma[3], __m256i b0[2], __m256i b1[2]) {
+  static_assert(n == 9 || n == 25, "");
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+  const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+  const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+  const __m256i indices = _mm256_packus_epi16(index[0], index[1]);  // 0 2 1 3
+  __m256i idx, mas;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+  // Elements whose indices are 48 or larger are set to 0 by the shuffles.
+  // Get shuffle results for indices in range [0, 15].
+  mas = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  mas = _mm256_or_si256(mas, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res2 = ShuffleIndex(c2, idx);
+  mas = _mm256_or_si256(mas, res2);
+
+  // For elements whose indices are larger than 47, the lookup values change
+  // only rarely as the index increases, so we use comparison and arithmetic
+  // operations to calculate them.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+  mas = AdjustValue(mas, idx, 55);   // 55 is the last index whose value is 5.
+  mas = AdjustValue(mas, idx, 72);   // 72 is the last index whose value is 4.
+  mas = AdjustValue(mas, idx, 101);  // 101 is the last index whose value is 3.
+  mas = AdjustValue(mas, idx, 169);  // 169 is the last index whose value is 2.
+  mas = AdjustValue(mas, idx, 254);  // 254 is the last index whose value is 1.
+
+  ma[2] = _mm256_permute4x64_epi64(mas, 0x63);     // 32-39 8-15 16-23 24-31
+  ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc);  //  0-7  8-15 16-23 24-31
+  ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+  const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+  __m256i sums[2];
+  sums[0] = _mm256_permute2x128_si256(sum[0], sum[1], 0x20);
+  sums[1] = _mm256_permute2x128_si256(sum[0], sum[1], 0x31);
+  if (n == 9) {
+    CalculateB3(sums[0], maq0, b0);
+    CalculateB3(sums[1], maq1, b1);
+  } else {
+    CalculateB5(sums[0], maq0, b0);
+    CalculateB5(sums[1], maq1, b1);
+  }
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i b[2]) {
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i b[2]) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9>(sum, index, ma, b);
+}
+
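+// The 343/444 names are the weights applied to three consecutive inputs:
+// sum_b444 = 4 * (b0 + b1 + b2) and sum_b343 = 3 * b0 + 4 * b1 + 3 * b2,
+// computed below as 4 * sum111 and 4 * sum111 - sum111 + b1.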
+inline void Store343_444(const __m256i b3[3], const ptrdiff_t x,
+                         __m256i sum_b343[2], __m256i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m256i b[3], sum_b111[2];
+  Prepare3_32(b3 + 0, b);
+  sum_b111[0] = Sum3_32(b);
+  sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+  sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[0] = _mm256_add_epi32(sum_b343[0], b[1]);
+  Prepare3_32(b3 + 1, b);
+  sum_b111[1] = Sum3_32(b);
+  sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+  sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[1] = _mm256_add_epi32(sum_b343[1], b[1]);
+  StoreAligned64(b444 + x, sum_b444);
+  StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+                           __m256i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m256i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+  StoreAligned32_ma(ma444 + x, *sum_ma444);
+  const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned32_ma(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+                           __m256i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m256i sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+  StoreAligned32_ma(ma444 + x, *sum_ma444);
+  const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  StoreAligned32_ma(ma343 + x, *sum_ma343);
+  Store343_444(b3, x + kMaStoreOffset, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma444, sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma444, sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma343, sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma343, sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+// Don't combine the following 2 functions; doing so would be slower.
+inline void Store343_444(const __m256i ma3[3], const __m256i b3[6],
+                         const ptrdiff_t x, __m256i* const sum_ma343_lo,
+                         __m256i* const sum_ma343_hi,
+                         __m256i* const sum_ma444_lo,
+                         __m256i* const sum_ma444_hi, __m256i sum_b343_lo[2],
+                         __m256i sum_b343_hi[2], __m256i sum_b444_lo[2],
+                         __m256i sum_b444_hi[2], uint16_t* const ma343,
+                         uint16_t* const ma444, uint32_t* const b343,
+                         uint32_t* const b444) {
+  __m256i sum_mat343[2], sum_mat444[2];
+  const __m256i sum_ma111_lo = Sum3WLo16(ma3);
+  sum_mat444[0] = _mm256_slli_epi16(sum_ma111_lo, 2);
+  const __m256i sum333_lo = _mm256_sub_epi16(sum_mat444[0], sum_ma111_lo);
+  sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]);
+  Store343_444(b3, x, sum_b343_lo, sum_b444_lo, b343, b444);
+  const __m256i sum_ma111_hi = Sum3WHi16(ma3);
+  sum_mat444[1] = _mm256_slli_epi16(sum_ma111_hi, 2);
+  *sum_ma444_lo = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x20);
+  *sum_ma444_hi = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x31);
+  StoreAligned32(ma444 + x + 0, *sum_ma444_lo);
+  StoreAligned32(ma444 + x + 16, *sum_ma444_hi);
+  const __m256i sum333_hi = _mm256_sub_epi16(sum_mat444[1], sum_ma111_hi);
+  sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]);
+  *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20);
+  *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31);
+  StoreAligned32(ma343 + x + 0, *sum_ma343_lo);
+  StoreAligned32(ma343 + x + 16, *sum_ma343_hi);
+  Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444_hi, b343, b444);
+}
+
+inline void Store343_444(const __m256i ma3[3], const __m256i b3[6],
+                         const ptrdiff_t x, __m256i* const sum_ma343_lo,
+                         __m256i* const sum_ma343_hi, __m256i sum_b343_lo[2],
+                         __m256i sum_b343_hi[2], uint16_t* const ma343,
+                         uint16_t* const ma444, uint32_t* const b343,
+                         uint32_t* const b444) {
+  __m256i sum_ma444[2], sum_b444[2], sum_mat343[2];
+  const __m256i sum_ma111_lo = Sum3WLo16(ma3);
+  sum_ma444[0] = _mm256_slli_epi16(sum_ma111_lo, 2);
+  const __m256i sum333_lo = _mm256_sub_epi16(sum_ma444[0], sum_ma111_lo);
+  sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]);
+  Store343_444(b3, x, sum_b343_lo, sum_b444, b343, b444);
+  const __m256i sum_ma111_hi = Sum3WHi16(ma3);
+  sum_ma444[1] = _mm256_slli_epi16(sum_ma111_hi, 2);
+  StoreAligned64_ma(ma444 + x, sum_ma444);
+  const __m256i sum333_hi = _mm256_sub_epi16(sum_ma444[1], sum_ma111_hi);
+  sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]);
+  *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20);
+  *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31);
+  StoreAligned32(ma343 + x + 0, *sum_ma343_lo);
+  StoreAligned32(ma343 + x + 16, *sum_ma343_hi);
+  Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444, b343, b444);
+}
+
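+// Each _mm256_permute2x128_si256 selector nibble picks one 128-bit half: e.g.
+// 0x21 pairs the high half of the first operand with the low half of the
+// second, stitching consecutive 4-element groups as diagrammed below.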
+inline void PermuteB(const __m256i t[4], __m256i b[7]) {
+  // Input:
+  //                             0     1      2     3  // b[0]
+  //                             4     5      6     7  // b[1]
+  //  8     9     10    11      24    25     26    27  // t[0]
+  // 12    13     14    15      28    29     30    31  // t[1]
+  // 16    17     18    19      32    33     34    35  // t[2]
+  // 20    21     22    23      36    37     38    39  // t[3]
+
+  // Output:
+  //  0     1      2     3       8     9     10    11  // b[0]
+  //  4     5      6     7      12    13     14    15  // b[1]
+  //  8     9     10    11      16    17     18    19  // b[2]
+  // 16    17     18    19      24    25     26    27  // b[3]
+  // 20    21     22    23      28    29     30    31  // b[4]
+  // 24    25     26    27      32    33     34    35  // b[5]
+  // 20    21     22    23      36    37     38    39  // b[6]
+  b[0] = _mm256_permute2x128_si256(b[0], t[0], 0x21);
+  b[1] = _mm256_permute2x128_si256(b[1], t[1], 0x21);
+  b[2] = _mm256_permute2x128_si256(t[0], t[2], 0x20);
+  b[3] = _mm256_permute2x128_si256(t[2], t[0], 0x30);
+  b[4] = _mm256_permute2x128_si256(t[3], t[1], 0x30);
+  b[5] = _mm256_permute2x128_si256(t[0], t[2], 0x31);
+  b[6] = t[3];
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s5[2][5], sq5[5][2];
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  s5[0][3] = Sum5Horizontal16(s[0]);
+  StoreAligned16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal16(s[1]);
+  StoreAligned16(sum5[4], s5[0][4]);
+  Sum5Horizontal32(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5Horizontal32(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint16_t* const src0, const uint16_t* const src1,
+    const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+    const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m256i sq[2][8], __m256i ma[3],
+    __m256i b[7]) {
+  __m256i s[2], s5[2][5], sq5[5][2], sum[2], index[2], t[4];
+  s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16);
+  s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16);
+  Square(s[0], sq[0] + 2);
+  Square(s[1], sq[1] + 2);
+  sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+  sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21);
+  sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+  sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21);
+  s5[0][3] = Sum5Horizontal16(src0 + 0, over_read_in_bytes + 0);
+  s5[1][3] = Sum5Horizontal16(src0 + 16, over_read_in_bytes + 32);
+  s5[0][4] = Sum5Horizontal16(src1 + 0, over_read_in_bytes + 0);
+  s5[1][4] = Sum5Horizontal16(src1 + 16, over_read_in_bytes + 32);
+  StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+  StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+  Sum5Horizontal32(sq[0], sq5[3]);
+  StoreAligned64(square_sum5[3] + x, sq5[3]);
+  Sum5Horizontal32(sq[1], sq5[4]);
+  StoreAligned64(square_sum5[4] + x, sq5[4]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+  s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48);
+  s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48);
+  Square(s[0], sq[0] + 6);
+  Square(s[1], sq[1] + 6);
+  sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21);
+  sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21);
+  sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21);
+  sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21);
+  Sum5Horizontal32(sq[0] + 4, sq5[3]);
+  StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+  Sum5Horizontal32(sq[1] + 4, sq5[4]);
+  StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+  CalculateIntermediate<25>(sum, index, ma, t, t + 2);
+  PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s5[5], sq5[5][2];
+  Square(s[1], sq + 2);
+  s5[3] = s5[4] = Sum5Horizontal16(s);
+  Sum5Horizontal32(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+    const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+    __m256i sq[8], __m256i ma[3], __m256i b[7]) {
+  const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+  __m256i s5[2][5], sq5[5][2], sum[2], index[2], t[4];
+  Square(s0, sq + 2);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+  s5[0][3] = Sum5Horizontal16(src + 0, over_read_in_bytes + 0);
+  s5[1][3] = Sum5Horizontal16(src + 16, over_read_in_bytes + 32);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5Horizontal32(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+  const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+  Square(s1, sq + 6);
+  sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+  sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+  Sum5Horizontal32(sq + 4, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+  CalculateIntermediate<25>(sum, index, ma, t, t + 2);
+  PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s3[3], sq3[3][2];
+  Square(s[1], sq + 2);
+  s3[2] = Sum3Horizontal16(s);
+  StoreAligned16(sum3[2], s3[2]);
+  Sum3Horizontal32(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[8],
+    __m256i ma[3], __m256i b[7]) {
+  __m256i s[2], s3[4], sq3[3][2], sum[2], index[2], t[4];
+  s[0] = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+  s[1] = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+  Square(s[0], sq + 2);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+  s3[2] = Sum3Horizontal16(src, over_read_in_bytes);
+  s3[3] = Sum3Horizontal16(src + 16, over_read_in_bytes + 32);
+  StoreAligned64(sum3[2] + x, s3 + 2);
+  Sum3Horizontal32(sq + 0, sq3[2]);
+  StoreAligned64(square_sum3[2] + x, sq3[2]);
+  LoadAligned32x2U16(sum3, x, s3);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  Square(s[1], sq + 6);
+  sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+  sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+  Sum3Horizontal32(sq + 4, sq3[2]);
+  StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate<9>(sum, index, ma, t, t + 2);
+  PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][3],
+    __m128i b3[2][10], __m128i* const ma5, __m128i b5[2]) {
+  __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  SumHorizontal16(s[0], &s3[2], &s5[3]);
+  SumHorizontal16(s[1], &s3[3], &s5[4]);
+  StoreAligned16(sum3[2], s3[2]);
+  StoreAligned16(sum3[3], s3[3]);
+  StoreAligned16(sum5[3], s5[3]);
+  StoreAligned16(sum5[4], s5[4]);
+  SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+  CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+  ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const uint16_t* const src0, const uint16_t* const src1,
+    const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, __m256i sq[2][8], __m256i ma3[2][3],
+    __m256i b3[2][7], __m256i ma5[3], __m256i b5[7]) {
+  __m256i s[2], s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2],
+      index_3[2][2], sum_5[2], index_5[2], t[4];
+  s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16);
+  s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16);
+  Square(s[0], sq[0] + 2);
+  Square(s[1], sq[1] + 2);
+  sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+  sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21);
+  sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+  sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21);
+  SumHorizontal16(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+                  &s5[1][3]);
+  SumHorizontal16(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+                  &s5[1][4]);
+  StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+  StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+  StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+  StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+  StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+  StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+  SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned64(square_sum3[2] + x, sq3[2]);
+  StoreAligned64(square_sum5[3] + x, sq5[3]);
+  StoreAligned64(square_sum3[3] + x, sq3[3]);
+  StoreAligned64(square_sum5[4] + x, sq5[4]);
+  LoadAligned32x2U16(sum3, x, s3[0]);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+                        &index_3[1][0]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+  s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48);
+  s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48);
+  Square(s[0], sq[0] + 6);
+  Square(s[1], sq[1] + 6);
+  sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21);
+  sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21);
+  sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21);
+  sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21);
+  SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+  StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+  StoreAligned64(square_sum3[3] + x + 16, sq3[3]);
+  StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1],
+                        &index_3[1][1]);
+  CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], t, t + 2);
+  PermuteB(t, b3[0]);
+  CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], t, t + 2);
+  PermuteB(t, b3[1]);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+  CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2);
+  PermuteB(t, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
+    __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
+  __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+  Square(s[1], sq + 2);
+  SumHorizontal16(s, &s3[2], &s5[3]);
+  SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    __m256i sq[8], __m256i ma3[3], __m256i ma5[3], __m256i b3[7],
+    __m256i b5[7]) {
+  const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+  __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2],
+      sum_5[2], index_5[2], t[4];
+  Square(s0, sq + 2);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+  SumHorizontal16(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+                  &s5[1][3]);
+  SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned32x2U16(sum3, x, s3[0]);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+  const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+  Square(s1, sq + 6);
+  sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+  sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+  SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]);
+  CalculateIntermediate<9>(sum_3, index_3, ma3, t, t + 2);
+  PermuteB(t, b3);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+  CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2);
+  PermuteB(t, b5);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+                                    const uint16_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2][2], ma0, sq_128[2][4], b0[2];
+  __m256i mas[3], sq[2][8], bs[10];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq_128[0]);
+  Square(s[1][0], sq_128[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0);
+  sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+  sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+  sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+  sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0[0], b0[0]);
+  bs[1] = SetrM128i(b0[1], b0[1]);
+
+  int x = 0;
+  do {
+    __m256i ma5[3], ma[2], b[4];
+    BoxFilterPreProcess5(
+        src0 + x + 8, src1 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+        x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned64_ma(ma565, ma);
+    Sum565(bs + 0, b + 0);
+    Sum565(bs + 3, b + 2);
+    StoreAligned64(b565, b + 0);
+    StoreAligned64(b565 + 16, b + 2);
+    sq[0][0] = sq[0][6];
+    sq[0][1] = sq[0][7];
+    sq[1][0] = sq[1][6];
+    sq[1][1] = sq[1][7];
+    mas[0] = mas[2];
+    bs[0] = bs[5];
+    bs[1] = bs[6];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint16_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  const ptrdiff_t overread_in_bytes_128 =
+      kOverreadInBytesPass2_128 - sizeof(*src) * width;
+  __m128i s[2], ma0, sq_128[4], b0[2];
+  __m256i mas[3], sq[8], bs[7];
+  s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0);
+  s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+  Square(s[0], sq_128);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, b0);
+  sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+  sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0[0], b0[0]);
+  bs[1] = SetrM128i(b0[1], b0[1]);
+
+  int x = 0;
+  do {
+    __m256i ma3[3];
+    BoxFilterPreProcess3(
+        src + x + 8, kOverreadInBytesPass2_256 + sizeof(*src) * (x + 8 - width),
+        x + 8, sum_width, scale, sum3, square_sum3, sq, mas, bs);
+    Prepare3_8(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 3, kMaStoreOffset, ma343, ma444, b343, b444);
+      ma444 += 32;
+      b444 += 32;
+    } else {
+      __m256i ma[2], b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned64_ma(ma343, ma);
+      Sum343(bs + 0, b + 0);
+      Sum343(bs + 3, b + 2);
+      StoreAligned64(b343 + 0, b + 0);
+      StoreAligned64(b343 + 16, b + 2);
+    }
+    sq[0] = sq[6];
+    sq[1] = sq[7];
+    mas[0] = mas[2];
+    bs[0] = bs[5];
+    bs[1] = bs[6];
+    ma343 += 32;
+    b343 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint16_t* const src0, const uint16_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2][4], ma3_128[2][3], ma5_128[3], sq_128[2][8], b3_128[2][10],
+      b5_128[10];
+  __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq_128[0]);
+  Square(s[1][0], sq_128[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+                        ma3_128, b3_128, &ma5_128[0], b5_128);
+  sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+  sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+  sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+  sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+  ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]);
+  ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]);
+  ma5[0] = SetrM128i(ma5_128[0], ma5_128[0]);
+  b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]);
+  b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]);
+  b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]);
+  b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]);
+  b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+  b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+  int x = 0;
+  do {
+    __m256i ma[2], b[4], ma3x[3], ma5x[3];
+    BoxFilterPreProcess(
+        src0 + x + 8, src1 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+        scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3,
+        ma5, b5);
+    Prepare3_8(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned64_ma(ma343[0] + x, ma);
+    Sum343(b3[0], b);
+    Sum343(b3[0] + 3, b + 2);
+    StoreAligned64(b343[0] + x, b);
+    StoreAligned64(b343[0] + x + 16, b + 2);
+    Prepare3_8(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 3, x + kMaStoreOffset, ma343[1], ma444,
+                   b343[1], b444);
+    Prepare3_8(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned64_ma(ma565, ma);
+    Sum565(b5, b);
+    StoreAligned64(b565, b);
+    Sum565(b5 + 3, b);
+    StoreAligned64(b565 + 16, b);
+    sq[0][0] = sq[0][6];
+    sq[0][1] = sq[0][7];
+    sq[1][0] = sq[1][6];
+    sq[1][1] = sq[1][7];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][5];
+    b3[0][1] = b3[0][6];
+    b3[1][0] = b3[1][5];
+    b3[1][1] = b3[1][6];
+    b5[0] = b5[5];
+    b5[1] = b5[6];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+                                       const __m256i b[2]) {
+  const __m256i ma_x_src_lo = VmullLo16(ma, src);
+  const __m256i ma_x_src_hi = VmullHi16(ma, src);
+  const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm256_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
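+// Pass 1 filters two rows per iteration: the first row adds the previous and
+// current 565-filtered ma/b pairs (one extra bit, hence shift 5), while the
+// second row uses the current pair alone with shift 4.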
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+                                            const __m256i ma[2],
+                                            const __m256i b[2][2]) {
+  const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+  __m256i b_sum[2];
+  b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+                                            const __m256i ma[3],
+                                            const __m256i b[3][2]) {
+  const __m256i ma_sum = Sum3_16(ma);
+  __m256i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+  const __m256i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+  return _mm256_add_epi16(src, vv);
+}
+
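+// w0 is packed into the low and w2 into the high 16 bits of each 32-bit lane,
+// so one madd computes w0 * filter[0] + w2 * filter[1] per lane. The uint16_t
+// cast keeps a negative w0 from sign-extending into the w2 half.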
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+                                          const __m256i filter[2], const int w0,
+                                          const int w2) {
+  __m256i v[2];
+  const __m256i w0_w2 =
+      _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+  const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+  v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+                                          const __m256i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m256i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
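+// Clamp the filtered output to the 10-bit pixel range [0, 1023].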
+inline void ClipAndStore(uint16_t* const dst, const __m256i val) {
+  const __m256i val0 = _mm256_max_epi16(val, _mm256_setzero_si256());
+  const __m256i val1 = _mm256_min_epi16(val0, _mm256_set1_epi16(1023));
+  StoreUnaligned32(dst, val1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2][2], ma0, sq_128[2][4], b0[2];
+  __m256i mas[3], sq[2][8], bs[7];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq_128[0]);
+  Square(s[1][0], sq_128[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0);
+  sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+  sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+  sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+  sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0[0], b0[0]);
+  bs[1] = SetrM128i(b0[1], b0[1]);
+
+  int x = 0;
+  do {
+    __m256i ma5[3], ma[4], b[4][2];
+    BoxFilterPreProcess5(
+        src0 + x + 8, src1 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+        x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[2] = Sum565Lo(ma5);
+    ma[3] = Sum565Hi(ma5);
+    ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20);
+    ma[3] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31);
+    StoreAligned32(ma565[1] + x + 0, ma[1]);
+    StoreAligned32(ma565[1] + x + 16, ma[3]);
+    Sum565(bs + 0, b[1]);
+    Sum565(bs + 3, b[3]);
+    StoreAligned64(b565[1] + x, b[1]);
+    StoreAligned64(b565[1] + x + 16, b[3]);
+    const __m256i sr0_lo = LoadUnaligned32(src + x + 0);
+    ma[0] = LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0]);
+    const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0);
+    ClipAndStore(dst + x + 0, d0);
+    const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+    ma[2] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, b[2]);
+    const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma + 2, b + 2);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0);
+    ClipAndStore(dst + x + 16, d1);
+    const __m256i sr1_lo = LoadUnaligned32(src + stride + x + 0);
+    const __m256i p10 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+    const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p10, w0);
+    ClipAndStore(dst + stride + x + 0, d10);
+    const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16);
+    const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[3], b[3]);
+    const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+    ClipAndStore(dst + stride + x + 16, d11);
+    sq[0][0] = sq[0][6];
+    sq[0][1] = sq[0][7];
+    sq[1][0] = sq[1][6];
+    sq[1][1] = sq[1][7];
+    mas[0] = mas[2];
+    bs[0] = bs[5];
+    bs[1] = bs[6];
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2], ma0[2], sq_128[8], b0[6];
+  __m256i mas[3], sq[8], bs[7];
+  s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq_128);
+  BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq_128, &ma0[0],
+                                b0);
+  sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+  sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+  mas[0] = SetrM128i(ma0[0], ma0[0]);
+  bs[0] = SetrM128i(b0[0], b0[0]);
+  bs[1] = SetrM128i(b0[1], b0[1]);
+
+  int x = 0;
+  do {
+    __m256i ma5[3], ma[4], b[4][2];
+    BoxFilterPreProcess5LastRow(
+        src0 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+        x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[2] = Sum565Lo(ma5);
+    ma[3] = Sum565Hi(ma5);
+    Sum565(bs + 0, b[1]);
+    Sum565(bs + 3, b[3]);
+    const __m256i sr0_lo = LoadUnaligned32(src + x + 0);
+    ma[0] = LoadAligned32(ma565 + x);
+    ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20);
+    LoadAligned64(b565 + x, b[0]);
+    const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0);
+    ClipAndStore(dst + x + 0, d0);
+    const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+    ma[0] = LoadAligned32(ma565 + x + 16);
+    ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31);
+    LoadAligned64(b565 + x + 16, b[2]);
+    const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma, b + 2);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0);
+    ClipAndStore(dst + x + 16, d1);
+    sq[0] = sq[6];
+    sq[1] = sq[7];
+    mas[0] = mas[2];
+    bs[0] = bs[5];
+    bs[1] = bs[6];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes_128 =
+      kOverreadInBytesPass2_128 - sizeof(*src0) * width;
+  __m128i s0[2], ma0, sq_128[4], b0[2];
+  __m256i mas[3], sq[8], bs[7];
+  s0[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes_128 + 0);
+  s0[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes_128 + 16);
+  Square(s0[0], sq_128);
+  BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, b0);
+  sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+  sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0[0], b0[0]);
+  bs[1] = SetrM128i(b0[1], b0[1]);
+
+  int x = 0;
+  do {
+    __m256i ma[4], b[4][2], ma3[3];
+    BoxFilterPreProcess3(
+        src0 + x + 8,
+        kOverreadInBytesPass2_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+        sum_width, scale, sum3, square_sum3, sq, mas, bs);
+    Prepare3_8(mas, ma3);
+    Store343_444(ma3, bs, x, &ma[2], &ma[3], b[2], b[3], ma343[2], ma444[1],
+                 b343[2], b444[1]);
+    const __m256i sr_lo = LoadUnaligned32(src + x + 0);
+    const __m256i sr_hi = LoadUnaligned32(src + x + 16);
+    ma[0] = LoadAligned32(ma343[0] + x);
+    ma[1] = LoadAligned32(ma444[0] + x);
+    LoadAligned64(b343[0] + x, b[0]);
+    LoadAligned64(b444[0] + x, b[1]);
+    const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    ma[1] = LoadAligned32(ma343[0] + x + 16);
+    ma[2] = LoadAligned32(ma444[0] + x + 16);
+    LoadAligned64(b343[0] + x + 16, b[1]);
+    LoadAligned64(b444[0] + x + 16, b[2]);
+    const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 16, d1);
+    sq[0] = sq[6];
+    sq[1] = sq[7];
+    mas[0] = mas[2];
+    bs[0] = bs[5];
+    bs[1] = bs[6];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2][4], ma3_128[2][3], ma5_0, sq_128[2][8], b3_128[2][10], b5_128[2];
+  __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq_128[0]);
+  Square(s[1][0], sq_128[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+                        ma3_128, b3_128, &ma5_0, b5_128);
+  sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+  sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+  sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+  sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+  ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]);
+  ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]);
+  b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]);
+  b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]);
+  b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]);
+  b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+  b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+  int x = 0;
+  do {
+    __m256i ma[3][4], mat[3][3], b[3][3][2], bt[3][3][2], p[2][2], ma3x[2][3],
+        ma5x[3];
+    BoxFilterPreProcess(
+        src0 + x + 8, src1 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+        scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3,
+        ma5, b5);
+    Prepare3_8(ma3[0], ma3x[0]);
+    Prepare3_8(ma3[1], ma3x[1]);
+    Prepare3_8(ma5, ma5x);
+    Store343_444(ma3x[0], b3[0], x, &ma[1][2], &mat[1][2], &ma[2][1],
+                 &mat[2][1], b[1][2], bt[1][2], b[2][1], bt[2][1], ma343[2],
+                 ma444[1], b343[2], b444[1]);
+    Store343_444(ma3x[1], b3[1], x, &ma[2][2], &mat[2][2], b[2][2], bt[2][2],
+                 ma343[3], ma444[2], b343[3], b444[2]);
+
+    ma[0][2] = Sum565Lo(ma5x);
+    ma[0][3] = Sum565Hi(ma5x);
+    ma[0][1] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x20);
+    ma[0][3] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x31);
+    StoreAligned32(ma565[1] + x + 0, ma[0][1]);
+    StoreAligned32(ma565[1] + x + 16, ma[0][3]);
+    Sum565(b5, b[0][1]);
+    StoreAligned64(b565[1] + x, b[0][1]);
+    const __m256i sr0_lo = LoadUnaligned32(src + x);
+    const __m256i sr1_lo = LoadUnaligned32(src + stride + x);
+    ma[0][0] = LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned32(ma343[0] + x);
+    ma[1][1] = LoadAligned32(ma444[0] + x);
+    // Keeping the following 4 redundant lines is faster. There are not enough
+    // registers available, so these values would otherwise be spilled to
+    // memory and reloaded, which is even slower.
+    ma[1][2] = LoadAligned32(ma343[2] + x);  // Redundant line 1.
+    LoadAligned64(b343[0] + x, b[1][0]);
+    LoadAligned64(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    ma[2][0] = LoadAligned32(ma343[1] + x);
+    ma[2][1] = LoadAligned32(ma444[1] + x);  // Redundant line 2.
+    LoadAligned64(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ClipAndStore(dst + x, d00);
+    const __m256i d10x = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+    ClipAndStore(dst + stride + x, d10x);
+
+    Sum565(b5 + 3, bt[0][1]);
+    StoreAligned64(b565[1] + x + 16, bt[0][1]);
+    const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+    const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16);
+    ma[0][2] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, bt[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0] + 2, bt[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][3], bt[0][1]);
+    mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+    mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+    mat[1][2] = LoadAligned32(ma343[2] + x + 16);  // Redundant line 3.
+    LoadAligned64(b343[0] + x + 16, bt[1][0]);
+    LoadAligned64(b444[0] + x + 16, bt[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], bt[1]);
+    mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+    mat[2][1] = LoadAligned32(ma444[1] + x + 16);  // Redundant line 4.
+    LoadAligned64(b343[1] + x + 16, bt[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], bt[2]);
+    const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+    ClipAndStore(dst + x + 16, d01);
+    const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+    ClipAndStore(dst + stride + x + 16, d11);
+
+    sq[0][0] = sq[0][6];
+    sq[0][1] = sq[0][7];
+    sq[1][0] = sq[1][6];
+    sq[1][1] = sq[1][7];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][5];
+    b3[0][1] = b3[0][6];
+    b3[1][0] = b3[1][5];
+    b3[1][1] = b3[1][6];
+    b5[0] = b5[5];
+    b5[1] = b5[6];
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2], ma3_0, ma5_0, sq_128[4], b3_128[2], b5_128[2];
+  __m256i ma3[3], ma5[3], sq[8], b3[7], b5[7];
+  s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq_128);
+  BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+                               sq_128, &ma3_0, &ma5_0, b3_128, b5_128);
+  sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+  sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+  ma3[0] = SetrM128i(ma3_0, ma3_0);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0] = SetrM128i(b3_128[0], b3_128[0]);
+  b3[1] = SetrM128i(b3_128[1], b3_128[1]);
+  b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+  b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+  int x = 0;
+  do {
+    __m256i ma[4], mat[4], b[3][2], bt[3][2], ma3x[3], ma5x[3], p[2];
+    BoxFilterPreProcessLastRow(
+        src0 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+        x + 8, scales, sum3, sum5, square_sum3, square_sum5, sq, ma3, ma5, b3,
+        b5);
+    Prepare3_8(ma3, ma3x);
+    Prepare3_8(ma5, ma5x);
+    ma[2] = Sum565Lo(ma5x);
+    Sum565(b5, b[1]);
+    mat[1] = Sum565Hi(ma5x);
+    Sum565(b5 + 3, bt[1]);
+    ma[3] = Sum343Lo(ma3x);
+    Sum343(b3, b[2]);
+    mat[2] = Sum343Hi(ma3x);
+    Sum343(b3 + 3, bt[2]);
+
+    const __m256i sr_lo = LoadUnaligned32(src + x);
+    ma[0] = LoadAligned32(ma565 + x);
+    ma[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x20);
+    mat[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x31);
+    LoadAligned64(b565 + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[0] = LoadAligned32(ma343 + x);
+    ma[1] = LoadAligned32(ma444 + x);
+    ma[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x20);
+    LoadAligned64(b343 + x, b[0]);
+    LoadAligned64(b444 + x, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+    const __m256i sr_hi = LoadUnaligned32(src + x + 16);
+    mat[0] = LoadAligned32(ma565 + x + 16);
+    LoadAligned64(b565 + x + 16, bt[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_hi, mat, bt);
+    mat[0] = LoadAligned32(ma343 + x + 16);
+    mat[1] = LoadAligned32(ma444 + x + 16);
+    mat[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x31);
+    LoadAligned64(b343 + x + 16, bt[0]);
+    LoadAligned64(b444 + x + 16, bt[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, mat, bt);
+    const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 16, d1);
+
+    sq[0] = sq[6];
+    sq[1] = sq[7];
+    ma3[0] = ma3[2];
+    ma5[0] = ma5[2];
+    b3[0] = b3[5];
+    b3[1] = b3[6];
+    b5[0] = b5[5];
+    b5[1] = b5[6];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint16_t* src,
+    const ptrdiff_t stride, const uint16_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
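+  // The three weights sum to 1 << kSgrProjPrecisionBits, so with 7-bit
+  // precision w0 + w1 + w2 == 128.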
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
+                         b343, b444[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
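+  // Handle the remaining one or two rows. This is skipped only when
+  // height == 1, in which case all the work is done by BoxFilterLastRow()
+  // below.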
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+                     b444[0], b565[0], dst);
+  }
+}
+
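+// BoxFilterProcess() above and BoxFilterProcessPass1() below consume two
+// source rows per loop iteration; the Circulate*PointersBy2() calls rotate
+// the ring buffers of box sums so each row's sums are computed only once.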
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
+                          b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint16_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 32, up to 31 extra pixels are written to
+// |dest| at the end of each row (e.g. width == 33 writes 64 pixels per row).
+// Overwriting the output is safe, since the extra pixels are not part of the
+// visible frame.
+void SelfGuidedFilter_AVX2(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  auto* const dst = static_cast<uint16_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_AVX2(WienerFilter)
+  dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+#if DSP_ENABLED_10BPP_AVX2(SelfGuidedFilter)
+  dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
+}
+
+}  // namespace
+
+void LoopRestorationInit10bpp_AVX2() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_AVX2() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc
new file mode 100644 (file)
index 0000000..029e168
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc
@@ -0,0 +1,2543 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2],
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
+  const __m128i offsets = _mm_set1_epi16(-offset);
+  const __m128i limits = _mm_set1_epi16(limit - offset);
+  const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+  const __m128i sum0 = _mm_add_epi32(s[0], round);
+  const __m128i sum1 = _mm_add_epi32(s[1], round);
+  const __m128i rounded_sum0 = _mm_srai_epi32(sum0, kInterRoundBitsHorizontal);
+  const __m128i rounded_sum1 = _mm_srai_epi32(sum1, kInterRoundBitsHorizontal);
+  const __m128i rounded_sum = _mm_packs_epi32(rounded_sum0, rounded_sum1);
+  const __m128i d0 = _mm_max_epi16(rounded_sum, offsets);
+  const __m128i d1 = _mm_min_epi16(d0, limits);
+  StoreAligned16(wiener_buffer, d1);
+}
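+// Illustrative arithmetic for the clip above, assuming the usual constant
+// values kWienerFilterBits == 7 and kInterRoundBitsHorizontal == 3:
+//   offset = 1 << (10 + 7 - 3 - 1) = 8192
+//   limit = (8192 << 2) - 1 = 32767
+// so the stored values lie in [-8192, 32767 - 8192] and fit in int16_t.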
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  __m128i filter[2];
+  filter[0] = _mm_shuffle_epi32(coefficients, 0x0);
+  filter[1] = _mm_shuffle_epi32(coefficients, 0x55);
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[7], madds[4];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
+      s[5] = LoadUnaligned16(src + x + 5);
+      s[6] = LoadUnaligned16(src + x + 6);
+      const __m128i s06 = _mm_add_epi16(s[0], s[6]);
+      const __m128i s15 = _mm_add_epi16(s[1], s[5]);
+      const __m128i s24 = _mm_add_epi16(s[2], s[4]);
+      const __m128i ss0 = _mm_unpacklo_epi16(s06, s15);
+      const __m128i ss1 = _mm_unpackhi_epi16(s06, s15);
+      const __m128i ss2 = _mm_unpacklo_epi16(s24, s[3]);
+      const __m128i ss3 = _mm_unpackhi_epi16(s24, s[3]);
+      madds[0] = _mm_madd_epi16(ss0, filter[0]);
+      madds[1] = _mm_madd_epi16(ss1, filter[0]);
+      madds[2] = _mm_madd_epi16(ss2, filter[1]);
+      madds[3] = _mm_madd_epi16(ss3, filter[1]);
+      madds[0] = _mm_add_epi32(madds[0], madds[2]);
+      madds[1] = _mm_add_epi32(madds[1], madds[3]);
+      WienerHorizontalClip(madds, *wiener_buffer + x);
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m128i filter =
+      _mm_shuffle_epi8(coefficients, _mm_set1_epi32(0x05040302));
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[5], madds[2];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
+      const __m128i s04 = _mm_add_epi16(s[0], s[4]);
+      const __m128i s13 = _mm_add_epi16(s[1], s[3]);
+      const __m128i s2d = _mm_add_epi16(s[2], s[2]);
+      const __m128i s0m = _mm_sub_epi16(s04, s2d);
+      const __m128i s1m = _mm_sub_epi16(s13, s2d);
+      const __m128i ss0 = _mm_unpacklo_epi16(s0m, s1m);
+      const __m128i ss1 = _mm_unpackhi_epi16(s0m, s1m);
+      madds[0] = _mm_madd_epi16(ss0, filter);
+      madds[1] = _mm_madd_epi16(ss1, filter);
+      const __m128i s2_lo = _mm_unpacklo_epi16(s[2], _mm_setzero_si128());
+      const __m128i s2_hi = _mm_unpackhi_epi16(s[2], _mm_setzero_si128());
+      const __m128i s2x128_lo = _mm_slli_epi32(s2_lo, 7);
+      const __m128i s2x128_hi = _mm_slli_epi32(s2_hi, 7);
+      madds[0] = _mm_add_epi32(madds[0], s2x128_lo);
+      madds[1] = _mm_add_epi32(madds[1], s2x128_hi);
+      WienerHorizontalClip(madds, *wiener_buffer + x);
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  const auto filter = _mm_shuffle_epi32(coefficients, 0x55);
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[3], madds[2];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      const __m128i s02 = _mm_add_epi16(s[0], s[2]);
+      const __m128i ss0 = _mm_unpacklo_epi16(s02, s[1]);
+      const __m128i ss1 = _mm_unpackhi_epi16(s02, s[1]);
+      madds[0] = _mm_madd_epi16(ss0, filter);
+      madds[1] = _mm_madd_epi16(ss1, filter);
+      WienerHorizontalClip(madds, *wiener_buffer + x);
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      const __m128i s = LoadUnaligned16(src + x);
+      const __m128i d = _mm_slli_epi16(s, 4);
+      StoreAligned16(*wiener_buffer + x, d);
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline __m128i WienerVertical7(const __m128i a[4], const __m128i filter[4]) {
+  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+  const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+  const __m128i madd3 = _mm_madd_epi16(a[3], filter[3]);
+  const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+  const __m128i madd23 = _mm_add_epi32(madd2, madd3);
+  const __m128i sum = _mm_add_epi32(madd01, madd23);
+  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[3], const __m128i filter[3]) {
+  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+  const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+  const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+  const __m128i sum = _mm_add_epi32(madd01, madd2);
+  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a[2], const __m128i filter[2]) {
+  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+  const __m128i sum = _mm_add_epi32(madd0, madd1);
+  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVerticalClip(const __m128i s[2]) {
+  const __m128i d = _mm_packus_epi32(s[0], s[1]);
+  return _mm_min_epu16(d, _mm_set1_epi16(1023));
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+                                     const __m128i filter[2]) {
+  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m128i b[4], c[2];
+  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm_unpacklo_epi16(a[4], a[5]);
+  b[3] = _mm_unpacklo_epi16(a[6], round);
+  c[0] = WienerVertical7(b, filter);
+  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm_unpackhi_epi16(a[4], a[5]);
+  b[3] = _mm_unpackhi_epi16(a[6], round);
+  c[1] = WienerVertical7(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+                                     const __m128i filter[3]) {
+  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m128i b[3], c[2];
+  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm_unpacklo_epi16(a[4], round);
+  c[0] = WienerVertical5(b, filter);
+  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm_unpackhi_epi16(a[4], round);
+  c[1] = WienerVertical5(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3],
+                                     const __m128i filter[2]) {
+  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m128i b[2], c[2];
+  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm_unpacklo_epi16(a[2], round);
+  c[0] = WienerVertical3(b, filter);
+  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm_unpackhi_epi16(a[2], round);
+  c[1] = WienerVertical3(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter[2], __m128i a[7]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+  a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+  a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+  return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter[3], __m128i a[5]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+  return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter[2], __m128i a[3]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m128i c = LoadLo8(coefficients);
+  __m128i filter[4];
+  filter[0] = _mm_shuffle_epi32(c, 0x0);
+  filter[1] = _mm_shuffle_epi32(c, 0x55);
+  filter[2] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+  filter[3] =
+      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[8], d[2];
+      d[0] = WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+      a[7] = LoadAligned16(wiener_buffer + x + 7 * width);
+      d[1] = WienerVerticalFilter7(a + 1, filter);
+      StoreAligned16(dst + x, d[0]);
+      StoreAligned16(dst + dst_stride + x, d[1]);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[7];
+      const __m128i d =
+          WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+      StoreAligned16(dst + x, d);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[3], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m128i c = LoadLo8(coefficients);
+  __m128i filter[3];
+  filter[0] = _mm_shuffle_epi32(c, 0x0);
+  filter[1] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+  filter[2] =
+      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[6], d[2];
+      d[0] = WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+      a[5] = LoadAligned16(wiener_buffer + x + 5 * width);
+      d[1] = WienerVerticalFilter5(a + 1, filter);
+      StoreAligned16(dst + x, d[0]);
+      StoreAligned16(dst + dst_stride + x, d[1]);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[5];
+      const __m128i d =
+          WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+      StoreAligned16(dst + x, d);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[2], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  __m128i filter[2];
+  filter[0] = _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+  filter[1] =
+      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[4], d[2];
+      d[0] = WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+      a[3] = LoadAligned16(wiener_buffer + x + 3 * width);
+      d[1] = WienerVerticalFilter3(a + 1, filter);
+      StoreAligned16(dst + x, d[0]);
+      StoreAligned16(dst + dst_stride + x, d[1]);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[3];
+      const __m128i d =
+          WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+      StoreAligned16(dst + x, d);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint16_t* const dst) {
+  const __m128i a = LoadAligned16(wiener_buffer);
+  const __m128i b = _mm_add_epi16(a, _mm_set1_epi16(8));
+  const __m128i c = _mm_srai_epi16(b, 4);
+  const __m128i d = _mm_max_epi16(c, _mm_setzero_si128());
+  const __m128i e = _mm_min_epi16(d, _mm_set1_epi16(1023));
+  StoreAligned16(dst, e);
+}
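+// The kernel above applies a rounding shift, (a + 8) >> 4, undoing the << 4
+// applied by WienerHorizontalTap1(), then clamps to the 10-bit range
+// [0, 1023].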
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint16_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+void WienerFilter_SSE4_1(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  const __m128i coefficients_horizontal =
+      LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // vertical filtering.
+  // Over-writes up to 15 values.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  auto* dst = static_cast<uint16_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical + 1, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical + 2, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 4;
+constexpr int kOverreadInBytesPass2 = 8;
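+// For example, in Pass 1 (padding of 3 pixels on each side) with width a
+// multiple of 8, the overread is 8 - 0 - 2 * 3 == 2 pixels, i.e. 4 bytes of
+// uint16_t; Pass 2 likewise gives 4 pixels == 8 bytes.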
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[2]) {
+  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+  dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[3]) {
+  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+  dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+  dst[0] = LoadAligned16(src + 0);
+  dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+                                 const ptrdiff_t border, __m128i dst[2]) {
+  dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+  dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[2][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+  LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[3][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+  LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+  StoreAligned32U32(dst + 0, src + 0);
+  StoreAligned32U32(dst + 8, src + 2);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate very inefficient code, making the
+// whole decoder up to 15% slower.
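+//
+// For reference, the zero-extending unpack used below is equivalent on the
+// low half: _mm_cvtepu8_epi16(x) == _mm_unpacklo_epi8(x, _mm_setzero_si128())
+// and _mm_cvtepu16_epi32(x) == _mm_unpacklo_epi16(x, _mm_setzero_si128()).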
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrU16(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
+  return _mm_srli_epi16(sum, src1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srli_epi32(sum, src1);
+}
+
+inline void Square(const __m128i src, __m128i dst[2]) {
+  const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
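+  // Each 32-bit lane of |s0| and |s1| now holds a pixel value in its low half
+  // and zero in its high half, so _mm_madd_epi16(s, s) yields s * s + 0 * 0,
+  // the 32-bit square.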
+  dst[0] = _mm_madd_epi16(s0, s0);
+  dst[1] = _mm_madd_epi16(s1, s1);
+}
+
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_32(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
+  dst[4] = src[1];
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi16(src0, src1);
+  return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi32(src0, src1);
+  return _mm_add_epi32(sum, src2);
+}
+
+inline __m128i Sum3_32(const __m128i src[3]) {
+  return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+  const __m128i sum = VaddlLo8(src[0], src[1]);
+  return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
+  const __m128i sum = VaddlHi8(src[0], src[1]);
+  return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+  const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+  const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+                       const __m128i* const src2, const __m128i* const src3,
+                       const __m128i* const src4) {
+  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+  const __m128i sum = _mm_add_epi32(sum01, sum23);
+  return _mm_add_epi32(sum, *src4);
+}
+
+inline __m128i Sum5_32(const __m128i src[5]) {
+  return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum3Horizontal16(const __m128i src[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  return Sum3_16(s);
+}
+
+inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum3_32(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum3_32(s);
+}
+
+inline __m128i Sum5Horizontal16(const __m128i src[2]) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  return Sum5_16(s);
+}
+
+inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[5];
+  Prepare5_32(src + 0, s);
+  dst[0] = Sum5_32(s);
+  Prepare5_32(src + 1, s);
+  dst[1] = Sum5_32(s);
+}
+
+void SumHorizontal16(const __m128i src[2], __m128i* const row3,
+                     __m128i* const row5) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
+  *row3 = Sum3_16(s + 1);
+  *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const __m128i src[3], __m128i* const row3_0,
+                            __m128i* const row3_1, __m128i* const row5_0,
+                            __m128i* const row5_1) {
+  SumHorizontal16(src + 0, row3_0, row5_0);
+  SumHorizontal16(src + 1, row3_1, row5_1);
+}
+
+void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
+  *row_sq3 = Sum3_32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
+                            __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+                            __m128i* const row_sq5_1) {
+  __m128i s[5];
+  Prepare5_32(src + 0, s);
+  SumHorizontal32(s, row_sq3_0, row_sq5_0);
+  Prepare5_32(src + 1, s);
+  SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WLo16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WHi16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343(const __m128i src[3]) {
+  const __m128i sum = Sum3_32(src);
+  const __m128i sum3 = Sum3_32(sum, sum, sum);
+  return _mm_add_epi32(sum3, src[1]);
+}
+
+inline void Sum343(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum343(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum343(s);
+}
+
+inline __m128i Sum565Lo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565(const __m128i src[3]) {
+  const __m128i sum = Sum3_32(src);
+  const __m128i sum4 = _mm_slli_epi32(sum, 2);
+  const __m128i sum5 = _mm_add_epi32(sum4, sum);
+  return _mm_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum565(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum565(s);
+}
+
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s[3], sq[6];
+    s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+      s[1] = LoadUnaligned16Msan(
+          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+      x -= 16;
+      src += 16;
+      s[2] = LoadUnaligned16Msan(
+          src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned32U16(sum3, row3);
+      StoreAligned32U16(sum5, row5);
+      SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 0, row_sq3);
+      StoreAligned32U32(square_sum5 + 0, row_sq5);
+      SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 8, row_sq3);
+      StoreAligned32U32(square_sum5 + 8, row_sq5);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sum3 += 16;
+      sum5 += 16;
+      square_sum3 += 16;
+      square_sum5 += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sum3 += sum_stride - sum_width;
+    sum5 += sum_stride - sum_width;
+    square_sum3 += sum_stride - sum_width;
+    square_sum5 += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  const ptrdiff_t overread_in_bytes =
+      ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+      sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s[3], sq[6];
+    s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row[2], row_sq[4];
+      s[1] = LoadUnaligned16Msan(
+          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+      x -= 16;
+      src += 16;
+      s[2] = LoadUnaligned16Msan(
+          src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      if (size == 3) {
+        row[0] = Sum3Horizontal16(s + 0);
+        row[1] = Sum3Horizontal16(s + 1);
+        Sum3Horizontal32(sq + 0, row_sq + 0);
+        Sum3Horizontal32(sq + 2, row_sq + 2);
+      } else {
+        row[0] = Sum5Horizontal16(s + 0);
+        row[1] = Sum5Horizontal16(s + 1);
+        Sum5Horizontal32(sq + 0, row_sq + 0);
+        Sum5Horizontal32(sq + 2, row_sq + 2);
+      }
+      StoreAligned32U16(sums, row);
+      StoreAligned64U32(square_sums, row_sq);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sums += 16;
+      square_sums += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sums += sum_stride - sum_width;
+    square_sums += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
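+// A scalar sketch of the shift decomposition above (illustrative only):
+//   x * 9 == x + (x << 3)
+//   x * 25 == x + (x << 3) + (x << 4)
+// e.g. 7 * 25 == 7 + 56 + 112 == 175.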
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i b = VrshrU16(sum, 2);
+  const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
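+  // 255 * 164 = 41820 would overflow the signed 16-bit lanes of
+  // _mm_maddubs_epi16(), so multiply by 41 (255 * 41 = 10455 fits) and shift
+  // by 2 fewer bits below to compensate.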
+  // |ma| is in range [0, 255].
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
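+  // ((1 << 12) + 4) / 9 = 4100 / 9 = 455 (integer division).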
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+  b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  static_assert(offset == 0 || offset == 8, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // The values are not actually stored to and loaded from memory; the
+  // compiler keeps them in a 64-bit general-purpose register, which is faster
+  // than using _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  // offset == 0 is assumed to be the first call to this function. The value is
+  // mov'd to avoid -Wuninitialized warnings under gcc. mov should be at least
+  // as fast as, if not faster than, pinsrb.
+  if (offset == 0) {
+    *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+  } else {
+    *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+  }
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  __m128i maq;
+  if (offset == 0) {
+    maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  } else {
+    maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  }
+  if (n == 9) {
+    CalculateB3(sum, maq, b);
+  } else {
+    CalculateB5(sum, maq, b);
+  }
+}
+
+// Set the shuffle control mask to (1xxxxxxx)b for indices out of the range
+// [0, 15] so that the shuffle result is 0. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+  __m128i mask;
+  mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+  mask = _mm_or_si128(mask, index);
+  return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+                           const int threshold) {
+  const __m128i thresholds = _mm_set1_epi8(threshold - 128);
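+  // |index| was biased by -128, so comparing against threshold - 128 tests
+  // index > threshold. _mm_cmpgt_epi8() yields -1 in matching lanes, and
+  // adding -1 decrements |value| in exactly those lanes.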
+  const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+  return _mm_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i* const ma, __m128i b0[2],
+                                  __m128i b1[2]) {
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+  const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+  const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+  const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+  __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // All elements whose indices are less than 48 are set to 0.
+  // Get shuffle results for indices in range [0, 15].
+  *ma = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  *ma = _mm_or_si128(*ma, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res2 = ShuffleIndex(c2, idx);
+  *ma = _mm_or_si128(*ma, res2);
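+  // Example: for an element with index 20, the first shuffle gives 0
+  // (20 > 15), the second sees 20 - 16 = 4 and returns kSgrMaLookup[20], and
+  // the third sees -12 (sign bit set) and gives 0; the ORs keep the result.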
+
+  // For elements whose indices are larger than 47, the table values change
+  // only rarely as the index increases, so we calculate them with comparison
+  // and arithmetic operations instead.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[0], maq0, b0);
+  const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[1], maq1, b1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i ma[2], __m128i b[4]) {
+  __m128i mas;
+  CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
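+  // |mas| holds 16 ma values: put its low 8 bytes into the high half of
+  // ma[0] and its high 8 bytes into the low half of ma[1].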
+  ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+  ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: Replacing the slow LookupIntermediate() with CalculateIntermediate()
+// when calculating 16 intermediate data points has been tried, but the
+// compiler generates even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i b[2]) {
+  static_assert(offset == 0 || offset == 8, "");
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i b[2]) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m128i b3[3], const ptrdiff_t x,
+                         __m128i sum_b343[2], __m128i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m128i b[3], sum_b111[2];
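+  // |sum_b444| stores 4 * (b0 + b1 + b2); |sum_b343| stores
+  // 3 * (b0 + b1 + b2) + b1 = 3 * b0 + 4 * b1 + 3 * b2, i.e. the (4, 4, 4)
+  // and (3, 4, 3) weightings the buffer names refer to.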
+  Prepare3_32(b3 + 0, b);
+  sum_b111[0] = Sum3_32(b);
+  sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+  sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[0] = _mm_add_epi32(sum_b343[0], b[1]);
+  Prepare3_32(b3 + 1, b);
+  sum_b111[1] = Sum3_32(b);
+  sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+  sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[1] = _mm_add_epi32(sum_b343[1], b[1]);
+  StoreAligned32U32(b444 + x, sum_b444);
+  StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[3],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[3],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma444, sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma444, sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma343, sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma343, sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const __m128i s[2][4], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s5[2][5], sq5[5][2];
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  s5[0][3] = Sum5Horizontal16(s[0]);
+  StoreAligned16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal16(s[1]);
+  StoreAligned16(sum5[4], s5[0][4]);
+  Sum5Horizontal32(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5Horizontal32(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const __m128i s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma[2],
+    __m128i b[6]) {
+  __m128i s5[2][5], sq5[5][2];
+  Square(s[0][2], sq[0] + 4);
+  Square(s[1][2], sq[1] + 4);
+  s5[0][3] = Sum5Horizontal16(s[0] + 1);
+  s5[1][3] = Sum5Horizontal16(s[0] + 2);
+  StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+  s5[0][4] = Sum5Horizontal16(s[1] + 1);
+  s5[1][4] = Sum5Horizontal16(s[1] + 2);
+  StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+  Sum5Horizontal32(sq[0] + 2, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  Sum5Horizontal32(sq[1] + 2, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+  Square(s[0][3], sq[0] + 6);
+  Square(s[1][3], sq[1] + 6);
+  Sum5Horizontal32(sq[0] + 4, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  Sum5Horizontal32(sq[1] + 4, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s5[5], sq5[5][2];
+  Square(s[1], sq + 2);
+  s5[3] = s5[4] = Sum5Horizontal16(s);
+  Sum5Horizontal32(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma[2],
+    __m128i b[6]) {
+  __m128i s5[2][5], sq5[5][2];
+  Square(s[2], sq + 4);
+  s5[0][3] = Sum5Horizontal16(s + 1);
+  s5[1][3] = Sum5Horizontal16(s + 2);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5Horizontal32(sq + 2, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+  Square(s[3], sq + 6);
+  Sum5Horizontal32(sq + 4, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s3[3], sq3[3][2];
+  Square(s[1], sq + 2);
+  s3[2] = Sum3Horizontal16(s);
+  StoreAligned16(sum3[2], s3[2]);
+  Sum3Horizontal32(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const __m128i s[4], const ptrdiff_t x, const ptrdiff_t sum_width,
+    const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[8], __m128i ma[2],
+    __m128i b[6]) {
+  __m128i s3[4], sq3[3][2], sum[2], index[2];
+  Square(s[2], sq + 4);
+  s3[2] = Sum3Horizontal16(s + 1);
+  s3[3] = Sum3Horizontal16(s + 2);
+  StoreAligned32U16(sum3[2] + x, s3 + 2);
+  Sum3Horizontal32(sq + 2, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+  LoadAligned16x2U16(sum3, x, s3);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  Square(s[3], sq + 6);
+  Sum3Horizontal32(sq + 4, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma, b + 2);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][2],
+    __m128i b3[2][6], __m128i* const ma5, __m128i b5[2]) {
+  __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  SumHorizontal16(s[0], &s3[2], &s5[3]);
+  SumHorizontal16(s[1], &s3[3], &s5[4]);
+  StoreAligned16(sum3[2], s3[2]);
+  StoreAligned16(sum3[3], s3[3]);
+  StoreAligned16(sum5[3], s5[3]);
+  StoreAligned16(sum5[4], s5[4]);
+  SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+  CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+  ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const __m128i s[2][4], const ptrdiff_t x, const uint16_t scales[2],
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, __m128i sq[2][8], __m128i ma3[2][2],
+    __m128i b3[2][6], __m128i ma5[2], __m128i b5[6]) {
+  __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+  SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+  StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+  StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+  SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+  StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+  StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+  StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+  Square(s[0][2], sq[0] + 4);
+  Square(s[1][2], sq[1] + 4);
+  SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+                        &index[1][0]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2);
+
+  Square(s[0][3], sq[0] + 6);
+  Square(s[1][3], sq[1] + 6);
+  SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+                        &index[1][1]);
+  CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2);
+  CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
+    __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
+  __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+  Square(s[1], sq + 2);
+  SumHorizontal16(s, &s3[2], &s5[3]);
+  SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma3[2],
+    __m128i ma5[2], __m128i b3[6], __m128i b5[6]) {
+  __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+  Square(s[2], sq + 4);
+  SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+  Square(s[3], sq + 6);
+  SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma3, b3 + 2);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+                                    const uint16_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[2][4], mas[2], sq[2][8], bs[6];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    __m128i ma5[3], ma[2], b[4];
+    s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+                                  overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+                                  overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+                                  overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+                                  overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned32U16(ma565, ma);
+    Sum565(bs + 0, b + 0);
+    Sum565(bs + 2, b + 2);
+    StoreAligned64U32(b565, b);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint16_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass2 - sizeof(*src) * width;
+  __m128i s[4], mas[2], sq[8], bs[6];
+  s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    s[2] = LoadUnaligned16Msan(src + x + 16,
+                               overread_in_bytes + sizeof(*src) * (x + 16));
+    s[3] = LoadUnaligned16Msan(src + x + 24,
+                               overread_in_bytes + sizeof(*src) * (x + 24));
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    __m128i ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      __m128i ma[2], b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned32U16(ma343, ma);
+      Sum343(bs + 0, b + 0);
+      Sum343(bs + 2, b + 2);
+      StoreAligned64U32(b343, b);
+    }
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint16_t* const src0, const uint16_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], b5);
+
+  int x = 0;
+  do {
+    __m128i ma[2], b[4], ma3x[3], ma5x[3];
+    s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+                                  overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+                                  overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+                                  overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+                                  overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    Sum343(b3[0] + 0, b + 0);
+    Sum343(b3[0] + 2, b + 2);
+    StoreAligned64U32(b343[0] + x, b);
+    Sum565(b5 + 0, b + 0);
+    Sum565(b5 + 2, b + 2);
+    StoreAligned64U32(b565, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned32U16(ma565, ma);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][4];
+    b3[0][1] = b3[0][5];
+    b3[1][0] = b3[1][4];
+    b3[1][1] = b3[1][5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m128i v = _mm_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+                                       const __m128i b[2]) {
+  const __m128i ma_x_src_lo = VmullLo16(ma, src);
+  const __m128i ma_x_src_hi = VmullHi16(ma, src);
+  const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+                                            const __m128i ma[2],
+                                            const __m128i b[2][2]) {
+  const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
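+  // Two rows of (5, 6, 5)-weighted sums contribute, for a total weight of
+  // 2 * (5 + 6 + 5) = 32, hence the shift by 5. The single-row case uses
+  // CalculateFilteredOutput<4>, since one row's weights sum to 16.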
+  __m128i b_sum[2];
+  b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+                                            const __m128i ma[3],
+                                            const __m128i b[3][2]) {
+  const __m128i ma_sum = Sum3_16(ma);
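+  // Three rows weighted (3, 4, 3), (4, 4, 4) and (3, 4, 3) sum to
+  // 10 + 12 + 10 = 32, hence the shift by 5.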
+  __m128i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+  const __m128i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+  return _mm_add_epi16(src, vv);
+}
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+                                          const __m128i filter[2], const int w0,
+                                          const int w2) {
+  __m128i v[2];
+  const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+  const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
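+  // The unpacks pair each 16-bit filter[0] value with its filter[1]
+  // counterpart, so _mm_madd_epi16() with |w0_w2| computes
+  // w0 * filter[0] + w2 * filter[1] per pixel.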
+  v[0] = _mm_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+                                          const __m128i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m128i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
+inline void ClipAndStore(uint16_t* const dst, const __m128i val) {
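+  // Clamp to the 10-bit pixel range [0, 1023] before storing.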
+  const __m128i val0 = _mm_max_epi16(val, _mm_setzero_si128());
+  const __m128i val1 = _mm_min_epi16(val0, _mm_set1_epi16(1023));
+  StoreAligned16(dst, val1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[2][4], mas[2], sq[2][8], bs[6];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    __m128i ma[2], ma5[3], b[2][2], p[2];
+    s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+                                  overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+                                  overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+                                  overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+                                  overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    StoreAligned16(ma565[1] + x, ma[1]);
+    Sum565(bs, b[1]);
+    StoreAligned32U32(b565[1] + x, b[1]);
+    const __m128i sr0_lo = LoadAligned16(src + x + 0);
+    const __m128i sr1_lo = LoadAligned16(src + stride + x + 0);
+    ma[0] = LoadAligned16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+    const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+    const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned16(ma565[1] + x + 8, ma[1]);
+    Sum565(bs + 2, b[1]);
+    StoreAligned32U32(b565[1] + x + 8, b[1]);
+    const __m128i sr0_hi = LoadAligned16(src + x + 8);
+    const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
+    ma[0] = LoadAligned16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+    const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+    ClipAndStore(dst + x + 0, d00);
+    ClipAndStore(dst + x + 8, d01);
+    const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+    ClipAndStore(dst + stride + x + 0, d10);
+    ClipAndStore(dst + stride + x + 8, d11);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[4], mas[2], sq[8], bs[6];
+  s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    __m128i ma[2], ma5[3], b[2][2];
+    s[2] = LoadUnaligned16Msan(src0 + x + 16,
+                               overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[3] = LoadUnaligned16Msan(src0 + x + 24,
+                               overread_in_bytes + sizeof(*src0) * (x + 24));
+    BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+                                sq, mas, bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    Sum565(bs, b[1]);
+    ma[0] = LoadAligned16(ma565);
+    LoadAligned32U32(b565, b[0]);
+    const __m128i sr_lo = LoadAligned16(src + x + 0);
+    __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+    ma[1] = Sum565Hi(ma5);
+    Sum565(bs + 2, b[1]);
+    ma[0] = LoadAligned16(ma565 + 8);
+    LoadAligned32U32(b565 + 8, b[0]);
+    const __m128i sr_hi = LoadAligned16(src + x + 8);
+    p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass2 - sizeof(*src0) * width;
+  __m128i s[4], mas[2], sq[8], bs[6];
+  s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    s[2] = LoadUnaligned16Msan(src0 + x + 16,
+                               overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[3] = LoadUnaligned16Msan(src0 + x + 24,
+                               overread_in_bytes + sizeof(*src0) * (x + 24));
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    __m128i ma[3], b[3][2], ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+                   b444[1]);
+    const __m128i sr_lo = LoadAligned16(src + x + 0);
+    ma[0] = LoadAligned16(ma343[0] + x);
+    ma[1] = LoadAligned16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[0]);
+    LoadAligned32U32(b444[0] + x, b[1]);
+    const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+    Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    const __m128i sr_hi = LoadAligned16(src + x + 8);
+    ma[0] = LoadAligned16(ma343[0] + x + 8);
+    ma[1] = LoadAligned16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1]);
+    const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], b5);
+
+  int x = 0;
+  do {
+    __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+    s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+                                  overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+                                  overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+                                  overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+                                  overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+    Prepare3_8<0>(ma3[0], ma3x[0]);
+    Prepare3_8<0>(ma3[1], ma3x[1]);
+    Prepare3_8<0>(ma5, ma5x);
+    Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+                   ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+                   b343[3], b444[2]);
+    ma[0][1] = Sum565Lo(ma5x);
+    StoreAligned16(ma565[1] + x, ma[0][1]);
+    Sum565(b5, b[0][1]);
+    StoreAligned32U32(b565[1] + x, b[0][1]);
+    const __m128i sr0_lo = LoadAligned16(src + x);
+    const __m128i sr1_lo = LoadAligned16(src + stride + x);
+    ma[0][0] = LoadAligned16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned16(ma343[0] + x);
+    ma[1][1] = LoadAligned16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[1][0]);
+    LoadAligned32U32(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ma[2][0] = LoadAligned16(ma343[1] + x);
+    LoadAligned32U32(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+    Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+                   b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3],
+                   ma444[2], b343[3], b444[2]);
+    ma[0][1] = Sum565Hi(ma5x);
+    StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+    Sum565(b5 + 2, b[0][1]);
+    StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+    const __m128i sr0_hi = LoadAligned16(src + x + 8);
+    const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
+    ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+    ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+    const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+    ClipAndStore(dst + x + 0, d00);
+    ClipAndStore(dst + x + 8, d01);
+    ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+    LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+    const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+    ClipAndStore(dst + stride + x + 0, d10);
+    ClipAndStore(dst + stride + x + 8, d11);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][4];
+    b3[0][1] = b3[0][5];
+    b3[1][0] = b3[1][4];
+    b3[1][1] = b3[1][5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[4], ma3[2], ma5[2], sq[8], b3[6], b5[6], ma[3], b[3][2];
+  s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+                               sq, &ma3[0], &ma5[0], b3, b5);
+
+  int x = 0;
+  do {
+    __m128i ma3x[3], ma5x[3], p[2];
+    s[2] = LoadUnaligned16Msan(src0 + x + 16,
+                               overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[3] = LoadUnaligned16Msan(src0 + x + 24,
+                               overread_in_bytes + sizeof(*src0) * (x + 24));
+    BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+                               square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+    Prepare3_8<0>(ma3, ma3x);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[1] = Sum565Lo(ma5x);
+    Sum565(b5, b[1]);
+    ma[2] = Sum343Lo(ma3x);
+    Sum343(b3, b[2]);
+    const __m128i sr_lo = LoadAligned16(src + x + 0);
+    ma[0] = LoadAligned16(ma565 + x);
+    LoadAligned32U32(b565 + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[0] = LoadAligned16(ma343 + x);
+    ma[1] = LoadAligned16(ma444 + x);
+    LoadAligned32U32(b343 + x, b[0]);
+    LoadAligned32U32(b444 + x, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+    ma[1] = Sum565Hi(ma5x);
+    Sum565(b5 + 2, b[1]);
+    ma[2] = Sum343Hi(ma3x);
+    Sum343(b3 + 2, b[2]);
+    const __m128i sr_hi = LoadAligned16(src + x + 8);
+    ma[0] = LoadAligned16(ma565 + x + 8);
+    LoadAligned32U32(b565 + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    ma[0] = LoadAligned16(ma343 + x + 8);
+    ma[1] = LoadAligned16(ma444 + x + 8);
+    LoadAligned32U32(b343 + x + 8, b[0]);
+    LoadAligned32U32(b444 + x + 8, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    ma3[0] = ma3[1];
+    ma5[0] = ma5[1];
+    b3[0] = b3[4];
+    b3[1] = b3[5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint16_t* src,
+    const ptrdiff_t stride, const uint16_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
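+  // The three weights sum to 1 << kSgrProjPrecisionBits.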
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
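+  // |sum3|/|sum5| are ring buffers of horizontal box sums, and the ma*/b*
+  // buffers hold intermediate data for the rows in flight; they are rotated
+  // by two rows per iteration of the main loop below.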
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
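+  // The frame is extended by repetition: the row above the first row reuses
+  // the top border sums.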
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
+                         b343, b444[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+                     b444[0], b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {  // i.e., height != 1.
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
+                          b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint16_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 16, up to 15 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_SSE4_1(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  auto* const dst = static_cast<uint16_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(WienerFilter)
+  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+  static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SelfGuidedFilter)
+  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+  static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+}  // namespace
+
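+// Entry point expected to be called by the generic DSP initialization once
+// SSE4.1 support has been detected at runtime.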
+void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/x86/loop_restoration_avx2.cc b/src/dsp/x86/loop_restoration_avx2.cc
new file mode 100644 (file)
index 0000000..30e8a22
--- /dev/null
@@ -0,0 +1,2947 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2], const __m256i s_3x128,
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit =
+      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+  const __m256i offsets = _mm256_set1_epi16(-offset);
+  const __m256i limits = _mm256_set1_epi16(limit - offset);
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
+  // The sum range here is [-128 * 255, 90 * 255].
+  const __m256i madd = _mm256_add_epi16(s[0], s[1]);
+  const __m256i sum = _mm256_add_epi16(madd, round);
+  const __m256i rounded_sum0 =
+      _mm256_srai_epi16(sum, kInterRoundBitsHorizontal);
+  // Add back scaled down offset correction.
+  const __m256i rounded_sum1 = _mm256_add_epi16(rounded_sum0, s_3x128);
+  const __m256i d0 = _mm256_max_epi16(rounded_sum1, offsets);
+  const __m256i d1 = _mm256_min_epi16(d0, limits);
+  StoreAligned32(wiener_buffer, d1);
+}
+
+// Using _mm256_alignr_epi8() is about 8% faster than loading everything and
+// unpacking, because the compiler generates redundant code for the
+// load-and-unpack approach.
+inline void WienerHorizontalTap7Kernel(const __m256i s[2],
+                                       const __m256i filter[4],
+                                       int16_t* const wiener_buffer) {
+  const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+  const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+  const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+  const auto s67 = _mm256_alignr_epi8(s[1], s[0], 13);
+  __m256i madds[4];
+  madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+  madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+  madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+  madds[3] = _mm256_maddubs_epi16(s67, filter[3]);
+  madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+  madds[1] = _mm256_add_epi16(madds[1], madds[3]);
+  const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s23, 8),
+                                            7 - kInterRoundBitsHorizontal);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[2],
+                                       const __m256i filter[3],
+                                       int16_t* const wiener_buffer) {
+  const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+  const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+  const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+  __m256i madds[3];
+  madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+  madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+  madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+  madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+  const __m256i s_3x128 = _mm256_srli_epi16(_mm256_slli_epi16(s23, 8),
+                                            kInterRoundBitsHorizontal + 1);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[2],
+                                       const __m256i filter[2],
+                                       int16_t* const wiener_buffer) {
+  const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+  const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+  __m256i madds[2];
+  madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+  madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+  const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s01, 8),
+                                            7 - kInterRoundBitsHorizontal);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i coefficients,
+                                 int16_t** const wiener_buffer) {
+  __m256i filter[4];
+  filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100));
+  filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+  filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102));
+  filter[3] = _mm256_shuffle_epi8(
+      coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8000)));
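+  // Control bytes with the high bit set (0x80) make _mm256_shuffle_epi8()
+  // write zero, so filter[3] pairs coefficient 0 with 0 for the unpaired
+  // seventh tap.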
+  for (int y = height; y != 0; --y) {
+    __m256i s = LoadUnaligned32(src);
+    __m256i ss[4];
+    ss[0] = _mm256_unpacklo_epi8(s, s);
+    ptrdiff_t x = 0;
+    do {
+      ss[1] = _mm256_unpackhi_epi8(s, s);
+      s = LoadUnaligned32(src + x + 32);
+      ss[3] = _mm256_unpacklo_epi8(s, s);
+      ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+      WienerHorizontalTap7Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+      WienerHorizontalTap7Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+      ss[0] = ss[3];
+      x += 32;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i coefficients,
+                                 int16_t** const wiener_buffer) {
+  __m256i filter[3];
+  filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201));
+  filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203));
+  filter[2] = _mm256_shuffle_epi8(
+      coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8001)));
+  for (int y = height; y != 0; --y) {
+    __m256i s = LoadUnaligned32(src);
+    __m256i ss[4];
+    ss[0] = _mm256_unpacklo_epi8(s, s);
+    ptrdiff_t x = 0;
+    do {
+      ss[1] = _mm256_unpackhi_epi8(s, s);
+      s = LoadUnaligned32(src + x + 32);
+      ss[3] = _mm256_unpacklo_epi8(s, s);
+      ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+      WienerHorizontalTap5Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+      WienerHorizontalTap5Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+      ss[0] = ss[3];
+      x += 32;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i coefficients,
+                                 int16_t** const wiener_buffer) {
+  __m256i filter[2];
+  filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+  filter[1] = _mm256_shuffle_epi8(
+      coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8002)));
+  for (int y = height; y != 0; --y) {
+    __m256i s = LoadUnaligned32(src);
+    __m256i ss[4];
+    ss[0] = _mm256_unpacklo_epi8(s, s);
+    ptrdiff_t x = 0;
+    do {
+      ss[1] = _mm256_unpackhi_epi8(s, s);
+      s = LoadUnaligned32(src + x + 32);
+      ss[3] = _mm256_unpacklo_epi8(s, s);
+      ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+      WienerHorizontalTap3Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+      WienerHorizontalTap3Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+      ss[0] = ss[3];
+      x += 32;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      const __m256i s = LoadUnaligned32(src + x);
+      const __m256i s0 = _mm256_unpacklo_epi8(s, _mm256_setzero_si256());
+      const __m256i s1 = _mm256_unpackhi_epi8(s, _mm256_setzero_si256());
+      __m256i d[2];
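+      // The only nonzero (center) tap is 1 << kWienerFilterBits, so filtering
+      // followed by the shift right by kInterRoundBitsHorizontal reduces to a
+      // left shift by kWienerFilterBits - kInterRoundBitsHorizontal = 4.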
+      d[0] = _mm256_slli_epi16(s0, 4);
+      d[1] = _mm256_slli_epi16(s1, 4);
+      StoreAligned64(*wiener_buffer + x, d);
+      x += 32;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline __m256i WienerVertical7(const __m256i a[2], const __m256i filter[2]) {
+  const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+  const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i sum0 = _mm256_add_epi32(round, madd0);
+  const __m256i sum1 = _mm256_add_epi32(sum0, madd1);
+  return _mm256_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[2], const __m256i filter[2]) {
+  const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i sum = _mm256_add_epi32(madd0, madd1);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a, const __m256i filter) {
+  const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+  const __m256i madd = _mm256_madd_epi16(a, filter);
+  const __m256i sum = _mm256_add_epi32(round, madd);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+                                     const __m256i filter[2]) {
+  __m256i b[2];
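+  // The 7-tap vertical filter is symmetric, so the mirrored rows are
+  // pre-added and only the four unique coefficients are multiplied.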
+  const __m256i a06 = _mm256_add_epi16(a[0], a[6]);
+  const __m256i a15 = _mm256_add_epi16(a[1], a[5]);
+  const __m256i a24 = _mm256_add_epi16(a[2], a[4]);
+  b[0] = _mm256_unpacklo_epi16(a06, a15);
+  b[1] = _mm256_unpacklo_epi16(a24, a[3]);
+  const __m256i sum0 = WienerVertical7(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a06, a15);
+  b[1] = _mm256_unpackhi_epi16(a24, a[3]);
+  const __m256i sum1 = WienerVertical7(b, filter);
+  return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+                                     const __m256i filter[2]) {
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m256i b[2];
+  const __m256i a04 = _mm256_add_epi16(a[0], a[4]);
+  const __m256i a13 = _mm256_add_epi16(a[1], a[3]);
+  b[0] = _mm256_unpacklo_epi16(a04, a13);
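+  // The rounding constant rides along with the center row: filter[1] holds
+  // (coefficient 2, 1) in each 32-bit lane, so the multiply-add in
+  // WienerVertical5() computes a[2] * c2 + round * 1.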
+  b[1] = _mm256_unpacklo_epi16(a[2], round);
+  const __m256i sum0 = WienerVertical5(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a04, a13);
+  b[1] = _mm256_unpackhi_epi16(a[2], round);
+  const __m256i sum1 = WienerVertical5(b, filter);
+  return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3], const __m256i filter) {
+  __m256i b;
+  const __m256i a02 = _mm256_add_epi16(a[0], a[2]);
+  b = _mm256_unpacklo_epi16(a02, a[1]);
+  const __m256i sum0 = WienerVertical3(b, filter);
+  b = _mm256_unpackhi_epi16(a02, a[1]);
+  const __m256i sum1 = WienerVertical3(b, filter);
+  return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[2], __m256i a[7]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+  a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+  a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+  return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[2], __m256i a[5]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+  return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter, __m256i a[3]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[2], __m256i d[2]) {
+  __m256i a[8];
+  d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+  d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[2], __m256i d[2]) {
+  __m256i a[6];
+  d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+  d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter, __m256i d[2]) {
+  __m256i a[4];
+  d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[4], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+  __m256i filter[2];
+  filter[0] = _mm256_shuffle_epi32(c, 0x0);
+  filter[1] = _mm256_shuffle_epi32(c, 0x55);
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2][2];
+      WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+      WienerVerticalTap7Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+      StoreUnaligned32(dst + dst_stride + x,
+                       _mm256_packus_epi16(d[0][1], d[1][1]));
+      x += 32;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[7];
+      const __m256i d0 =
+          WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+      const __m256i d1 =
+          WienerVerticalTap7Kernel(wiener_buffer + x + 16, width, filter, a);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+      x += 32;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[3], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i c = _mm256_broadcastd_epi32(Load4(coefficients));
+  __m256i filter[2];
+  filter[0] = _mm256_shuffle_epi32(c, 0);
+  filter[1] =
+      _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2][2];
+      WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+      WienerVerticalTap5Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+      StoreUnaligned32(dst + dst_stride + x,
+                       _mm256_packus_epi16(d[0][1], d[1][1]));
+      x += 32;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[5];
+      const __m256i d0 =
+          WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+      const __m256i d1 =
+          WienerVerticalTap5Kernel(wiener_buffer + x + 16, width, filter, a);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+      x += 32;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[2], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i filter =
+      _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2][2];
+      WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+      WienerVerticalTap3Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+      StoreUnaligned32(dst + dst_stride + x,
+                       _mm256_packus_epi16(d[0][1], d[1][1]));
+      x += 32;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[3];
+      const __m256i d0 =
+          WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+      const __m256i d1 =
+          WienerVerticalTap3Kernel(wiener_buffer + x + 16, width, filter, a);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+      x += 32;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint8_t* const dst) {
+  const __m256i a0 = LoadAligned32(wiener_buffer + 0);
+  const __m256i a1 = LoadAligned32(wiener_buffer + 16);
+  const __m256i b0 = _mm256_add_epi16(a0, _mm256_set1_epi16(8));
+  const __m256i b1 = _mm256_add_epi16(a1, _mm256_set1_epi16(8));
+  const __m256i c0 = _mm256_srai_epi16(b0, 4);
+  const __m256i c1 = _mm256_srai_epi16(b1, 4);
+  const __m256i d = _mm256_packus_epi16(c0, c1);
+  StoreUnaligned32(dst, d);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint8_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+      x += 32;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      x += 32;
+    } while (x < width);
+  }
+}
+
+void WienerFilter_AVX2(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 32);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // Horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* const top = static_cast<const uint8_t*>(top_border);
+  const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+  const __m128i c =
+      LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+  // To keep the horizontal pass intermediate values within 16 bits, we offset
+  // |filter[3]| by 128. The 128 offset will be added back in the loop.
+  __m128i c_horizontal =
+      _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
+  c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal);
+  const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal);
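+  // The scaled-down offset is restored in the tap kernels: |s_3x128| holds the
+  // center sample times 128 >> kInterRoundBitsHorizontal, which
+  // WienerHorizontalClip() adds back after the rounding shift.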
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // Vertical filtering.
+  // Over-writes up to 15 values.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical + 1, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical + 2, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// SIMD over-reads (the number of bytes in a SIMD register) - (width % 16) -
+// 2 * padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. A SIMD
+// register holds 16 bytes for SSE4.1 and 32 bytes for AVX2.
+constexpr int kOverreadInBytesPass1_128 = 10;
+constexpr int kOverreadInBytesPass2_128 = 12;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
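+// For example, Pass 1 with 128-bit registers reads 16 - 2 * 3 = 10 bytes
+// beyond the last valid pixel; the 256-bit variants add 16 for the wider
+// registers.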
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m256i dst[2]) {
+  dst[0] = LoadAligned32(src[0] + x);
+  dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[2]) {
+  dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+  dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+  dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m256i dst[3]) {
+  dst[0] = LoadAligned32(src[0] + x);
+  dst[1] = LoadAligned32(src[1] + x);
+  dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[3]) {
+  dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+  dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+  dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+  dst[0] = LoadAligned16(src + 0);
+  dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m256i dst[2][2]) {
+  LoadAligned64(src[0] + x, dst[0]);
+  LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[2][2]) {
+  LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+  LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+  LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m256i dst[3][2]) {
+  LoadAligned64(src[0] + x, dst[0]);
+  LoadAligned64(src[1] + x, dst[1]);
+  LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[3][2]) {
+  LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+  LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+  LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 4, src[1]);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers generate very inefficient code for them, which
+// can make the whole decoder 15% slower.
+
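+// The helpers below mirror the Arm NEON intrinsics: Vaddl* zero-extends both
+// operands before adding ("add long"), Vaddw* zero-extends only the second
+// operand ("add wide"), Vmull* does a widening multiply, and Vrshr* is a
+// rounding shift right. Lo/Hi select the low/high unpacked half of each
+// 128-bit lane.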
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlLo16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+  return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlHi16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+  return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwLo16(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+  return _mm256_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+  return _mm256_add_epi32(src0, s1);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+  const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+  const __m256i sum =
+      _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+  return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+  const __m256i sum =
+      _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+  return _mm256_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+  const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+  return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareLo8(const __m256i src) {
+  const __m256i s = _mm256_unpacklo_epi8(src, _mm256_setzero_si256());
+  return _mm256_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+  const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+  return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareHi8(const __m256i src) {
+  const __m256i s = _mm256_unpackhi_epi8(src, _mm256_setzero_si256());
+  return _mm256_mullo_epi16(s, s);
+}
+
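+// The Prepare* helpers produce the shifted windows of a source row needed for
+// sliding 3- and 5-tap sums: dst[k] is the source advanced by k elements.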
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+  dst[0] = src;
+  dst[1] = _mm_srli_si128(src, 1);
+  dst[2] = _mm_srli_si128(src, 2);
+}
+
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+  dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+  dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+  dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_16(const __m256i src[2], __m256i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm256_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm256_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+  dst[0] = src;
+  dst[1] = _mm_srli_si128(src, 1);
+  dst[2] = _mm_srli_si128(src, 2);
+  dst[3] = _mm_srli_si128(src, 3);
+  dst[4] = _mm_srli_si128(src, 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m256i src[2], __m256i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm256_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi16(src0, src1);
+  return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+                       const __m256i src2) {
+  const __m256i sum = _mm256_add_epi16(src0, src1);
+  return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi32(src0, src1);
+  return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+                       const __m256i src2) {
+  const __m256i sum = _mm256_add_epi32(src0, src1);
+  return _mm256_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+  const __m128i sum = VaddlLo8(src[0], src[1]);
+  return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+  const __m256i sum = VaddlLo8(src[0], src[1]);
+  return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+  const __m256i sum = VaddlHi8(src[0], src[1]);
+  return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+  const __m128i sum = VaddlLo16(src[0], src[1]);
+  return VaddwLo16(sum, src[2]);
+}
+
+inline __m256i Sum3WLo32(const __m256i src[3]) {
+  const __m256i sum = VaddlLo16(src[0], src[1]);
+  return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+  const __m128i sum = VaddlHi16(src[0], src[1]);
+  return VaddwHi16(sum, src[2]);
+}
+
+inline __m256i Sum3WHi32(const __m256i src[3]) {
+  const __m256i sum = VaddlHi16(src[0], src[1]);
+  return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+  const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+  const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+  const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+  const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+  const __m256i sum = _mm256_add_epi16(sum01, sum23);
+  return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+                       const __m128i* const src2, const __m128i* const src3,
+                       const __m128i* const src4) {
+  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+  const __m128i sum = _mm_add_epi32(sum01, sum23);
+  return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+                       const __m256i* const src2, const __m256i* const src3,
+                       const __m256i* const src4) {
+  const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+  const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+  const __m256i sum = _mm256_add_epi32(sum01, sum23);
+  return _mm256_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+  const __m128i sum01 = VaddlLo8(src[0], src[1]);
+  const __m128i sum23 = VaddlLo8(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WLo16(const __m256i src[5]) {
+  const __m256i sum01 = VaddlLo8(src[0], src[1]);
+  const __m256i sum23 = VaddlLo8(src[2], src[3]);
+  const __m256i sum = _mm256_add_epi16(sum01, sum23);
+  return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WHi16(const __m256i src[5]) {
+  const __m256i sum01 = VaddlHi8(src[0], src[1]);
+  const __m256i sum23 = VaddlHi8(src[2], src[3]);
+  const __m256i sum = _mm256_add_epi16(sum01, sum23);
+  return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+  __m128i s[3];
+  Prepare3Lo8(src, s);
+  return Sum3WLo16(s);
+}
+
+inline void Sum3Horizontal(const uint8_t* const src,
+                           const ptrdiff_t over_read_in_bytes, __m256i dst[2]) {
+  __m256i s[3];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+  dst[0] = Sum3WLo16(s);
+  dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum3WLo32(s);
+  dst[1] = Sum3WHi32(s);
+}
+
+inline void Sum3WHorizontal(const __m256i src[2], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum3WLo32(s);
+  dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+  __m128i s[5];
+  Prepare5Lo8(src, s);
+  return Sum5WLo16(s);
+}
+
+inline void Sum5Horizontal(const uint8_t* const src,
+                           const ptrdiff_t over_read_in_bytes,
+                           __m256i* const dst0, __m256i* const dst1) {
+  __m256i s[5];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+  s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+  s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+  *dst0 = Sum5WLo16(s);
+  *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+  const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+  const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+  dst[0] = VaddwLo16(sum0123_lo, s[4]);
+  const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+  const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+  const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+  dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+inline void Sum5WHorizontal(const __m256i src[2], __m256i dst[2]) {
+  __m256i s[5];
+  Prepare5_16(src, s);
+  const __m256i sum01_lo = VaddlLo16(s[0], s[1]);
+  const __m256i sum23_lo = VaddlLo16(s[2], s[3]);
+  const __m256i sum0123_lo = _mm256_add_epi32(sum01_lo, sum23_lo);
+  dst[0] = VaddwLo16(sum0123_lo, s[4]);
+  const __m256i sum01_hi = VaddlHi16(s[0], s[1]);
+  const __m256i sum23_hi = VaddlHi16(s[2], s[3]);
+  const __m256i sum0123_hi = _mm256_add_epi32(sum01_hi, sum23_hi);
+  dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
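+// The 5-element horizontal sum reuses the 3-element sum of the middle
+// samples: row5 = (s[0] + s[4]) + row3.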
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = VaddlLo16(src[0], src[4]);
+  *row_sq3 = Sum3WLo32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m256i src[5], __m256i* const row_sq3,
+                     __m256i* const row_sq5) {
+  const __m256i sum04 = VaddlLo16(src[0], src[4]);
+  *row_sq3 = Sum3WLo32(src + 1);
+  *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = VaddlHi16(src[0], src[4]);
+  *row_sq3 = Sum3WHi32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m256i src[5], __m256i* const row_sq3,
+                     __m256i* const row_sq5) {
+  const __m256i sum04 = VaddlHi16(src[0], src[4]);
+  *row_sq3 = Sum3WHi32(src + 1);
+  *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+                     __m128i* const row5) {
+  __m128i s[5];
+  Prepare5Lo8(src, s);
+  const __m128i sum04 = VaddlLo8(s[0], s[4]);
+  *row3 = Sum3WLo16(s + 1);
+  *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal(const uint8_t* const src,
+                          const ptrdiff_t over_read_in_bytes,
+                          __m256i* const row3_0, __m256i* const row3_1,
+                          __m256i* const row5_0, __m256i* const row5_1) {
+  __m256i s[5];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+  s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+  s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+  const __m256i sum04_lo = VaddlLo8(s[0], s[4]);
+  const __m256i sum04_hi = VaddlHi8(s[0], s[4]);
+  *row3_0 = Sum3WLo16(s + 1);
+  *row3_1 = Sum3WHi16(s + 1);
+  *row5_0 = _mm256_add_epi16(sum04_lo, *row3_0);
+  *row5_1 = _mm256_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+                          __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+                          __m128i* const row_sq5_1) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+  SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal(const __m256i src[2], __m256i* const row_sq3_0,
+                          __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+                          __m256i* const row_sq5_1) {
+  __m256i s[5];
+  Prepare5_16(src, s);
+  SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+  SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
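+// Note: the SumHorizontal*() helpers compute the 3-tap and 5-tap box sums
+// together so the middle taps are shared: row5 = row3 + src[0] + src[4].
+// This saves two additions per output versus computing the two sums
+// independently.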
+
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+  const __m256i sum = Sum3WLo16(ma3);
+  const __m256i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+  const __m256i sum = Sum3WHi16(ma3);
+  const __m256i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343WLo(const __m256i src[3]) {
+  const __m256i sum = Sum3WLo32(src);
+  const __m256i sum3 = Sum3_32(sum, sum, sum);
+  return VaddwLo16(sum3, src[1]);
+}
+
+inline __m256i Sum343WHi(const __m256i src[3]) {
+  const __m256i sum = Sum3WHi32(src);
+  const __m256i sum3 = Sum3_32(sum, sum, sum);
+  return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m256i src[2], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum343WLo(s);
+  dst[1] = Sum343WHi(s);
+}
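+// Note: the "343" in the names refers to the filter weights. For inputs
+// a, b, c from Prepare3, Sum343*() returns 3 * (a + b + c) + b, i.e.
+// 3a + 4b + 3c, computed as a tripled 3-tap sum plus the center tap.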
+
+inline __m256i Sum565Lo(const __m256i src[3]) {
+  const __m256i sum = Sum3WLo16(src);
+  const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+  const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+  const __m256i sum = Sum3WHi16(src);
+  const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+  const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565WLo(const __m256i src[3]) {
+  const __m256i sum = Sum3WLo32(src);
+  const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+  const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+  return VaddwLo16(sum5, src[1]);
+}
+
+inline __m256i Sum565WHi(const __m256i src[3]) {
+  const __m256i sum = Sum3WHi32(src);
+  const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+  const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+  return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m256i src[2], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum565WLo(s);
+  dst[1] = Sum565WHi(s);
+}
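+// Note: likewise "565" refers to the weights 5a + 6b + 5c, computed as
+// 5 * (a + b + c) + b with the multiplication by 5 strength-reduced to
+// (sum << 2) + sum.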
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  int y = 2;
+  do {
+    const __m128i s0 =
+        LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width);
+    __m128i sq_128[2], s3, s5, sq3[2], sq5[2];
+    __m256i sq[3];
+    sq_128[0] = SquareLo8(s0);
+    sq_128[1] = SquareHi8(s0);
+    SumHorizontalLo(s0, &s3, &s5);
+    StoreAligned16(sum3, s3);
+    StoreAligned16(sum5, s5);
+    SumHorizontal(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+    StoreAligned32U32(square_sum3, sq3);
+    StoreAligned32U32(square_sum5, sq5);
+    src += 8;
+    sum3 += 8;
+    sum5 += 8;
+    square_sum3 += 8;
+    square_sum5 += 8;
+    sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m256i row3[2], row5[2], row_sq3[2], row_sq5[2];
+      const __m256i s = LoadUnaligned32Msan(
+          src + 8, sum_width - x + 16 + kOverreadInBytesPass1_256 - width);
+      sq[1] = SquareLo8(s);
+      sq[2] = SquareHi8(s);
+      sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+      SumHorizontal(src, sum_width - x + 8 + kOverreadInBytesPass1_256 - width,
+                    &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned64(sum3, row3);
+      StoreAligned64(sum5, row5);
+      SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+      StoreAligned64(square_sum3 + 0, row_sq3);
+      StoreAligned64(square_sum5 + 0, row_sq5);
+      SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+      StoreAligned64(square_sum3 + 16, row_sq3);
+      StoreAligned64(square_sum5 + 16, row_sq5);
+      sq[0] = sq[2];
+      src += 32;
+      sum3 += 32;
+      sum5 += 32;
+      square_sum3 += 32;
+      square_sum5 += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sum3 += sum_stride - sum_width - 8;
+    sum5 += sum_stride - sum_width - 8;
+    square_sum3 += sum_stride - sum_width - 8;
+    square_sum5 += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
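+// Note: BoxSum() covers two rows per call (y == 2). Each row handles its
+// first 8 pixels with 128-bit loads, then a 256-bit main loop produces 32
+// outputs per iteration; sq[0] carries the squares of the previous
+// iteration so the horizontal windows can straddle iteration boundaries.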
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  int kOverreadInBytes_128, kOverreadInBytes_256;
+  if (size == 3) {
+    kOverreadInBytes_128 = kOverreadInBytesPass2_128;
+    kOverreadInBytes_256 = kOverreadInBytesPass2_256;
+  } else {
+    kOverreadInBytes_128 = kOverreadInBytesPass1_128;
+    kOverreadInBytes_256 = kOverreadInBytesPass1_256;
+  }
+  int y = 2;
+  do {
+    const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytes_128 - width);
+    __m128i ss, sq_128[2], sqs[2];
+    __m256i sq[3];
+    sq_128[0] = SquareLo8(s);
+    sq_128[1] = SquareHi8(s);
+    if (size == 3) {
+      ss = Sum3Horizontal(s);
+      Sum3WHorizontal(sq_128, sqs);
+    } else {
+      ss = Sum5Horizontal(s);
+      Sum5WHorizontal(sq_128, sqs);
+    }
+    StoreAligned16(sums, ss);
+    StoreAligned32U32(square_sums, sqs);
+    src += 8;
+    sums += 8;
+    square_sums += 8;
+    sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m256i row[2], row_sq[4];
+      const __m256i s = LoadUnaligned32Msan(
+          src + 8, sum_width - x + 16 + kOverreadInBytes_256 - width);
+      sq[1] = SquareLo8(s);
+      sq[2] = SquareHi8(s);
+      sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+      if (size == 3) {
+        Sum3Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+                       row);
+        Sum3WHorizontal(sq + 0, row_sq + 0);
+        Sum3WHorizontal(sq + 1, row_sq + 2);
+      } else {
+        Sum5Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+                       &row[0], &row[1]);
+        Sum5WHorizontal(sq + 0, row_sq + 0);
+        Sum5WHorizontal(sq + 1, row_sq + 2);
+      }
+      StoreAligned64(sums, row);
+      StoreAligned64(square_sums + 0, row_sq + 0);
+      StoreAligned64(square_sums + 16, row_sq + 2);
+      sq[0] = sq[2];
+      src += 32;
+      sums += 32;
+      square_sums += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sums += sum_stride - sum_width - 8;
+    square_sums += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
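+// Note: the shifts in CalculateMa() are a strength-reduced multiply:
+//   n == 9:  sum_sq + (sum_sq << 3)                  == 9 * sum_sq
+//   n == 25: sum_sq + (sum_sq << 3) + (sum_sq << 4)  == 25 * sum_sq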
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m256i dxd = _mm256_madd_epi16(sum, sum);
+  // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+  __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+  const __m256i sub = _mm256_sub_epi32(axn, dxd);
+  const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+  const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m256i sum_lo = _mm256_unpacklo_epi16(sum, _mm256_setzero_si256());
+  const __m256i sum_hi = _mm256_unpackhi_epi16(sum, _mm256_setzero_si256());
+  const __m256i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+  const __m256i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+  return _mm256_packus_epi32(z0, z1);
+}
+
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+  return _mm_packus_epi32(b_lo, b_hi);
+}
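+// Note: with kSgrProjReciprocalBits == 12, one_over_n is
+// (4096 + 12) / 25 == 164 and one_over_n_quarter is 41. Multiplying by 41
+// keeps the _mm_maddubs_epi16() products within 16 bits (255 * 41 == 10455),
+// and the rounding shift by 12 - 2 == 10 instead of 12 exactly compensates
+// for the missing factor of 4.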
+
+inline __m256i CalculateB5(const __m256i sum, const __m256i ma) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m256i m =
+      _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+  const __m256i m0 = VmullLo16(m, sum);
+  const __m256i m1 = VmullHi16(m, sum);
+  const __m256i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  const __m256i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+  return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+  const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+  return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m256i CalculateB3(const __m256i sum, const __m256i ma) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m256i m0 = VmullLo16(ma, sum);
+  const __m256i m1 = VmullHi16(ma, sum);
+  const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+  const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+  const __m256i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+  const __m256i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+  return _mm256_packus_epi32(b_lo, b_hi);
+}
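+// Note: for CalculateB3(), one_over_n is (4096 + 4) / 9 == 455, which does
+// not fit in the signed 8-bit multiplier _mm_maddubs_epi16() requires, so
+// plain 32-bit multiplies are used: b = (455 * ma * sum + 2048) >> 12.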
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i* const b) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // The values are not actually stored to memory and reloaded; the compiler
+  // keeps |temp| in a 64-bit general-purpose register, which is faster than
+  // using _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
+}
+
+// The first 48 elements of kSgrMaLookup, with each group of 16 repeated so
+// that both 128-bit lanes of a __m256i hold the same 16 entries. This is
+// needed because _mm256_shuffle_epi8() shuffles only within 128-bit lanes.
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5};
+
+// Set the shuffle control mask of indices out of range [0, 15] to
+// (1xxxxxxx)b so that the shuffle result is 0. The most significant bit 1
+// comes either from the comparison instruction, or from the sign bit of the
+// index.
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+  __m256i mask;
+  mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+  mask = _mm256_or_si256(mask, index);
+  return _mm256_shuffle_epi8(table, mask);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+                           const int threshold) {
+  const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+  const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+  return _mm256_add_epi8(value, offset);
+}
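+// Note: _mm256_cmpgt_epi8() yields 0xFF (i.e. -1) in lanes where index >
+// threshold, so AdjustValue() decrements |value| by 1 exactly in those
+// lanes.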
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+                                  __m256i ma[3], __m256i b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+  const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+  const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+  const __m256i indices = _mm256_packus_epi16(index[0], index[1]);
+  __m256i idx, mas;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+  // Elements whose indices are 48 or larger get 0 from the shuffles below;
+  // they are fixed up arithmetically afterwards.
+  // Get shuffle results for indices in range [0, 15].
+  mas = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res1 = ShuffleIndex(c1, idx);
+  // Use OR instructions to combine the shuffle results.
+  mas = _mm256_or_si256(mas, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res2 = ShuffleIndex(c2, idx);
+  mas = _mm256_or_si256(mas, res2);
+
+  // For elements whose indices are larger than 47, the table values change
+  // only rarely as the index increases, so we use comparison and arithmetic
+  // operations to calculate them.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+  mas = AdjustValue(mas, idx, 55);   // 55 is the last index whose value is 5.
+  mas = AdjustValue(mas, idx, 72);   // 72 is the last index whose value is 4.
+  mas = AdjustValue(mas, idx, 101);  // 101 is the last index whose value is 3.
+  mas = AdjustValue(mas, idx, 169);  // 169 is the last index whose value is 2.
+  mas = AdjustValue(mas, idx, 254);  // 254 is the last index whose value is 1.
+
+  ma[2] = _mm256_permute4x64_epi64(mas, 0x93);     // 32-39 8-15 16-23 24-31
+  ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc);  //  0-7  8-15 16-23 24-31
+  ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+  const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+  if (n == 9) {
+    b[0] = CalculateB3(sum[0], maq0);
+    b[1] = CalculateB3(sum[1], maq1);
+  } else {
+    b[0] = CalculateB5(sum[0], maq0);
+    b[1] = CalculateB5(sum[1], maq1);
+  }
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m256i b3[2], const ptrdiff_t x,
+                         __m256i sum_b343[2], __m256i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m256i b[3], sum_b111[2];
+  Prepare3_16(b3, b);
+  sum_b111[0] = Sum3WLo32(b);
+  sum_b111[1] = Sum3WHi32(b);
+  sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+  sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+  StoreAligned64(b444 + x, sum_b444);
+  sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+  sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+  StoreAligned64(b343 + x, sum_b343);
+}
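+// Note: Store343_444() derives both weighted sums from a single 3-tap sum:
+// sum_b444 = 4 * (a + b + c) gives the (4, 4, 4) weights, and
+// sum_b343 = sum_b444 - (a + b + c) + b gives the (3, 4, 3) weights.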
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+                           __m256i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m256i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+  StoreAligned32(ma444 + x, *sum_ma444);
+  const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned32(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+                           __m256i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m256i sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+  StoreAligned32(ma444 + x, *sum_ma444);
+  const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  StoreAligned32(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma444, sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma444, sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma343, sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma343, sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const __m128i s[2][3], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s5[2][5], sq5[5][2];
+  sq[0][1] = SquareHi8(s[0][0]);
+  sq[1][1] = SquareHi8(s[1][0]);
+  s5[0][3] = Sum5Horizontal(s[0][0]);
+  StoreAligned16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal(s[1][0]);
+  StoreAligned16(sum5[4], s5[0][4]);
+  Sum5WHorizontal(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5WHorizontal(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint8_t* const src0, const uint8_t* const src1,
+    const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+    const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m256i sq[2][3], __m256i ma[3],
+    __m256i b[3]) {
+  const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+  const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+  __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+  sq[0][1] = SquareLo8(s0);
+  sq[0][2] = SquareHi8(s0);
+  sq[1][1] = SquareLo8(s1);
+  sq[1][2] = SquareHi8(s1);
+  sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+  sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+  Sum5Horizontal(src0, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+  Sum5Horizontal(src1, over_read_in_bytes, &s5[0][4], &s5[1][4]);
+  StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+  StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+  Sum5WHorizontal(sq[0], sq5[3]);
+  StoreAligned64(square_sum5[3] + x, sq5[3]);
+  Sum5WHorizontal(sq[1], sq5[4]);
+  StoreAligned64(square_sum5[4] + x, sq5[4]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+  Sum5WHorizontal(sq[0] + 1, sq5[3]);
+  StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+  Sum5WHorizontal(sq[1] + 1, sq5[4]);
+  StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+  CalculateIntermediate<25>(sum, index, ma, b + 1);
+  b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
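+// Note: BoxFilterPreProcess5() is software-pipelined: sq[.][0], ma[0] and
+// b[0] arrive holding values carried over from the previous 32 pixels, and
+// the _mm256_permute2x128_si256(..., 0x21) calls splice the old high
+// 128-bit lane onto the new low lane so that windowed sums can cross the
+// 32-pixel block boundary.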
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s5[5], sq5[5][2];
+  sq[1] = SquareHi8(s);
+  s5[3] = s5[4] = Sum5Horizontal(s);
+  Sum5WHorizontal(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+    const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+    __m256i sq[3], __m256i ma[3], __m256i b[3]) {
+  const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+  __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+  sq[1] = SquareLo8(s);
+  sq[2] = SquareHi8(s);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  Sum5Horizontal(src, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5WHorizontal(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+  Sum5WHorizontal(sq + 1, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+  CalculateIntermediate<25>(sum, index, ma, b + 1);
+  b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s3[3], sq3[3][2];
+  sq[1] = SquareHi8(s);
+  s3[2] = Sum3Horizontal(s);
+  StoreAligned16(sum3[2], s3[2]);
+  Sum3WHorizontal(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[3],
+    __m256i ma[3], __m256i b[3]) {
+  const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+  __m256i s3[4], sq3[3][2], sum[2], index[2];
+  sq[1] = SquareLo8(s);
+  sq[2] = SquareHi8(s);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  Sum3Horizontal(src, over_read_in_bytes, s3 + 2);
+  StoreAligned64(sum3[2] + x, s3 + 2);
+  Sum3WHorizontal(sq + 0, sq3[2]);
+  StoreAligned64(square_sum3[2] + x, sq3[2]);
+  LoadAligned32x2U16(sum3, x, s3);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  Sum3WHorizontal(sq + 1, sq3[2]);
+  StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate<9>(sum, index, ma, b + 1);
+  b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const __m128i s[2], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i ma3[2],
+    __m128i b3[2], __m128i* const ma5, __m128i* const b5) {
+  __m128i s3[4], s5[5], sq3[4][2], sq5[5][2];
+  sq[0][1] = SquareHi8(s[0]);
+  sq[1][1] = SquareHi8(s[1]);
+  SumHorizontalLo(s[0], &s3[2], &s5[3]);
+  SumHorizontalLo(s[1], &s3[3], &s5[4]);
+  StoreAligned16(sum3[2], s3[2]);
+  StoreAligned16(sum3[3], s3[3]);
+  StoreAligned16(sum5[3], s5[3]);
+  StoreAligned16(sum5[4], s5[4]);
+  SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  // Note: in the SSE4_1 version, CalculateIntermediate() is called
+  // to replace the slow LookupIntermediate() when calculating 16 intermediate
+  // data points. However, for AVX2 the compiler generates even slower code
+  // for that approach, so we keep using CalculateIntermediate3().
+  CalculateIntermediate3(s3 + 0, sq3 + 0, scales[1], &ma3[0], &b3[0]);
+  CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], &ma3[1], &b3[1]);
+  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1,
+    const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, __m256i sq[2][3], __m256i ma3[2][3],
+    __m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) {
+  const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+  const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+  __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2], index_3[2][2],
+      sum_5[2], index_5[2];
+  sq[0][1] = SquareLo8(s0);
+  sq[0][2] = SquareHi8(s0);
+  sq[1][1] = SquareLo8(s1);
+  sq[1][2] = SquareHi8(s1);
+  sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+  sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+  SumHorizontal(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+                &s5[1][3]);
+  SumHorizontal(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+                &s5[1][4]);
+  StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+  StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+  StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+  StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+  StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+  StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+  SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned64(square_sum3[2] + x, sq3[2]);
+  StoreAligned64(square_sum5[3] + x, sq5[3]);
+  StoreAligned64(square_sum3[3] + x, sq3[3]);
+  StoreAligned64(square_sum5[4] + x, sq5[4]);
+  LoadAligned32x2U16(sum3, x, s3[0]);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+                        &index_3[1][0]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+  SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+  StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+  StoreAligned64(square_sum3[3] + x + 16, sq3[3]);
+  StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1],
+                        &index_3[1][1]);
+  CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1);
+  CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+  CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+  b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21);
+  b3[1][0] = _mm256_permute2x128_si256(b3[1][0], b3[1][2], 0x21);
+  b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+    __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+  __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+  sq[1] = SquareHi8(s);
+  SumHorizontalLo(s, &s3[2], &s5[3]);
+  SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+    __m256i b5[5]) {
+  const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+  __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2],
+      sum_5[2], index_5[2];
+  sq[1] = SquareLo8(s0);
+  sq[2] = SquareHi8(s0);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  SumHorizontal(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+                &s5[1][3]);
+  SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned32x2U16(sum3, x, s3[0]);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+  SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]);
+  CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+  CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+  b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21);
+  b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+                                    const uint8_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  __m128i ma0, b0, s[2][3], sq_128[2][2];
+  __m256i mas[3], sq[2][3], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0][0]);
+  sq_128[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma5[3], ma[2], b[4];
+    BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+                         x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+                         x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned64(ma565, ma);
+    Sum565W(bs + 0, b + 0);
+    Sum565W(bs + 1, b + 2);
+    StoreAligned64(b565, b + 0);
+    StoreAligned64(b565 + 16, b + 2);
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint8_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width);
+  __m128i ma0, sq_128[2], b0;
+  __m256i mas[3], sq[3], bs[3];
+  sq_128[0] = SquareLo8(s);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma3[3];
+    BoxFilterPreProcess3(src + x + 8, x + 8 + kOverreadInBytesPass2_256 - width,
+                         x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    Prepare3_8(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 1, 16, ma343, ma444, b343, b444);
+      ma444 += 32;
+      b444 += 32;
+    } else {
+      __m256i ma[2], b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned64(ma343, ma);
+      Sum343W(bs + 0, b + 0);
+      Sum343W(bs + 1, b + 2);
+      StoreAligned64(b343 + 0, b + 0);
+      StoreAligned64(b343 + 16, b + 2);
+    }
+    sq[0] = sq[2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    ma343 += 32;
+    b343 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+  __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0]);
+  sq_128[1][0] = SquareLo8(s[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+                        ma3_128, b3_128, &ma5_0, &b5_0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+  ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+  b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+  b5[0] = SetrM128i(b5_0, b5_0);
+
+  int x = 0;
+  do {
+    __m256i ma[2], b[4], ma3x[3], ma5x[3];
+    BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+                        x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+                        scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+                        sq, ma3, b3, ma5, b5);
+    Prepare3_8(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned64(ma343[0] + x, ma);
+    Sum343W(b3[0], b);
+    StoreAligned64(b343[0] + x, b);
+    Sum565W(b5, b);
+    StoreAligned64(b565, b);
+    Prepare3_8(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444, b343[1], b444);
+    Prepare3_8(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned64(ma565, ma);
+    Sum343W(b3[0] + 1, b);
+    StoreAligned64(b343[0] + x + 16, b);
+    Sum565W(b5 + 1, b);
+    StoreAligned64(b565 + 16, b);
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+                                       const __m256i b[2]) {
+  const __m256i ma_x_src_lo = VmullLo16(ma, src);
+  const __m256i ma_x_src_hi = VmullHi16(ma, src);
+  const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm256_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+                                            const __m256i ma[2],
+                                            const __m256i b[2][2]) {
+  const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+  __m256i b_sum[2];
+  b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+                                            const __m256i ma[3],
+                                            const __m256i b[3][2]) {
+  const __m256i ma_sum = Sum3_16(ma);
+  __m256i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+  const __m256i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+  return _mm256_add_epi16(src, vv);
+}
+
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+                                          const __m256i filter[2], const int w0,
+                                          const int w2) {
+  __m256i v[2];
+  const __m256i w0_w2 =
+      _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+  const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+  v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
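+// Note: the weights are packed as 16-bit pairs (w0, w2) in every 32-bit
+// lane and the two filter outputs are interleaved per pixel, so each
+// _mm256_madd_epi16() lane computes w0 * filter[0] + w2 * filter[1] in a
+// single instruction.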
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+                                          const __m256i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m256i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i ma0, b0, s[2][3], sq_128[2][2];
+  __m256i mas[3], sq[2][3], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0][0]);
+  sq_128[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma[3], ma5[3], b[2][2][2];
+    BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+                         x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+                         x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    ma[2] = Sum565Hi(ma5);
+    StoreAligned64(ma565[1] + x, ma + 1);
+    Sum565W(bs + 0, b[0][1]);
+    Sum565W(bs + 1, b[1][1]);
+    StoreAligned64(b565[1] + x + 0, b[0][1]);
+    StoreAligned64(b565[1] + x + 16, b[1][1]);
+    const __m256i sr0 = LoadUnaligned32(src + x);
+    const __m256i sr1 = LoadUnaligned32(src + stride + x);
+    const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0][0]);
+    const __m256i p00 = CalculateFilteredOutputPass1(sr0_lo, ma, b[0]);
+    const __m256i p01 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[0][1]);
+    const __m256i d00 = SelfGuidedSingleMultiplier(sr0_lo, p00, w0);
+    const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p01, w0);
+    const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+    ma[1] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, b[1][0]);
+    const __m256i p10 = CalculateFilteredOutputPass1(sr0_hi, ma + 1, b[1]);
+    const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[2], b[1][1]);
+    const __m256i d01 = SelfGuidedSingleMultiplier(sr0_hi, p10, w0);
+    const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+    StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint8_t* const dst) {
+  const __m128i s0 =
+      LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  __m128i ma0, b0, sq_128[2];
+  __m256i mas[3], sq[3], bs[3];
+  sq_128[0] = SquareLo8(s0);
+  BoxFilterPreProcess5LastRowLo(s0, scale, sum5, square_sum5, sq_128, &ma0,
+                                &b0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma[3], ma5[3], b[2][2];
+    BoxFilterPreProcess5LastRow(
+        src0 + x + 8, x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+        x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    ma[2] = Sum565Hi(ma5);
+    Sum565W(bs + 0, b[1]);
+    const __m256i sr = LoadUnaligned32(src + x);
+    const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+    const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma565);
+    LoadAligned64(b565 + 0, b[0]);
+    const __m256i p0 = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[1] = LoadAligned32(ma565 + 16);
+    LoadAligned64(b565 + 16, b[0]);
+    Sum565W(bs + 1, b[1]);
+    const __m256i p1 = CalculateFilteredOutputPass1(sr_hi, ma + 1, b);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+    sq[0] = sq[2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint8_t* const dst) {
+  const __m128i s0 =
+      LoadUnaligned16Msan(src0, kOverreadInBytesPass2_128 - width);
+  __m128i ma0, b0, sq_128[2];
+  __m256i mas[3], sq[3], bs[3];
+  sq_128[0] = SquareLo8(s0);
+  BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma[4], b[4][2], ma3[3];
+    BoxFilterPreProcess3(src0 + x + 8,
+                         x + 8 + kOverreadInBytesPass2_256 - width, x + 8,
+                         sum_width, scale, sum3, square_sum3, sq, mas, bs);
+    Prepare3_8(mas, ma3);
+    Store343_444Lo(ma3, bs + 0, x + 0, &ma[2], b[2], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    Store343_444Hi(ma3, bs + 1, x + 16, &ma[3], b[3], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    const __m256i sr = LoadUnaligned32(src + x);
+    const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+    const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma343[0] + x);
+    ma[1] = LoadAligned32(ma444[0] + x);
+    LoadAligned64(b343[0] + x, b[0]);
+    LoadAligned64(b444[0] + x, b[1]);
+    const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    ma[1] = LoadAligned32(ma343[0] + x + 16);
+    ma[2] = LoadAligned32(ma444[0] + x + 16);
+    LoadAligned64(b343[0] + x + 16, b[1]);
+    LoadAligned64(b444[0] + x + 16, b[2]);
+    const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+    sq[0] = sq[2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+  __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0]);
+  sq_128[1][0] = SquareLo8(s[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+                        ma3_128, b3_128, &ma5_0, &b5_0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+  ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+  b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+  b5[0] = SetrM128i(b5_0, b5_0);
+
+  int x = 0;
+  do {
+    __m256i ma[3][3], mat[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+    BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+                        x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+                        scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+                        sq, ma3, b3, ma5, b5);
+    Prepare3_8(ma3[0], ma3x[0]);
+    Prepare3_8(ma3[1], ma3x[1]);
+    Prepare3_8(ma5, ma5x);
+    Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+                   ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+                   b343[3], b444[2]);
+    ma[0][1] = Sum565Lo(ma5x);
+    ma[0][2] = Sum565Hi(ma5x);
+    mat[0][1] = ma[0][2];
+    StoreAligned64(ma565[1] + x, ma[0] + 1);
+    Sum565W(b5, b[0][1]);
+    StoreAligned64(b565[1] + x, b[0][1]);
+    const __m256i sr0 = LoadUnaligned32(src + x);
+    const __m256i sr1 = LoadUnaligned32(src + stride + x);
+    const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+    ma[0][0] = LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned32(ma343[0] + x);
+    ma[1][1] = LoadAligned32(ma444[0] + x);
+    LoadAligned64(b343[0] + x, b[1][0]);
+    LoadAligned64(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ma[2][0] = LoadAligned32(ma343[1] + x);
+    LoadAligned64(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const __m256i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+    Sum565W(b5 + 1, b[0][1]);
+    StoreAligned64(b565[1] + x + 16, b[0][1]);
+    Store343_444Hi(ma3x[0], b3[0] + 1, x + 16, &mat[1][2], &mat[2][1], b[1][2],
+                   b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Hi(ma3x[1], b3[1] + 1, x + 16, &mat[2][2], b[2][2], ma343[3],
+                   ma444[2], b343[3], b444[2]);
+    const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+    mat[0][0] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, mat[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, mat[0][1], b[0][1]);
+    mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+    mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+    LoadAligned64(b343[0] + x + 16, b[1][0]);
+    LoadAligned64(b444[0] + x + 16, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], b[1]);
+    const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+    mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+    LoadAligned64(b343[1] + x + 16, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], b[2]);
+    const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+    StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint8_t* const dst) {
+  const __m128i s0 =
+      LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  __m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2];
+  __m256i ma3[3], ma5[3], sq[3], b3[3], b5[3];
+  sq_128[0] = SquareLo8(s0);
+  BoxFilterPreProcessLastRowLo(s0, scales, sum3, sum5, square_sum3, square_sum5,
+                               sq_128, &ma3_0, &ma5_0, &b3_0, &b5_0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  ma3[0] = SetrM128i(ma3_0, ma3_0);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0] = SetrM128i(b3_0, b3_0);
+  b5[0] = SetrM128i(b5_0, b5_0);
+
+  int x = 0;
+  do {
+    __m256i ma[3], mat[3], b[3][2], p[2], ma3x[3], ma5x[3];
+    BoxFilterPreProcessLastRow(src0 + x + 8,
+                               x + 8 + kOverreadInBytesPass1_256 - width,
+                               sum_width, x + 8, scales, sum3, sum5,
+                               square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+    Prepare3_8(ma3, ma3x);
+    Prepare3_8(ma5, ma5x);
+    ma[1] = Sum565Lo(ma5x);
+    Sum565W(b5, b[1]);
+    ma[2] = Sum343Lo(ma3x);
+    Sum343W(b3, b[2]);
+    const __m256i sr = LoadUnaligned32(src + x);
+    const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma565 + x);
+    LoadAligned64(b565 + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[0] = LoadAligned32(ma343 + x);
+    ma[1] = LoadAligned32(ma444 + x);
+    LoadAligned64(b343 + x, b[0]);
+    LoadAligned64(b444 + x, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+    mat[1] = Sum565Hi(ma5x);
+    Sum565W(b5 + 1, b[1]);
+    mat[2] = Sum343Hi(ma3x);
+    Sum343W(b3 + 1, b[2]);
+    const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+    mat[0] = LoadAligned32(ma565 + x + 16);
+    LoadAligned64(b565 + x + 16, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b);
+    mat[0] = LoadAligned32(ma343 + x + 16);
+    mat[1] = LoadAligned32(ma444 + x + 16);
+    LoadAligned64(b343 + x + 16, b[0]);
+    LoadAligned64(b444 + x + 16, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b);
+    const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+    sq[0] = sq[2];
+    ma3[0] = ma3[2];
+    ma5[0] = ma5[2];
+    b3[0] = b3[2];
+    b5[0] = b5[2];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint8_t* src,
+    const ptrdiff_t stride, const uint8_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
+                         b343, b444[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+                     b444[0], b565[0], dst);
+  }
+}
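+
+// For reference, a scalar sketch of the per-pixel self-guided combination the
+// pipeline above realizes (the SIMD code organizes the arithmetic differently
+// but computes the same result). Assumptions: the AV1 spec constants
+// kSgrProjRestoreBits == 4 and kSgrProjPrecisionBits == 7, with |p0| and |p1|
+// the pass 1 / pass 2 box filter outputs in the same 4-bit-upscaled range as
+// |u|. Illustrative only; unused by the code below.
+inline int SelfGuidedCombineScalar(const int src, const int p0, const int p1,
+                                   const int w0, const int w2) {
+  const int u = src << 4;             // kSgrProjRestoreBits
+  const int w1 = (1 << 7) - w0 - w2;  // kSgrProjPrecisionBits
+  const int v = w0 * p0 + w1 * u + w2 * p1;
+  const int s = (v + (1 << (7 + 4 - 1))) >> (7 + 4);  // round, drop upscaling
+  return std::min(std::max(s, 0), 255);
+}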
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
+                          b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint8_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 32, up to 31 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_AVX2(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* top = static_cast<const uint8_t*>(top_border);
+  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+  auto* const dst = static_cast<uint8_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_AVX2(WienerFilter)
+  dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+#if DSP_ENABLED_8BPP_AVX2(SelfGuidedFilter)
+  dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
+}
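+
+// Illustrative usage (a sketch, not part of the library): once
+// LoopRestorationInit_AVX2() has run, callers reach these kernels through the
+// dispatch table rather than by name, e.g., assuming the usual accessor:
+//   const Dsp* const dsp = GetDspTable(kBitdepth8);
+//   dsp->loop_restorations[1](restoration_info, src, stride, top, top_stride,
+//                             bottom, bottom_stride, width, height,
+//                             restoration_buffer, dest);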
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_AVX2() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/loop_restoration_avx2.h b/src/dsp/x86/loop_restoration_avx2.h
new file mode 100644
index 0000000..2c3534a
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_avx2.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations. See the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_AVX2();
+void LoopRestorationInit10bpp_AVX2();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the avx2 implementation should be
+// used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#endif  // LIBGAV1_TARGETING_AVX2
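+
+// Reading note (an assumption about dsp.h, where the macro actually lives):
+// DSP_ENABLED_8BPP_AVX2(func) checks that LIBGAV1_Dsp8bpp_##func is exactly
+// LIBGAV1_CPU_AVX2, so if another header already claimed a slot the #ifndef
+// guards above leave it untouched and Init8bpp() skips that entry.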
+
+#endif  // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
new file mode 100644
index 0000000..8c24c39
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -0,0 +1,2590 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128,
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit =
+      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+  const __m128i offsets = _mm_set1_epi16(-offset);
+  const __m128i limits = _mm_set1_epi16(limit - offset);
+  // The sum range here is [-128 * 255 + 4, 90 * 255 + 4].
+  const __m128i sum = _mm_add_epi16(s[0], s[1]);
+  const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
+  // Add back scaled down offset correction.
+  const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
+  const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
+  const __m128i d1 = _mm_min_epi16(d0, limits);
+  StoreAligned16(wiener_buffer, d1);
+}
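+
+// A scalar sketch of the clip above, assuming 8bpp with kWienerFilterBits == 7
+// and kInterRoundBitsHorizontal == 3, so offset == 2048 and limit == 8191
+// (the 13-bit saturation noted in WienerFilter_SSE4_1 below). |sum| already
+// carries the +4 rounding bias folded into the maddubs inputs. Illustrative
+// only.
+inline int16_t WienerHorizontalClipScalar(const int sum,
+                                          const int16_t correction) {
+  constexpr int offset = 1 << (8 + 7 - 3 - 1);       // 2048
+  constexpr int limit = (1 << (8 + 1 + 7 - 3)) - 1;  // 8191
+  const int rounded = (sum >> 3) + correction;  // kInterRoundBitsHorizontal
+  return static_cast<int16_t>(
+      std::min(std::max(rounded, -offset), limit - offset));
+}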
+
+inline void WienerHorizontalTap7Kernel(const __m128i s[4],
+                                       const __m128i filter[4],
+                                       int16_t* const wiener_buffer) {
+  __m128i madds[4];
+  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+  madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+  madds[3] = _mm_maddubs_epi16(s[3], filter[3]);
+  madds[0] = _mm_add_epi16(madds[0], madds[2]);
+  madds[1] = _mm_add_epi16(madds[1], madds[3]);
+  const __m128i s_3x128 =
+      _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
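+
+// _mm_maddubs_epi16() drives the kernels here: per 16-bit lane it computes
+// saturate16(u0 * s0 + u1 * s1), where u0/u1 are unsigned bytes from the
+// first operand and s0/s1 are signed bytes from the second. A one-lane scalar
+// sketch (illustrative only):
+inline int16_t MaddubsLaneScalar(const uint8_t u0, const int8_t s0,
+                                 const uint8_t u1, const int8_t s1) {
+  const int sum = u0 * s0 + u1 * s1;
+  return static_cast<int16_t>(std::min(std::max(sum, -32768), 32767));
+}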
+
+inline void WienerHorizontalTap5Kernel(const __m128i s[5],
+                                       const __m128i filter[3],
+                                       int16_t* const wiener_buffer) {
+  __m128i madds[3];
+  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+  madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+  madds[0] = _mm_add_epi16(madds[0], madds[2]);
+  const __m128i s_3x128 =
+      _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m128i s[2],
+                                       const __m128i filter[2],
+                                       int16_t* const wiener_buffer) {
+  __m128i madds[2];
+  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+  const __m128i s_3x128 =
+      _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+// Loading all and unpacking is about 7% faster than using _mm_alignr_epi8().
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int coefficient0,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+  __m128i filter[4];
+  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200));
+  filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+  filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204));
+  filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0));
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[7], ss[4];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
+      s[5] = LoadUnaligned16(src + x + 5);
+      s[6] = LoadUnaligned16(src + x + 6);
+      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+      ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+      ss[3] = _mm_unpacklo_epi8(s[6], round);
+      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0);
+      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+      ss[2] = _mm_unpackhi_epi8(s[4], s[5]);
+      ss[3] = _mm_unpackhi_epi8(s[6], round);
+      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int coefficient1,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+  __m128i filter[3];
+  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402));
+  filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406));
+  filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1));
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[5], ss[3];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
+      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+      ss[2] = _mm_unpacklo_epi8(s[4], round);
+      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0);
+      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+      ss[2] = _mm_unpackhi_epi8(s[4], round);
+      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int coefficient2,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+  __m128i filter[2];
+  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+  filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2));
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[3], ss[2];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+      ss[1] = _mm_unpacklo_epi8(s[2], round);
+      WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 0);
+      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+      ss[1] = _mm_unpackhi_epi8(s[2], round);
+      WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
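+// With three leading zero coefficients only the center tap remains; since the
+// seven taps sum to 1 << kWienerFilterBits (128), the filter reduces to
+// src * 128 >> kInterRoundBitsHorizontal, i.e. src << 4 at 8bpp, which is all
+// the function below does.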
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      const __m128i s = LoadUnaligned16(src + x);
+      const __m128i s0 = _mm_unpacklo_epi8(s, _mm_setzero_si128());
+      const __m128i s1 = _mm_unpackhi_epi8(s, _mm_setzero_si128());
+      const __m128i d0 = _mm_slli_epi16(s0, 4);
+      const __m128i d1 = _mm_slli_epi16(s1, 4);
+      StoreAligned16(*wiener_buffer + x + 0, d0);
+      StoreAligned16(*wiener_buffer + x + 8, d1);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) {
+  const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+  const __m128i sum0 = _mm_add_epi32(round, madd0);
+  const __m128i sum1 = _mm_add_epi32(sum0, madd1);
+  return _mm_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[2], const __m128i filter[2]) {
+  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+  const __m128i sum = _mm_add_epi32(madd0, madd1);
+  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a, const __m128i filter) {
+  const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+  const __m128i madd = _mm_madd_epi16(a, filter);
+  const __m128i sum = _mm_add_epi32(round, madd);
+  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+                                     const __m128i filter[2]) {
+  __m128i b[2];
+  const __m128i a06 = _mm_add_epi16(a[0], a[6]);
+  const __m128i a15 = _mm_add_epi16(a[1], a[5]);
+  const __m128i a24 = _mm_add_epi16(a[2], a[4]);
+  b[0] = _mm_unpacklo_epi16(a06, a15);
+  b[1] = _mm_unpacklo_epi16(a24, a[3]);
+  const __m128i sum0 = WienerVertical7(b, filter);
+  b[0] = _mm_unpackhi_epi16(a06, a15);
+  b[1] = _mm_unpackhi_epi16(a24, a[3]);
+  const __m128i sum1 = WienerVertical7(b, filter);
+  return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+                                     const __m128i filter[2]) {
+  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m128i b[2];
+  const __m128i a04 = _mm_add_epi16(a[0], a[4]);
+  const __m128i a13 = _mm_add_epi16(a[1], a[3]);
+  b[0] = _mm_unpacklo_epi16(a04, a13);
+  b[1] = _mm_unpacklo_epi16(a[2], round);
+  const __m128i sum0 = WienerVertical5(b, filter);
+  b[0] = _mm_unpackhi_epi16(a04, a13);
+  b[1] = _mm_unpackhi_epi16(a[2], round);
+  const __m128i sum1 = WienerVertical5(b, filter);
+  return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3], const __m128i filter) {
+  __m128i b;
+  const __m128i a02 = _mm_add_epi16(a[0], a[2]);
+  b = _mm_unpacklo_epi16(a02, a[1]);
+  const __m128i sum0 = WienerVertical3(b, filter);
+  b = _mm_unpackhi_epi16(a02, a[1]);
+  const __m128i sum1 = WienerVertical3(b, filter);
+  return _mm_packs_epi32(sum0, sum1);
+}
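+
+// The vertical filters exploit the even symmetry of the Wiener taps: pairs
+// a[i] + a[taps - 1 - i] are formed first, then _mm_madd_epi16() (16x16 -> 32
+// with pairwise add) applies the folded coefficients. A scalar sketch of the
+// 7-tap case, assuming kInterRoundBitsVertical == 11 (illustrative only):
+inline int32_t WienerVertical7Scalar(const int16_t a[7], const int16_t f[4]) {
+  const int32_t sum = f[0] * (a[0] + a[6]) + f[1] * (a[1] + a[5]) +
+                      f[2] * (a[2] + a[4]) + f[3] * a[3];
+  return (sum + (1 << 10)) >> 11;
+}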
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter[2], __m128i a[7]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+  a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+  a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+  return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter[2], __m128i a[5]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+  return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter, __m128i a[3]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m128i filter[2], __m128i d[2]) {
+  __m128i a[8];
+  d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[7] = LoadAligned16(wiener_buffer + 7 * wiener_stride);
+  d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m128i filter[2], __m128i d[2]) {
+  __m128i a[6];
+  d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+  d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m128i filter, __m128i d[2]) {
+  __m128i a[4];
+  d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+  d[1] = WienerVerticalFilter3(a + 1, filter);
+}
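+
+// The *Kernel2 wrappers above produce two adjacent output rows while loading
+// only one extra input row: the first row reuses the |a| registers filled by
+// the single-row kernel, and the second row is filtered from a + 1 after a
+// single additional load.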
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[4], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m128i c = LoadLo8(coefficients);
+  __m128i filter[2];
+  filter[0] = _mm_shuffle_epi32(c, 0x0);
+  filter[1] = _mm_shuffle_epi32(c, 0x55);
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i d[2][2];
+      WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+      WienerVerticalTap7Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+      StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+      StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[7];
+      const __m128i d0 =
+          WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+      const __m128i d1 =
+          WienerVerticalTap7Kernel(wiener_buffer + x + 8, width, filter, a);
+      StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+      x += 16;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[3], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m128i c = Load4(coefficients);
+  __m128i filter[2];
+  filter[0] = _mm_shuffle_epi32(c, 0);
+  filter[1] =
+      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i d[2][2];
+      WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+      WienerVerticalTap5Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+      StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+      StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[5];
+      const __m128i d0 =
+          WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+      const __m128i d1 =
+          WienerVerticalTap5Kernel(wiener_buffer + x + 8, width, filter, a);
+      StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+      x += 16;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[2], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m128i filter =
+      _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i d[2][2];
+      WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+      WienerVerticalTap3Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+      StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+      StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[3];
+      const __m128i d0 =
+          WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+      const __m128i d1 =
+          WienerVerticalTap3Kernel(wiener_buffer + x + 8, width, filter, a);
+      StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+      x += 16;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint8_t* const dst) {
+  const __m128i a0 = LoadAligned16(wiener_buffer + 0);
+  const __m128i a1 = LoadAligned16(wiener_buffer + 8);
+  const __m128i b0 = _mm_add_epi16(a0, _mm_set1_epi16(8));
+  const __m128i b1 = _mm_add_epi16(a1, _mm_set1_epi16(8));
+  const __m128i c0 = _mm_srai_epi16(b0, 4);
+  const __m128i c1 = _mm_srai_epi16(b1, 4);
+  const __m128i d = _mm_packus_epi16(c0, c1);
+  StoreAligned16(dst, d);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint8_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      x += 16;
+    } while (x < width);
+  }
+}
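+
+// The tap1 vertical kernel is just the rounding inverse of the horizontal
+// << 4; a one-pixel scalar sketch assuming the 8bpp shifts above
+// (illustrative only):
+inline uint8_t WienerVerticalTap1Scalar(const int16_t v) {
+  const int rounded = (v + 8) >> 4;
+  return static_cast<uint8_t>(std::min(std::max(rounded, 0), 255));
+}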
+
+void WienerFilter_SSE4_1(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // Horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* const top = static_cast<const uint8_t*>(top_border);
+  const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+  const int16_t* const filter_horizontal =
+      restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+  const __m128i c = LoadLo8(filter_horizontal);
+  // In order to keep the horizontal pass intermediate values within 16 bits we
+  // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
+  const __m128i coefficients_horizontal =
+      _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal[0], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         filter_horizontal[0], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal[0],
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal[1], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         filter_horizontal[1], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal[1],
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal[2], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         filter_horizontal[2], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal[2],
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // Vertical filtering.
+  // Over-writes up to 15 values.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical + 1, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical + 2, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
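+
+// In short, the Wiener path is a separable two-stage pipeline: the horizontal
+// taps write 13-bit intermediate rows into |wiener_buffer| (with extra rows
+// above and below from the borders), and the vertical taps then reduce those
+// columns back to 8-bit output rows.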
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 10;
+constexpr int kOverreadInBytesPass2 = 12;
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[2]) {
+  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+  dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[3]) {
+  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+  dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+  dst[0] = LoadAligned16(src + 0);
+  dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+                                 const ptrdiff_t border, __m128i dst[2]) {
+  dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+  dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[2][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+  LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[3][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+  LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
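+
+// In the *Msan variants above, the second argument to the LoadAligned*Msan()
+// helpers is the number of bytes at the end of the load that may be
+// uninitialized, i.e. the intentional over-read past |border|; it is there to
+// keep MemorySanitizer builds quiet and has no effect otherwise.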
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+  StoreAligned32U32(dst + 0, src + 0);
+  StoreAligned32U32(dst + 8, src + 2);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate very inefficient code, making the
+// whole decoder up to 15% slower.
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srli_epi32(sum, src1);
+}
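+
+// Naming note: the helpers above mirror the Arm NEON intrinsics used by the
+// NEON port of this filter. Vaddl* widens both operands before adding ("add
+// long"), Vaddw* widens only the second ("add wide"), Vmull* is a widening
+// multiply, and Vrshr* is a rounding shift right, which lets the two
+// implementations be read side by side.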
+
+inline __m128i SquareLo8(const __m128i src) {
+  const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+  return _mm_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+  const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+  return _mm_mullo_epi16(s, s);
+}
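+
+// The squares are exact: an 8-bit value squared is at most 255 * 255 = 65025,
+// which fits in the unsigned 16-bit lanes produced by _mm_mullo_epi16().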
+
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+  dst[0] = src;
+  dst[1] = _mm_srli_si128(src, 1);
+  dst[2] = _mm_srli_si128(src, 2);
+}
+
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
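+
+// The Prepare* helpers build the shifted views of a sliding window that the
+// box sums below consume: _mm_alignr_epi8(src[1], src[0], k) reads 16 bytes
+// starting at byte k of the 32-byte concatenation src[1]:src[0], so dst[i] is
+// the window advanced by i pixels (i bytes here, 2 * i bytes in the 16-bit
+// variants).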
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+  dst[0] = src;
+  dst[1] = _mm_srli_si128(src, 1);
+  dst[2] = _mm_srli_si128(src, 2);
+  dst[3] = _mm_srli_si128(src, 3);
+  dst[4] = _mm_srli_si128(src, 4);
+}
+
+template <int offset>
+inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) {
+  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi16(src0, src1);
+  return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi32(src0, src1);
+  return _mm_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+  const __m128i sum = VaddlLo8(src[0], src[1]);
+  return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
+  const __m128i sum = VaddlHi8(src[0], src[1]);
+  return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+  const __m128i sum = VaddlLo16(src[0], src[1]);
+  return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+  const __m128i sum = VaddlHi16(src[0], src[1]);
+  return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+  const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+  const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+                       const __m128i* const src2, const __m128i* const src3,
+                       const __m128i* const src4) {
+  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+  const __m128i sum = _mm_add_epi32(sum01, sum23);
+  return _mm_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+  const __m128i sum01 = VaddlLo8(src[0], src[1]);
+  const __m128i sum23 = VaddlLo8(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return VaddwLo8(sum, src[4]);
+}
+
+inline __m128i Sum5WHi16(const __m128i src[5]) {
+  const __m128i sum01 = VaddlHi8(src[0], src[1]);
+  const __m128i sum23 = VaddlHi8(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+  __m128i s[3];
+  Prepare3Lo8(src, s);
+  return Sum3WLo16(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_8<offset>(src, s);
+  dst[0] = Sum3WLo16(s);
+  dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum3WLo32(s);
+  dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+  __m128i s[5];
+  Prepare5Lo8(src, s);
+  return Sum5WLo16(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0,
+                           __m128i* const dst1) {
+  __m128i s[5];
+  Prepare5_8<offset>(src, s);
+  *dst0 = Sum5WLo16(s);
+  *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+  const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+  const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+  dst[0] = VaddwLo16(sum0123_lo, s[4]);
+  const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+  const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+  const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+  dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = VaddlLo16(src[0], src[4]);
+  *row_sq3 = Sum3WLo32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = VaddlHi16(src[0], src[4]);
+  *row_sq3 = Sum3WHi32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+                     __m128i* const row5) {
+  __m128i s[5];
+  Prepare5Lo8(src, s);
+  const __m128i sum04 = VaddlLo8(s[0], s[4]);
+  *row3 = Sum3WLo16(s + 1);
+  *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+template <int offset>
+void SumHorizontal(const __m128i src[2], __m128i* const row3_0,
+                   __m128i* const row3_1, __m128i* const row5_0,
+                   __m128i* const row5_1) {
+  __m128i s[5];
+  Prepare5_8<offset>(src, s);
+  const __m128i sum04_lo = VaddlLo8(s[0], s[4]);
+  const __m128i sum04_hi = VaddlHi8(s[0], s[4]);
+  *row3_0 = Sum3WLo16(s + 1);
+  *row3_1 = Sum3WHi16(s + 1);
+  *row5_0 = _mm_add_epi16(sum04_lo, *row3_0);
+  *row5_1 = _mm_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+                          __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+                          __m128i* const row_sq5_1) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+  SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WLo16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WHi16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343WLo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo32(src);
+  const __m128i sum3 = Sum3_32(sum, sum, sum);
+  return VaddwLo16(sum3, src[1]);
+}
+
+inline __m128i Sum343WHi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi32(src);
+  const __m128i sum3 = Sum3_32(sum, sum, sum);
+  return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum343WLo(s);
+  dst[1] = Sum343WHi(s);
+}
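+
+// A worked check of the 343 weighting above: with sum111 = a + b + c,
+// Sum343*() returns 3 * sum111 + b = 3a + 4b + 3c, i.e. the [3, 4, 3] row
+// weights.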
+
+inline __m128i Sum565Lo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565WLo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo32(src);
+  const __m128i sum4 = _mm_slli_epi32(sum, 2);
+  const __m128i sum5 = _mm_add_epi32(sum4, sum);
+  return VaddwLo16(sum5, src[1]);
+}
+
+inline __m128i Sum565WHi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi32(src);
+  const __m128i sum4 = _mm_slli_epi32(sum, 2);
+  const __m128i sum5 = _mm_add_epi32(sum4, sum);
+  return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum565WLo(s);
+  dst[1] = Sum565WHi(s);
+}
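+
+// Likewise for the 565 weighting: with sum = a + b + c, Sum565*() computes
+// (sum << 2) + sum + b = 5a + 6b + 5c, i.e. the [5, 6, 5] row weights.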
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  int y = 2;
+  do {
+    __m128i s[2], sq[3];
+    s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width);
+    sq[0] = SquareLo8(s[0]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+      x -= 16;
+      src += 16;
+      s[1] = LoadUnaligned16Msan(src,
+                                 sum_width - x + kOverreadInBytesPass1 - width);
+      sq[1] = SquareHi8(s[0]);
+      sq[2] = SquareLo8(s[1]);
+      SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned32U16(sum3, row3);
+      StoreAligned32U16(sum5, row5);
+      SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 0, row_sq3);
+      StoreAligned32U32(square_sum5 + 0, row_sq5);
+      SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 8, row_sq3);
+      StoreAligned32U32(square_sum5 + 8, row_sq5);
+      s[0] = s[1];
+      sq[0] = sq[2];
+      sum3 += 16;
+      sum5 += 16;
+      square_sum3 += 16;
+      square_sum5 += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sum3 += sum_stride - sum_width;
+    sum5 += sum_stride - sum_width;
+    square_sum3 += sum_stride - sum_width;
+    square_sum5 += sum_stride - sum_width;
+  } while (--y != 0);
+}
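+
+// Both BoxSum() variants process two rows per call (y counts down from 2) and
+// 16 pixels per inner iteration, storing the running box sums and squared box
+// sums to aligned buffers so the later passes can use aligned loads.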
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  constexpr int kOverreadInBytes =
+      (size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2;
+  int y = 2;
+  do {
+    __m128i s[2], sq[3];
+    s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width);
+    sq[0] = SquareLo8(s[0]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row[2], row_sq[4];
+      x -= 16;
+      src += 16;
+      s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width);
+      sq[1] = SquareHi8(s[0]);
+      sq[2] = SquareLo8(s[1]);
+      if (size == 3) {
+        Sum3Horizontal<0>(s, row);
+        Sum3WHorizontal(sq + 0, row_sq + 0);
+        Sum3WHorizontal(sq + 1, row_sq + 2);
+      } else {
+        Sum5Horizontal<0>(s, &row[0], &row[1]);
+        Sum5WHorizontal(sq + 0, row_sq + 0);
+        Sum5WHorizontal(sq + 1, row_sq + 2);
+      }
+      StoreAligned32U16(sums, row);
+      StoreAligned64U32(square_sums, row_sq);
+      s[0] = s[1];
+      sq[0] = sq[2];
+      sums += 16;
+      square_sums += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sums += sum_stride - sum_width;
+    square_sums += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency, so we use shifts and additions
+  // instead. Some compilers could do this for us, but we make it explicit.
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
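+
+// A note on the shift-add multiply above: for n == 9, sum_sq * 9 is
+// sum_sq + (sum_sq << 3); for n == 25 the extra (sum_sq << 4) term completes
+// sum_sq * (1 + 8 + 16) = sum_sq * 25.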
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+  return _mm_packus_epi32(b_lo, b_hi);
+}
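+
+// A worked check of the constants above: (1 << 12) + 12 = 4108 and
+// 4108 / 25 = 164, so one_over_n_quarter is 41 and the static_assert holds.
+// Because |ma| was zero-extended to 16 bits, _mm_maddubs_epi16() computes
+// ma * 41 <= 255 * 41 = 10455 per lane, safely within int16 range, and the
+// reduced shift (kSgrProjReciprocalBits - 2) restores the factor of 4 dropped
+// from one_over_n.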
+
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+  const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+  return _mm_packus_epi32(b_lo, b_hi);
+}
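+
+// Here (1 << 12) + 4 = 4100 and 4100 / 9 = 455, matching the comment above.
+// Unlike CalculateB5(), there is no _mm_maddubs_epi16() shortcut: 455 does not
+// fit in a signed byte, so the products stay in 32-bit lanes throughout.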
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i* const b) {
+  static_assert(n == 9 || n == 25, "");
+  static_assert(offset == 0 || offset == 8, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // The values are not actually stored and loaded; the compiler keeps them in
+  // a 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  // offset == 0 is assumed to be the first call to this function. The value
+  // is mov'd to avoid -Wuninitialized warnings under gcc. mov should be at
+  // least equivalent to, if not faster than, pinsrb.
+  if (offset == 0) {
+    *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+  } else {
+    *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+  }
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  __m128i maq;
+  if (offset == 0) {
+    maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  } else {
+    maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  }
+  *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
+}
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction, or from the sign bit of the index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+  __m128i mask;
+  mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+  mask = _mm_or_si128(mask, index);
+  return _mm_shuffle_epi8(table, mask);
+}
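+
+// Example: for a control byte of 20, the comparison yields 0xFF, and for 130
+// (negative as a signed byte) the comparison yields 0 but the index's own
+// sign bit survives the OR; either way bit 7 of the mask is set and
+// _mm_shuffle_epi8() writes a zero for that lane.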
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+                           const int threshold) {
+  const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+  const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+  return _mm_add_epi8(value, offset);
+}
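+
+// _mm_cmpgt_epi8() produces 0 or -1 per byte, so adding |offset| decrements
+// |value| by one exactly in the lanes whose (biased) index exceeds the
+// threshold. The chain of AdjustValue() calls below uses this to step the
+// result down from 5 toward 0 as the index grows.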
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i* const ma, __m128i* const b0,
+                                  __m128i* const b1) {
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+  const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+  const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+  const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+  __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // All elements whose indices are less than 48 are set to 0.
+  // Get shuffle results for indices in range [0, 15].
+  *ma = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  *ma = _mm_or_si128(*ma, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res2 = ShuffleIndex(c2, idx);
+  *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, the table values change
+  // only occasionally as the index increases, so we use comparison and
+  // arithmetic operations to calculate them.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  *b0 = CalculateB3(sum[0], maq0);
+  const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  *b1 = CalculateB3(sum[1], maq1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i ma[2], __m128i b[2]) {
+  __m128i mas;
+  CalculateIntermediate(sum, index, &mas, &b[0], &b[1]);
+  ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+  ma[1] = _mm_srli_si128(mas, 8);
+}
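+
+// This overload packs the 16 new |ma| bytes across two registers:
+// _mm_unpacklo_epi64() appends the low 8 bytes of |mas| to the half carried
+// in ma[0], and the byte shift leaves the remaining 8 bytes in ma[1] for the
+// next iteration.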
+
+// Note: Calling CalculateIntermediate() instead of the slow
+// LookupIntermediate() when calculating 16 intermediate data points has been
+// tried, but the compiler generates even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  static_assert(offset == 0 || offset == 8, "");
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m128i b3[2], const ptrdiff_t x,
+                         __m128i sum_b343[2], __m128i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m128i b[3], sum_b111[2];
+  Prepare3_16(b3, b);
+  sum_b111[0] = Sum3WLo32(b);
+  sum_b111[1] = Sum3WHi32(b);
+  sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+  sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+  StoreAligned32U32(b444 + x, sum_b444);
+  sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+  sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+  StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma444, sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma444, sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma343, sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma343, sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s5[2][5], sq5[5][2];
+  sq[0][1] = SquareHi8(s[0][0]);
+  sq[1][1] = SquareHi8(s[1][0]);
+  s5[0][3] = Sum5Horizontal(s[0][0]);
+  StoreAligned16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal(s[1][0]);
+  StoreAligned16(sum5[4], s5[0][4]);
+  Sum5WHorizontal(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5WHorizontal(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2],
+    __m128i b[3]) {
+  __m128i s5[2][5], sq5[5][2];
+  sq[0][2] = SquareLo8(s[0][1]);
+  sq[1][2] = SquareLo8(s[1][1]);
+  Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+  StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+  Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+  StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+  Sum5WHorizontal(sq[0] + 1, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  Sum5WHorizontal(sq[1] + 1, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+  sq[0][3] = SquareHi8(s[0][1]);
+  sq[1][3] = SquareHi8(s[1][1]);
+  Sum5WHorizontal(sq[0] + 2, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  Sum5WHorizontal(sq[1] + 2, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s5[5], sq5[5][2];
+  sq[1] = SquareHi8(s);
+  s5[3] = s5[4] = Sum5Horizontal(s);
+  Sum5WHorizontal(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2],
+    __m128i b[3]) {
+  __m128i s5[2][5], sq5[5][2];
+  sq[2] = SquareLo8(s[1]);
+  Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5WHorizontal(sq + 1, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+  sq[3] = SquareHi8(s[1]);
+  Sum5WHorizontal(sq + 2, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s3[3], sq3[3][2];
+  sq[1] = SquareHi8(s);
+  s3[2] = Sum3Horizontal(s);
+  StoreAligned16(sum3[2], s3[2]);
+  Sum3WHorizontal(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width,
+    const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2],
+    __m128i b[3]) {
+  __m128i s3[4], sq3[3][2], sum[2], index[2];
+  sq[2] = SquareLo8(s[1]);
+  Sum3Horizontal<8>(s, s3 + 2);
+  StoreAligned32U16(sum3[2] + x, s3 + 2);
+  Sum3WHorizontal(sq + 1, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+  LoadAligned16x2U16(sum3, x, s3);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  sq[3] = SquareHi8(s[1]);
+  Sum3WHorizontal(sq + 2, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma, b + 1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2],
+    __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) {
+  __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+  sq[0][1] = SquareHi8(s[0][0]);
+  sq[1][1] = SquareHi8(s[1][0]);
+  SumHorizontalLo(s[0][0], &s3[2], &s5[3]);
+  SumHorizontalLo(s[1][0], &s3[3], &s5[4]);
+  StoreAligned16(sum3[2], s3[2]);
+  StoreAligned16(sum3[3], s3[3]);
+  StoreAligned16(sum5[3], s5[3]);
+  StoreAligned16(sum5[4], s5[4]);
+  SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+  CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]);
+  ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2],
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2],
+    __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) {
+  __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+  SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+  StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+  StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+  SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+  StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+  StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+  StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+  sq[0][2] = SquareLo8(s[0][1]);
+  sq[1][2] = SquareLo8(s[1][1]);
+  SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+                        &index[1][0]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]);
+
+  sq[0][3] = SquareHi8(s[0][1]);
+  sq[1][3] = SquareHi8(s[1][1]);
+  SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+                        &index[1][1]);
+  CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1);
+  CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+    __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+  __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+  sq[1] = SquareHi8(s);
+  SumHorizontalLo(s, &s3[2], &s5[3]);
+  SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
+    __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
+  __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+  sq[2] = SquareLo8(s[1]);
+  SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+  sq[3] = SquareHi8(s[1]);
+  SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma3, b3 + 1);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+                                    const uint8_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  __m128i s[2][2], mas[2], sq[2][4], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    __m128i ma5[3], ma[2], b[4];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned32U16(ma565, ma);
+    Sum565W(bs + 0, b + 0);
+    Sum565W(bs + 1, b + 2);
+    StoreAligned64U32(b565, b);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint8_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  __m128i s[2], mas[2], sq[4], bs[3];
+  s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
+  sq[0] = SquareLo8(s[0]);
+  BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    s[1] = LoadUnaligned16Msan(src + x + 16,
+                               x + 16 + kOverreadInBytesPass2 - width);
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    __m128i ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      __m128i ma[2], b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned32U16(ma343, ma);
+      Sum343W(bs + 0, b + 0);
+      Sum343W(bs + 1, b + 2);
+      StoreAligned64U32(b343, b);
+    }
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], &b5[0]);
+
+  int x = 0;
+  do {
+    __m128i ma[2], b[4], ma3x[3], ma5x[3];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    Sum343W(b3[0] + 0, b + 0);
+    Sum343W(b3[0] + 1, b + 2);
+    StoreAligned64U32(b343[0] + x, b);
+    Sum565W(b5 + 0, b + 0);
+    Sum565W(b5 + 1, b + 2);
+    StoreAligned64U32(b565, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned32U16(ma565, ma);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m128i v = _mm_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+                                       const __m128i b[2]) {
+  const __m128i ma_x_src_lo = VmullLo16(ma, src);
+  const __m128i ma_x_src_hi = VmullHi16(ma, src);
+  const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+                                            const __m128i ma[2],
+                                            const __m128i b[2][2]) {
+  const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+  __m128i b_sum[2];
+  b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
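+
+// Pass 1 filters two rows per iteration: the [5, 6, 5] sums of two adjacent
+// row pairs are added here, so the shift is 5 rather than the 4 used by the
+// single-row CalculateFilteredOutput<4>() path.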
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+                                            const __m128i ma[3],
+                                            const __m128i b[3][2]) {
+  const __m128i ma_sum = Sum3_16(ma);
+  __m128i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+  const __m128i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+  return _mm_add_epi16(src, vv);
+}
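+
+// The shift here is kSgrProjRestoreBits + kSgrProjPrecisionBits = 4 + 7 = 11,
+// assuming kSgrProjPrecisionBits == 7 (SGRPROJ_PRJ_BITS in the AV1 spec); the
+// rounded, packed correction is then added back onto the source pixels.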
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+                                          const __m128i filter[2], const int w0,
+                                          const int w2) {
+  __m128i v[2];
+  const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+  const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
+  v[0] = _mm_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
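+
+// Packing w0 and w2 into each 32-bit lane lets one _mm_madd_epi16() evaluate
+// w0 * filter[0] + w2 * filter[1] per pixel, since _mm_unpacklo_epi16() and
+// _mm_unpackhi_epi16() interleave the two 16-bit filter outputs in exactly
+// that order.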
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+                                          const __m128i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m128i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i s[2][2], mas[2], sq[2][4], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    __m128i ma[2], ma5[3], b[2][2], sr[2], p[2];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    StoreAligned16(ma565[1] + x, ma[1]);
+    Sum565W(bs, b[1]);
+    StoreAligned32U32(b565[1] + x, b[1]);
+    sr[0] = LoadAligned16(src + x);
+    sr[1] = LoadAligned16(src + stride + x);
+    const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
+    const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+    const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+    const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned16(ma565[1] + x + 8, ma[1]);
+    Sum565W(bs + 1, b[1]);
+    StoreAligned32U32(b565[1] + x + 8, b[1]);
+    const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128());
+    const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+    const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+    StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+    const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+    StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint8_t* const dst) {
+  __m128i s[2], mas[2], sq[4], bs[3];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  sq[0] = SquareLo8(s[0]);
+  BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0],
+                                &bs[0]);
+
+  int x = 0;
+  do {
+    __m128i ma[2], ma5[3], b[2][2];
+    s[1] = LoadUnaligned16Msan(src0 + x + 16,
+                               x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+                                sq, mas, bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    Sum565W(bs, b[1]);
+    ma[0] = LoadAligned16(ma565);
+    LoadAligned32U32(b565, b[0]);
+    const __m128i sr = LoadAligned16(src + x);
+    const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+    __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+    ma[1] = Sum565Hi(ma5);
+    Sum565W(bs + 1, b[1]);
+    ma[0] = LoadAligned16(ma565 + 8);
+    LoadAligned32U32(b565 + 8, b[0]);
+    const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+    p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+    StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint8_t* const dst) {
+  __m128i s[2], mas[2], sq[4], bs[3];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width);
+  sq[0] = SquareLo8(s[0]);
+  BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    s[1] = LoadUnaligned16Msan(src0 + x + 16,
+                               x + 16 + kOverreadInBytesPass2 - width);
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    __m128i ma[3], b[3][2], ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+                   b444[1]);
+    const __m128i sr = LoadAligned16(src + x);
+    const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma343[0] + x);
+    ma[1] = LoadAligned16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[0]);
+    LoadAligned32U32(b444[0] + x, b[1]);
+    const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+    Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma343[0] + x + 8);
+    ma[1] = LoadAligned16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1]);
+    const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+  ma5[1] = _mm_setzero_si128();  // Quiets -Wmaybe-uninitialized with gcc.
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], &b5[0]);
+
+  int x = 0;
+  do {
+    __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+    Prepare3_8<0>(ma3[0], ma3x[0]);
+    Prepare3_8<0>(ma3[1], ma3x[1]);
+    Prepare3_8<0>(ma5, ma5x);
+    Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+                   ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+                   b343[3], b444[2]);
+    ma[0][1] = Sum565Lo(ma5x);
+    StoreAligned16(ma565[1] + x, ma[0][1]);
+    Sum565W(b5, b[0][1]);
+    StoreAligned32U32(b565[1] + x, b[0][1]);
+    const __m128i sr0 = LoadAligned16(src + x);
+    const __m128i sr1 = LoadAligned16(src + stride + x);
+    const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128());
+    const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128());
+    ma[0][0] = LoadAligned16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned16(ma343[0] + x);
+    ma[1][1] = LoadAligned16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[1][0]);
+    LoadAligned32U32(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ma[2][0] = LoadAligned16(ma343[1] + x);
+    LoadAligned32U32(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+    Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+                   b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3],
+                   ma444[2], b343[3], b444[2]);
+    ma[0][1] = Sum565Hi(ma5x);
+    StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+    Sum565W(b5 + 1, b[0][1]);
+    StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+    const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128());
+    const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128());
+    ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+    ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+    const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+    StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+    ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+    LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+    const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+    StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    x += 16;
+  } while (x < width);
+}
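+
+// Per pixel, the loop above combines the pass 1 and pass 2 filter outputs
+// p[0] and p[1] with the source sample. A scalar sketch using the AV1
+// formulation (illustrative; the SIMD code reaches the same result through an
+// equivalent rearrangement, and kSgrProjRestoreBits is assumed to be the
+// 4-bit filter precision seen in the <4> shifts above):
+//   const int u = src[x] << kSgrProjRestoreBits;
+//   const int v = w0 * p0 + w1 * u + w2 * p1;
+//   dst[x] = Clip3(RightShiftWithRounding(
+//       v, kSgrProjPrecisionBits + kSgrProjRestoreBits), 0, 255);
+// where w0 + w1 + w2 == 1 << kSgrProjPrecisionBits (see BoxFilterProcess).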
+
+inline void BoxFilterLastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint8_t* const dst) {
+  __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  sq[0] = SquareLo8(s[0]);
+  BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3,
+                               square_sum5, sq, &ma3[0], &ma5[0], &b3[0],
+                               &b5[0]);
+
+  int x = 0;
+  do {
+    __m128i ma3x[3], ma5x[3], p[2];
+    s[1] = LoadUnaligned16Msan(src0 + x + 16,
+                               x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+                               square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+    Prepare3_8<0>(ma3, ma3x);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[1] = Sum565Lo(ma5x);
+    Sum565W(b5, b[1]);
+    ma[2] = Sum343Lo(ma3x);
+    Sum343W(b3, b[2]);
+    const __m128i sr = LoadAligned16(src + x);
+    const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma565 + x);
+    LoadAligned32U32(b565 + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[0] = LoadAligned16(ma343 + x);
+    ma[1] = LoadAligned16(ma444 + x);
+    LoadAligned32U32(b343 + x, b[0]);
+    LoadAligned32U32(b444 + x, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+    ma[1] = Sum565Hi(ma5x);
+    Sum565W(b5 + 1, b[1]);
+    ma[2] = Sum343Hi(ma3x);
+    Sum343W(b3 + 1, b[2]);
+    const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma565 + x + 8);
+    LoadAligned32U32(b565 + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    ma[0] = LoadAligned16(ma343 + x + 8);
+    ma[1] = LoadAligned16(ma444 + x + 8);
+    LoadAligned32U32(b343 + x + 8, b[0]);
+    LoadAligned32U32(b444 + x + 8, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    ma3[0] = ma3[1];
+    ma5[0] = ma5[1];
+    b3[0] = b3[2];
+    b5[0] = b5[2];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint8_t* src,
+    const ptrdiff_t stride, const uint8_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
+                         b343, b444[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+                     b444[0], b565[0], dst);
+  }
+}
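+
+// BoxFilterProcess consumes two source rows per iteration of its main loop.
+// In the tail above, an even |height| reads both lookahead rows of the final
+// BoxFilter call from |bottom_border|, while an odd |height| finishes the
+// remaining row with BoxFilterLastRow.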
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
+                          b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint8_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 16, up to 15 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
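+// For example (illustrative): if |width| is 17, the inner loops run for
+// x == 0 and x == 16 and store 16 pixels each, so pixels [17, 32) of each
+// row are overwritten.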
+void SelfGuidedFilter_SSE4_1(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* top = static_cast<const uint8_t*>(top_border);
+  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+  auto* const dst = static_cast<uint8_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
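+
+// Note on the dispatch above: kSgrProjParams holds the two pass radii, and a
+// radius of 0 disables that pass. Exactly one single-pass function runs when
+// either radius is 0; otherwise BoxFilterProcess applies both passes.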
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
+  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+  static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
+  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+  static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/loop_restoration_sse4.h b/src/dsp/x86/loop_restoration_sse4.h
new file mode 100644 (file)
index 0000000..00df3af
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations; see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_SSE4_1();
+void LoopRestorationInit10bpp_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
new file mode 100644 (file)
index 0000000..833814c
--- /dev/null
@@ -0,0 +1,945 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride) {
+  if (subsampling_x == 1 && subsampling_y == 1) {
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i mask_val_0 = LoadUnaligned16(mask);
+    const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
+    const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
+    const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+    return RightShiftWithRounding_U16(mask_0, 2);
+  }
+  if (subsampling_x == 1) {
+    const __m128i row_vals = LoadUnaligned16(mask);
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+    return RightShiftWithRounding_U16(subsampled_mask, 1);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val = LoadLo8(mask);
+  return _mm_cvtepu8_epi16(mask_val);
+}
+
+// Imitate behavior of ARM vtrn1q_u64.
+inline __m128i Transpose1_U64(const __m128i a, const __m128i b) {
+  return _mm_castps_si128(
+      _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
+
+// Imitate behavior of ARM vtrn2q_u64.
+inline __m128i Transpose2_U64(const __m128i a, const __m128i b) {
+  return _mm_castps_si128(
+      _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
+
+// Width can only be 4 when it is subsampled from a block of width 8, hence
+// subsampling_x is always 1 when this function is called.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask) {
+  if (subsampling_x == 1 && subsampling_y == 1) {
+    const __m128i mask_val_01 = LoadUnaligned16(mask);
+    // Stride is fixed because this is the smallest block size.
+    const __m128i mask_val_23 = LoadUnaligned16(mask + 16);
+    // Transpose rows to add row 0 to row 1, and row 2 to row 3.
+    const __m128i mask_val_02 = Transpose1_U64(mask_val_01, mask_val_23);
+    const __m128i mask_val_13 = Transpose2_U64(mask_val_23, mask_val_01);
+    const __m128i add_0 = _mm_adds_epu8(mask_val_02, mask_val_13);
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+    return RightShiftWithRounding_U16(mask_0, 2);
+  }
+  return GetMask8<subsampling_x, 0>(mask, 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask4x2(const uint8_t* mask,
+                                    ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    return GetMask4x2<subsampling_x, subsampling_y>(mask);
+  }
+  // When using intra or difference weighted masks, the function doesn't use
+  // subsampling, so |mask_stride| may be 4 or 8.
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val_0 = Load4(mask);
+  const __m128i mask_val_1 = Load4(mask + mask_stride);
+  return _mm_cvtepu8_epi16(
+      _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
+// 16-bit is also the lowest packing for hadd, but without subsampling an
+// unfortunate conversion is required.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask,
+                        ptrdiff_t stride) {
+  if (subsampling_x == 1) {
+    const __m128i row_vals = LoadUnaligned16(mask);
+
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+    if (subsampling_y == 1) {
+      const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+      const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+      const __m128i next_mask_val_1 =
+          _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+      subsampled_mask = _mm_add_epi16(
+          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+    }
+    return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val = LoadLo8(mask);
+  return _mm_cvtepu8_epi16(mask_val);
+}
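+
+// Worked example (illustrative) for the subsampled paths above: with
+// subsampling_x == 1 and subsampling_y == 1, each output lane is
+//   (mask[2 * x] + mask[2 * x + 1] +
+//    mask[stride + 2 * x] + mask[stride + 2 * x + 1] + 2) >> 2,
+// and with subsampling_y == 0 it is (mask[2 * x] + mask[2 * x + 1] + 1) >> 1.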
+
+inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
+                                  const int16_t* LIBGAV1_RESTRICT const pred_1,
+                                  const __m128i pred_mask_0,
+                                  const __m128i pred_mask_1,
+                                  uint8_t* LIBGAV1_RESTRICT dst,
+                                  const ptrdiff_t dst_stride) {
+  const __m128i pred_val_0 = LoadAligned16(pred_0);
+  const __m128i pred_val_1 = LoadAligned16(pred_1);
+  const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+  const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+  const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+  const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+  // int res = (mask_value * prediction_0[x] +
+  //      (64 - mask_value) * prediction_1[x]) >> 6;
+  const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+  const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+  const __m128i compound_pred = _mm_packus_epi32(
+      _mm_srli_epi32(compound_pred_lo, 6), _mm_srli_epi32(compound_pred_hi, 6));
+
+  // dst[x] = static_cast<Pixel>(
+  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //           (1 << kBitdepth8) - 1));
+  const __m128i result = RightShiftWithRounding_S16(compound_pred, 4);
+  const __m128i res = _mm_packus_epi16(result, result);
+  Store4(dst, res);
+  Store4(dst + dst_stride, _mm_srli_si128(res, 4));
+}
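+
+// Scalar equivalent of one blended pixel above, restating the commented
+// formulas (a sketch, not upstream code):
+//   const int res = (mask_value * pred_0[x] +
+//                    (64 - mask_value) * pred_1[x]) >> 6;
+//   dst[x] = Clip3(RightShiftWithRounding(res, 4), 0, 255);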
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4x4_SSE4_1(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                   const int16_t* LIBGAV1_RESTRICT pred_1,
+                                   const uint8_t* LIBGAV1_RESTRICT mask,
+                                   uint8_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t dst_stride) {
+  constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+  __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+  pred_0 += 4 << 1;
+  pred_1 += 4 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+  dst += dst_stride << 1;
+
+  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+  pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4xH_SSE4_1(
+    const int16_t* LIBGAV1_RESTRICT pred_0,
+    const int16_t* LIBGAV1_RESTRICT pred_1,
+    const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const int height,
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+  assert(subsampling_x == 1);
+  const uint8_t* mask = mask_ptr;
+  constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
+  if (height == 4) {
+    MaskBlending4x4_SSE4_1<subsampling_x, subsampling_y>(pred_0, pred_1, mask,
+                                                         dst, dst_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  int y = 0;
+  do {
+    __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+    __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+    y += 8;
+  } while (y < height);
+}
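+
+// The loop above is unrolled to blend eight rows per iteration; 4-wide
+// inter blocks taller than 4 have heights that are multiples of 8 (8 or 16),
+// so no remainder handling is needed.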
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
+                             const ptrdiff_t /*prediction_stride_1*/,
+                             const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                             const ptrdiff_t mask_stride, const int width,
+                             const int height, void* LIBGAV1_RESTRICT dest,
+                             const ptrdiff_t dst_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  const ptrdiff_t pred_stride_0 = width;
+  const ptrdiff_t pred_stride_1 = width;
+  if (width == 4) {
+    MaskBlending4xH_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, mask_ptr, height, dst, dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride);
+      // 64 - mask
+      const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+      const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+      const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+
+      const __m128i pred_val_0 = LoadAligned16(pred_0 + x);
+      const __m128i pred_val_1 = LoadAligned16(pred_1 + x);
+      const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+      const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+      // int res = (mask_value * prediction_0[x] +
+      //      (64 - mask_value) * prediction_1[x]) >> 6;
+      const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+      const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+
+      const __m128i res = _mm_packus_epi32(_mm_srli_epi32(compound_pred_lo, 6),
+                                           _mm_srli_epi32(compound_pred_hi, 6));
+      // dst[x] = static_cast<Pixel>(
+      //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+      //           (1 << kBitdepth8) - 1));
+      const __m128i result = RightShiftWithRounding_S16(res, 4);
+      StoreLo8(dst + x, _mm_packus_epi16(result, result));
+
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += pred_stride_0;
+    pred_1 += pred_stride_1;
+    mask += mask_stride << subsampling_y;
+  } while (++y < height);
+}
+
+inline void InterIntraWriteMaskBlendLine8bpp4x2(
+    const uint8_t* LIBGAV1_RESTRICT const pred_0,
+    uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1,
+    const __m128i pred_mask_0, const __m128i pred_mask_1) {
+  const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+  const __m128i pred_val_0 = LoadLo8(pred_0);
+  __m128i pred_val_1 = Load4(pred_1);
+  pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
+                            pred_val_1);
+  const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+  // int res = (mask_value * prediction_1[x] +
+  //      (64 - mask_value) * prediction_0[x]) >> 6;
+  const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
+  const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
+  const __m128i res = _mm_packus_epi16(result, result);
+
+  Store4(pred_1, res);
+  Store4(pred_1 + pred_stride_1, _mm_srli_si128(res, 4));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4x4_SSE4_1(
+    const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+    const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+    const ptrdiff_t mask_stride) {
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const __m128i pred_mask_u16_first =
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  mask += mask_stride << (1 + subsampling_y);
+  const __m128i pred_mask_u16_second =
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  mask += mask_stride << (1 + subsampling_y);
+  __m128i pred_mask_1 =
+      _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
+  __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1);
+  pred_0 += 4 << 1;
+  pred_1 += pred_stride_1 << 1;
+
+  pred_mask_1 = _mm_srli_si128(pred_mask_1, 8);
+  pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4xH_SSE4_1(
+    const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+    const ptrdiff_t pred_stride_1,
+    const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+    const int height) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    return;
+  }
+  int y = 0;
+  do {
+    InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    pred_0 += 4 << 2;
+    pred_1 += pred_stride_1 << 2;
+    mask += mask_stride << (2 + subsampling_y);
+
+    InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    pred_0 += 4 << 2;
+    pred_1 += pred_stride_1 << 2;
+    mask += mask_stride << (2 + subsampling_y);
+    y += 8;
+  } while (y < height);
+}
+
+// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
+// when is_inter_intra is true, the prediction values are brought to 8-bit
+// packing as well.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask8bpp8(const uint8_t* LIBGAV1_RESTRICT mask,
+                                      ptrdiff_t stride) {
+  if (subsampling_x == 1) {
+    const __m128i ret = GetMask8<subsampling_x, subsampling_y>(mask, stride);
+    return _mm_packus_epi16(ret, ret);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  // Unfortunately there is no 8-bit shift instruction, or else everything
+  // could be returned with 8-bit packing.
+  const __m128i mask_val = LoadLo8(mask);
+  return mask_val;
+}
+
+template <int subsampling_x, int subsampling_y>
+void InterIntraMaskBlend8bpp_SSE4_1(
+    const uint8_t* LIBGAV1_RESTRICT prediction_0,
+    uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
+    const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+    const int width, const int height) {
+  if (width == 4) {
+    InterIntraMaskBlending8bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+        prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+        height);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const __m128i pred_mask_1 =
+          GetInterIntraMask8bpp8<subsampling_x, subsampling_y>(
+              mask + (x << subsampling_x), mask_stride);
+      // 64 - mask
+      const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+      const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+      const __m128i pred_val_0 = LoadLo8(prediction_0 + x);
+      const __m128i pred_val_1 = LoadLo8(prediction_1 + x);
+      const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+      // int res = (mask_value * prediction_1[x] +
+      //      (64 - mask_value) * prediction_0[x]) >> 6;
+      const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
+      const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
+      const __m128i res = _mm_packus_epi16(result, result);
+
+      StoreLo8(prediction_1 + x, res);
+
+      x += 8;
+    } while (x < width);
+    prediction_0 += width;
+    prediction_1 += prediction_stride_1;
+    mask += mask_stride << subsampling_y;
+  } while (++y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444)
+  dsp->mask_blend[0][0] = MaskBlend_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422)
+  dsp->mask_blend[1][0] = MaskBlend_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420)
+  dsp->mask_blend[2][0] = MaskBlend_SSE4_1<1, 1>;
+#endif
+  // The is_inter_intra index of mask_blend[][] is replaced by
+  // inter_intra_mask_blend_8bpp[] in 8-bit.
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444)
+  dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422)
+  dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420)
+  dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4_1<1, 1>;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kMax10bppSample = (1 << 10) - 1;
+constexpr int kMaskInverse = 64;
+constexpr int kRoundBitsMaskBlend = 4;
+
+inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
+                                               const __m128i shift) {
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
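+
+// |shift| carries the rounding constant 1 << (bits - 1), precomputed by the
+// callers (see |shift4| and |shift6| below) so it stays out of the pixel
+// loops; otherwise this matches RightShiftWithRounding on each 32-bit lane.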
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask) {
+  if (subsampling_x == 1 && subsampling_y == 1) {
+    const __m128i mask_row_01 = LoadUnaligned16(mask);
+    const __m128i mask_row_23 = LoadUnaligned16(mask + 16);
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+    const __m128i mask_val_1 =
+        _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+    const __m128i mask_val_2 = _mm_cvtepu8_epi16(mask_row_23);
+    const __m128i mask_val_3 =
+        _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_23, 8));
+    const __m128i subsampled_mask_02 = _mm_hadd_epi16(mask_val_0, mask_val_2);
+    const __m128i subsampled_mask_13 = _mm_hadd_epi16(mask_val_1, mask_val_3);
+    const __m128i subsampled_mask =
+        _mm_add_epi16(subsampled_mask_02, subsampled_mask_13);
+    return RightShiftWithRounding_U16(subsampled_mask, 2);
+  }
+  if (subsampling_x == 1) {
+    const __m128i mask_row_01 = LoadUnaligned16(mask);
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+    const __m128i mask_val_1 =
+        _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+    const __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+    return RightShiftWithRounding_U16(subsampled_mask, 1);
+  }
+  return _mm_cvtepu8_epi16(LoadLo8(mask));
+}
+
+inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+    const __m128i& pred_mask_0, const __m128i& pred_mask_1,
+    const __m128i& offset, const __m128i& max, const __m128i& shift4,
+    uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+  const __m128i pred_val_0 = LoadUnaligned16(pred_0);
+  const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1);
+
+  // int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+  const __m128i compound_pred_lo_0 = _mm_mullo_epi16(pred_val_0, pred_mask_0);
+  const __m128i compound_pred_hi_0 = _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+  const __m128i compound_pred_lo_1 = _mm_mullo_epi16(pred_val_1, pred_mask_1);
+  const __m128i compound_pred_hi_1 = _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+  const __m128i pack0_lo =
+      _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+  const __m128i pack0_hi =
+      _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+  const __m128i pack1_lo =
+      _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+  const __m128i pack1_hi =
+      _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+  const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+  const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+  // res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+  const __m128i sub_0 =
+      _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+  const __m128i sub_1 =
+      _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+
+  // dst[x] = static_cast<Pixel>(
+  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //           (1 << kBitdepth10) - 1));
+  const __m128i shift_0 =
+      RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+  const __m128i shift_1 =
+      RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+  const __m128i result = _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+  StoreLo8(dst, result);
+  StoreHi8(dst + dst_stride, result);
+}
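+
+// Scalar sketch of one 10bpp blended pixel above (restating the commented
+// formulas with the 10-bit clamp):
+//   int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+//   res -= kCompoundOffset;
+//   dst[x] = Clip3(RightShiftWithRounding(res, kRoundBitsMaskBlend), 0,
+//                  kMax10bppSample);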
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0,
+                                     const uint16_t* LIBGAV1_RESTRICT pred_1,
+                                     const ptrdiff_t pred_stride_1,
+                                     const uint8_t* LIBGAV1_RESTRICT mask,
+                                     const ptrdiff_t mask_stride,
+                                     uint16_t* LIBGAV1_RESTRICT dst,
+                                     const ptrdiff_t dst_stride) {
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+  const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+  const __m128i max = _mm_set1_epi16(kMax10bppSample);
+  __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+  __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+                                    pred_mask_1, offset, max, shift4, dst,
+                                    dst_stride);
+  pred_0 += 4 << 1;
+  pred_1 += pred_stride_1 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+  dst += dst_stride << 1;
+
+  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+  pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+                                    pred_mask_1, offset, max, shift4, dst,
+                                    dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4xH_SSE4_1(
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+    const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+    const int height, uint16_t* LIBGAV1_RESTRICT dst,
+    const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    MaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const uint8_t pred0_stride2 = 4 << 1;
+  const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+  const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+  const ptrdiff_t dst_stride2 = dst_stride << 1;
+  const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+  const __m128i max = _mm_set1_epi16(kMax10bppSample);
+  const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+  int y = height;
+  do {
+    __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+    __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+    y -= 8;
+  } while (y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp_SSE4_1(
+    const void* LIBGAV1_RESTRICT prediction_0,
+    const void* LIBGAV1_RESTRICT prediction_1,
+    const ptrdiff_t prediction_stride_1,
+    const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+    const int width, const int height, void* LIBGAV1_RESTRICT dest,
+    const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  const ptrdiff_t pred_stride_0 = width;
+  const ptrdiff_t pred_stride_1 = prediction_stride_1;
+  if (width == 4) {
+    MaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+        dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+  const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+  const __m128i max = _mm_set1_epi16(kMax10bppSample);
+  const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride);
+      const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+      const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+      // 64 - mask
+      const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+      const __m128i compound_pred_lo_0 =
+          _mm_mullo_epi16(pred_val_0, pred_mask_0);
+      const __m128i compound_pred_hi_0 =
+          _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+      const __m128i compound_pred_lo_1 =
+          _mm_mullo_epi16(pred_val_1, pred_mask_1);
+      const __m128i compound_pred_hi_1 =
+          _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+      const __m128i pack0_lo =
+          _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+      const __m128i pack0_hi =
+          _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+      const __m128i pack1_lo =
+          _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+      const __m128i pack1_hi =
+          _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+      const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+      const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+
+      const __m128i sub_0 =
+          _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+      const __m128i sub_1 =
+          _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+      const __m128i shift_0 =
+          RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+      const __m128i shift_1 =
+          RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+      const __m128i result =
+          _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+      StoreUnaligned16(dst + x, result);
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += pred_stride_0;
+    pred_1 += pred_stride_1;
+    mask += mask_stride_ss;
+  } while (--y != 0);
+}
+
+inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
+    const uint16_t* LIBGAV1_RESTRICT prediction_0,
+    const uint16_t* LIBGAV1_RESTRICT prediction_1,
+    const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
+    const __m128i& pred_mask_1, const __m128i& shift6,
+    uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+  const __m128i pred_val_0 = LoadUnaligned16(prediction_0);
+  const __m128i pred_val_1 =
+      LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1);
+
+  const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+  const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+  const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+  const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+  const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+  const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+  const __m128i shift_0 =
+      RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+  const __m128i shift_1 =
+      RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+  const __m128i res = _mm_packus_epi32(shift_0, shift_1);
+  StoreLo8(dst, res);
+  StoreHi8(dst + dst_stride, res);
+}
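+
+// Scalar sketch of one inter-intra 10bpp pixel above (the same formula as
+// the 8bpp inter-intra path):
+//   dst[x] = RightShiftWithRounding(
+//       mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6);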
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+    const uint8_t* LIBGAV1_RESTRICT mask, const ptrdiff_t mask_stride,
+    uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+  __m128i pred_mask_0 =
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                              pred_mask_0, pred_mask_1, shift6,
+                                              dst, dst_stride);
+  pred_0 += 4 << 1;
+  pred_1 += pred_stride_1 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+  dst += dst_stride << 1;
+
+  pred_mask_0 =
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                              pred_mask_0, pred_mask_1, shift6,
+                                              dst, dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+    const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+    const int height, uint16_t* LIBGAV1_RESTRICT dst,
+    const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    InterIntraMaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+  const uint8_t pred0_stride2 = 4 << 1;
+  const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+  const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+  const ptrdiff_t dst_stride2 = dst_stride << 1;
+  int y = height;
+  do {
+    __m128i pred_mask_0 =
+        GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+    y -= 8;
+  } while (y != 0);
+}
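+
+// The loop above blends two rows per call and is unrolled 4x, so each
+// iteration consumes 8 rows; the height == 4 case is dispatched separately,
+// leaving |height| a multiple of 8 here.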
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp_SSE4_1(
+    const void* LIBGAV1_RESTRICT prediction_0,
+    const void* LIBGAV1_RESTRICT prediction_1,
+    const ptrdiff_t prediction_stride_1,
+    const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+    const int width, const int height, void* LIBGAV1_RESTRICT dest,
+    const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  const ptrdiff_t pred_stride_0 = width;
+  const ptrdiff_t pred_stride_1 = prediction_stride_1;
+  if (width == 4) {
+    InterIntraMaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+        dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+  const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride);
+      const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+      const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+      // 64 - mask
+      const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+      const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+      const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+      const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+      const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+      const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+      const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+      const __m128i shift_0 =
+          RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+      const __m128i shift_1 =
+          RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+      StoreUnaligned16(dst + x, _mm_packus_epi32(shift_0, shift_1));
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += pred_stride_0;
+    pred_1 += pred_stride_1;
+    mask += mask_stride_ss;
+  } while (--y != 0);
+}
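+
+// Per pixel, the inter-intra path above computes (with m the mask value
+// after any subsampling):
+//   dst[x] = (pred_0[x] * (64 - m) + pred_1[x] * m + 32) >> 6
+// _mm_packus_epi32 then saturates the 32-bit results to uint16_t.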
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend444)
+  dsp->mask_blend[0][0] = MaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend422)
+  dsp->mask_blend[1][0] = MaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend420)
+  dsp->mask_blend[2][0] = MaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra444)
+  dsp->mask_blend[0][1] = InterIntraMaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra422)
+  dsp->mask_blend[1][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra420)
+  dsp->mask_blend[2][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void MaskBlendInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void MaskBlendInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/mask_blend_sse4.h b/src/dsp/x86/mask_blend_sse4.h
new file mode 100644 (file)
index 0000000..4a95f0c
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend. This function is not thread-safe.
+void MaskBlendInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend444
+#define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend422
+#define LIBGAV1_Dsp10bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend420
+#define LIBGAV1_Dsp10bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
diff --git a/src/dsp/x86/motion_field_projection_sse4.cc b/src/dsp/x86/motion_field_projection_sse4.cc
new file mode 100644 (file)
index 0000000..5641531
--- /dev/null
@@ -0,0 +1,382 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline __m128i LoadDivision(const __m128i division_table,
+                            const __m128i reference_offset) {
+  const __m128i kOne = _mm_set1_epi16(0x0100);
+  const __m128i t = _mm_add_epi8(reference_offset, reference_offset);
+  const __m128i tt = _mm_unpacklo_epi8(t, t);
+  const __m128i idx = _mm_add_epi8(tt, kOne);
+  return _mm_shuffle_epi8(division_table, idx);
+}
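+
+// LoadDivision() gathers 16-bit entries from |division_table| with a single
+// pshufb: for a byte offset r, the two control bytes must be 2*r and 2*r + 1.
+// Doubling each offset and adding the (0x00, 0x01) byte pattern in |kOne|
+// builds exactly those indices. A scalar sketch (illustration only):
+//   for (int i = 0; i < 8; ++i) result[i] = division_table_s16[offset[i]];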
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+                            const int numerator) {
+  const __m128i m0 = _mm_madd_epi16(mv, denominator);
+  const __m128i m = _mm_mullo_epi32(m0, _mm_set1_epi32(numerator));
+  // Add the sign (0 or -1) so that halfway values round away from zero.
+  const __m128i sign = _mm_srai_epi32(m, 31);
+  const __m128i add_sign = _mm_add_epi32(m, sign);
+  const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+  return _mm_srai_epi32(sum, 14);
+}
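+
+// A scalar sketch (illustration only) of MvProjection() per 32-bit lane:
+//   int32_t m = mv * denominator * numerator;
+//   if (m < 0) --m;                 // bias so halfway values round away from 0
+//   return (m + (1 << 13)) >> 14;   // i.e. Round2Signed(m, 14)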
+
+inline __m128i MvProjectionClip(const __m128i mv, const __m128i denominator,
+                                const int numerator) {
+  const __m128i mv0 = _mm_unpacklo_epi16(mv, _mm_setzero_si128());
+  const __m128i mv1 = _mm_unpackhi_epi16(mv, _mm_setzero_si128());
+  const __m128i denorm0 = _mm_unpacklo_epi16(denominator, _mm_setzero_si128());
+  const __m128i denorm1 = _mm_unpackhi_epi16(denominator, _mm_setzero_si128());
+  const __m128i s0 = MvProjection(mv0, denorm0, numerator);
+  const __m128i s1 = MvProjection(mv1, denorm1, numerator);
+  const __m128i projection = _mm_packs_epi32(s0, s1);
+  const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+  const __m128i projection_mv_clamp_negative =
+      _mm_set1_epi16(-kProjectionMvClamp);
+  const __m128i clamp = _mm_min_epi16(projection, projection_mv_clamp);
+  return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i Project_SSE4_1(const __m128i delta, const __m128i dst_sign) {
+  // Add 63 to negative delta so that it shifts towards zero.
+  const __m128i delta_sign = _mm_srai_epi16(delta, 15);
+  const __m128i delta_sign_63 = _mm_srli_epi16(delta_sign, 10);
+  const __m128i delta_adjust = _mm_add_epi16(delta, delta_sign_63);
+  const __m128i offset0 = _mm_srai_epi16(delta_adjust, 6);
+  const __m128i offset1 = _mm_xor_si128(offset0, dst_sign);
+  return _mm_sub_epi16(offset1, dst_sign);
+}
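+
+// A scalar sketch (illustration only) of Project_SSE4_1() per 16-bit lane:
+//   int offset = (delta + ((delta < 0) ? 63 : 0)) >> 6;  // delta / 64, towards 0
+//   return (offset ^ dst_sign) - dst_sign;  // negate when dst_sign == -1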
+
+inline void GetPosition(
+    const __m128i division_table, const MotionVector* const mv,
+    const int numerator, const int x8_start, const int x8_end, const int x8,
+    const __m128i& r_offsets, const __m128i& source_reference_type8,
+    const __m128i& skip_r, const __m128i& y8_floor8, const __m128i& y8_ceiling8,
+    const __m128i& d_sign, const int delta, __m128i* const r,
+    __m128i* const position_xy, int64_t* const skip_64, __m128i mvs[2]) {
+  const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+  *r = _mm_shuffle_epi8(r_offsets, source_reference_type8);
+  const __m128i denorm = LoadDivision(division_table, source_reference_type8);
+  __m128i projection_mv[2];
+  mvs[0] = LoadUnaligned16(mv_int + 0);
+  mvs[1] = LoadUnaligned16(mv_int + 4);
+  // Deinterleave the x and y components.
+  const __m128i kShuffle =
+      _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+  const __m128i mv0 = _mm_shuffle_epi8(mvs[0], kShuffle);
+  const __m128i mv1 = _mm_shuffle_epi8(mvs[1], kShuffle);
+  const __m128i mv_y = _mm_unpacklo_epi64(mv0, mv1);
+  const __m128i mv_x = _mm_unpackhi_epi64(mv0, mv1);
+  // numerator could be 0.
+  projection_mv[0] = MvProjectionClip(mv_y, denorm, numerator);
+  projection_mv[1] = MvProjectionClip(mv_x, denorm, numerator);
+  // Do not update the motion vector if the block position is not valid or
+  // if position_x8 is outside the current range of x8_start and x8_end.
+  // Note that position_y8 will always be within the range of y8_start and
+  // y8_end.
+  // After subtracting the base, valid projections are within 8-bit.
+  const __m128i position_y = Project_SSE4_1(projection_mv[0], d_sign);
+  const __m128i position_x = Project_SSE4_1(projection_mv[1], d_sign);
+  const __m128i positions = _mm_packs_epi16(position_x, position_y);
+  const __m128i k01234567 =
+      _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+  *position_xy = _mm_add_epi8(positions, k01234567);
+  const int x8_floor = std::max(
+      x8_start - x8, delta - kProjectionMvMaxHorizontalOffset);  // [-8, 8]
+  const int x8_ceiling =
+      std::min(x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset) -
+      1;  // [-1, 15]
+  const __m128i x8_floor8 = _mm_set1_epi8(x8_floor);
+  const __m128i x8_ceiling8 = _mm_set1_epi8(x8_ceiling);
+  const __m128i floor_xy = _mm_unpacklo_epi64(x8_floor8, y8_floor8);
+  const __m128i ceiling_xy = _mm_unpacklo_epi64(x8_ceiling8, y8_ceiling8);
+  const __m128i underflow = _mm_cmplt_epi8(*position_xy, floor_xy);
+  const __m128i overflow = _mm_cmpgt_epi8(*position_xy, ceiling_xy);
+  const __m128i out = _mm_or_si128(underflow, overflow);
+  const __m128i skip_low = _mm_or_si128(skip_r, out);
+  const __m128i skip = _mm_or_si128(skip_low, _mm_srli_si128(out, 8));
+  StoreLo8(skip_64, skip);
+}
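+
+// GetPosition() packs the eight projected x coordinates into the low half of
+// |position_xy| and the eight y coordinates into the high half, so one pair
+// of byte comparisons range-checks both axes; OR-ing the high half of |out|
+// onto the low half folds the y result into the per-column skip bytes.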
+
+template <int idx>
+inline void Store(const __m128i position, const __m128i reference_offset,
+                  const __m128i mv, int8_t* dst_reference_offset,
+                  MotionVector* dst_mv) {
+  const ptrdiff_t offset =
+      static_cast<int16_t>(_mm_extract_epi16(position, idx));
+  if ((idx & 3) == 0) {
+    dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_cvtsi128_si32(mv));
+  } else {
+    dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_extract_epi32(mv, idx & 3));
+  }
+  dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const __m128i position,
+                       const __m128i reference_offset, const __m128i mv,
+                       int8_t* dst_reference_offset, MotionVector* dst_mv) {
+  if (skips[idx] == 0) {
+    Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+  }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_SSE4_1(
+    const ReferenceInfo& reference_info,
+    const int reference_to_current_with_sign, const int dst_sign,
+    const int y8_start, const int y8_end, const int x8_start, const int x8_end,
+    TemporalMotionField* const motion_field) {
+  const ptrdiff_t stride = motion_field->mv.columns();
+  // The column range has to be widened by kProjectionMvMaxHorizontalOffset on
+  // each side, since coordinates in that extra range can become position_x8
+  // after projection.
+  const int adjusted_x8_start =
+      std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+  const int adjusted_x8_end = std::min(
+      x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+  const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+  const int leftover = adjusted_x8_end - adjusted_x8_end8;
+  const int8_t* const reference_offsets =
+      reference_info.relative_distance_to.data();
+  const bool* const skip_references = reference_info.skip_references.data();
+  const int16_t* const projection_divisions =
+      reference_info.projection_divisions.data();
+  const ReferenceFrameType* source_reference_types =
+      &reference_info.motion_field_reference_frame[y8_start][0];
+  const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+  int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+  MotionVector* dst_mv = motion_field->mv[y8_start];
+  const __m128i d_sign = _mm_set1_epi16(dst_sign);
+
+  static_assert(sizeof(int8_t) == sizeof(bool), "");
+  static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+  static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+  assert(dst_sign == 0 || dst_sign == -1);
+  assert(stride == motion_field->reference_offset.columns());
+  assert((y8_start & 7) == 0);
+  assert((adjusted_x8_start & 7) == 0);
+  // The final position calculation is represented with int16_t. Valid
+  // position_y8 from its base is at most 7. After considering the horizontal
+  // offset, which is at most |stride - 1|, we have the following assertion,
+  // which means this optimization works for frame widths up to 32K (each
+  // position is an 8x8 block).
+  assert(8 * stride <= 32768);
+  const __m128i skip_reference = LoadLo8(skip_references);
+  const __m128i r_offsets = LoadLo8(reference_offsets);
+  const __m128i division_table = LoadUnaligned16(projection_divisions);
+
+  int y8 = y8_start;
+  do {
+    const int y8_floor = (y8 & ~7) - y8;                             // [-7, 0]
+    const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8) - 1;  // [0, 7]
+    const __m128i y8_floor8 = _mm_set1_epi8(y8_floor);
+    const __m128i y8_ceiling8 = _mm_set1_epi8(y8_ceiling);
+    int x8;
+
+    for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+      const __m128i source_reference_type8 =
+          LoadLo8(source_reference_types + x8);
+      const __m128i skip_r =
+          _mm_shuffle_epi8(skip_reference, source_reference_type8);
+      int64_t early_skip;
+      StoreLo8(&early_skip, skip_r);
+      // Early termination #1 if all are skips. Chance is typically ~30-40%.
+      if (early_skip == -1) continue;
+      int64_t skip_64;
+      __m128i r, position_xy, mvs[2];
+      GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+                  x8_end, x8, r_offsets, source_reference_type8, skip_r,
+                  y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_xy, &skip_64,
+                  mvs);
+      // Early termination #2 if all are skips.
+      // Chance is typically ~15-25% after Early termination #1.
+      if (skip_64 == -1) continue;
+      const __m128i p_y = _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+      const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+      const __m128i p_y_offset = _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+      const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+      const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+      if (skip_64 == 0) {
+        // Store all. Chance is typically ~70-85% after Early termination #2.
+        Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+      } else {
+        // Check and store each.
+        // Chance is typically ~15-30% after Early termination #2.
+        // The compiler is smart enough to not create the local buffer skips[].
+        int8_t skips[8];
+        memcpy(skips, &skip_64, sizeof(skips));
+        CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+      }
+    }
+
+    // The following leftover processing cannot be moved out of the do...while
+    // loop. Doing so could change the order in which results are stored to
+    // the same position.
+    if (leftover > 0) {
+      // Use SIMD only when leftover is at least 4, and there are at least 8
+      // elements in a row.
+      if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+        // Process the last 8 elements to avoid loading invalid memory. Some
+        // elements may have been processed in the above loop, which is OK.
+        const int delta = 8 - leftover;
+        x8 = adjusted_x8_end - 8;
+        const __m128i source_reference_type8 =
+            LoadLo8(source_reference_types + x8);
+        const __m128i skip_r =
+            _mm_shuffle_epi8(skip_reference, source_reference_type8);
+        int64_t early_skip;
+        StoreLo8(&early_skip, skip_r);
+        // Early termination #1 if all are skips.
+        if (early_skip != -1) {
+          int64_t skip_64;
+          __m128i r, position_xy, mvs[2];
+          GetPosition(division_table, mv, reference_to_current_with_sign,
+                      x8_start, x8_end, x8, r_offsets, source_reference_type8,
+                      skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+                      &position_xy, &skip_64, mvs);
+          // Early termination #2 if all are skips.
+          if (skip_64 != -1) {
+            const __m128i p_y =
+                _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+            const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+            const __m128i p_y_offset =
+                _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+            const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+            const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+            // Store up to 7 elements since leftover is at most 7.
+            if (skip_64 == 0) {
+              // Store all.
+              Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+            } else {
+              // Check and store each.
+              // The compiler is smart enough to not create the local buffer
+              // skips[].
+              int8_t skips[8];
+              memcpy(skips, &skip_64, sizeof(skips));
+              CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
+                            dst_mv);
+              CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
+                            dst_mv);
+              CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
+                            dst_mv);
+              CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
+                            dst_mv);
+              CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
+                            dst_mv);
+              CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
+                            dst_mv);
+              CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
+                            dst_mv);
+            }
+          }
+        }
+      } else {
+        for (; x8 < adjusted_x8_end; ++x8) {
+          const int source_reference_type = source_reference_types[x8];
+          if (skip_references[source_reference_type]) continue;
+          MotionVector projection_mv;
+          // reference_to_current_with_sign could be 0.
+          GetMvProjection(mv[x8], reference_to_current_with_sign,
+                          projection_divisions[source_reference_type],
+                          &projection_mv);
+          // Do not update the motion vector if the block position is not valid
+          // or if position_x8 is outside the current range of x8_start and
+          // x8_end. Note that position_y8 will always be within the range of
+          // y8_start and y8_end.
+          const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+          if (position_y8 < y8_floor || position_y8 > y8_ceiling) continue;
+          const int x8_base = x8 & ~7;
+          const int x8_floor =
+              std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+          const int x8_ceiling =
+              std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+          const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+          if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+          dst_mv[position_y8 * stride + position_x8] = mv[x8];
+          dst_reference_offset[position_y8 * stride + position_x8] =
+              reference_offsets[source_reference_type];
+        }
+      }
+    }
+
+    source_reference_types += stride;
+    mv += stride;
+    dst_reference_offset += stride;
+    dst_mv += stride;
+  } while (++y8 < y8_end);
+}
+
+}  // namespace
+
+void MotionFieldProjectionInit_SSE4_1() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionFieldProjectionInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/motion_field_projection_sse4.h b/src/dsp/x86/motion_field_projection_sse4.h
new file mode 100644 (file)
index 0000000..c05422c
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc
new file mode 100644 (file)
index 0000000..dacc6ec
--- /dev/null
@@ -0,0 +1,251 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = {
+    0,    16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+    1489, 1365,  1260, 1170, 1092, 1024, 963,  910,  862,  819,  780,
+    744,  712,   682,  655,  630,  606,  585,  564,  546,  528};
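+
+// Each entry d of this table is (1 << 14) / d, truncated; entry 0 is an
+// unused placeholder. It mirrors the 16-bit kProjectionMvDivisionLookup used
+// below, widened to 32 bits so entries can be fed to _mm_insert_epi32
+// directly in MvProjectionSingleClip().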
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+                            const __m128i numerator) {
+  const __m128i m0 = _mm_madd_epi16(mv, denominator);
+  const __m128i m = _mm_mullo_epi32(m0, numerator);
+  // Add the sign (0 or -1) so that halfway values round away from zero.
+  const __m128i sign = _mm_srai_epi32(m, 31);
+  const __m128i add_sign = _mm_add_epi32(m, sign);
+  const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+  return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mvs[2],
+                                const __m128i denominators[2],
+                                const __m128i numerator) {
+  const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator);
+  const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator);
+  const __m128i mv = _mm_packs_epi32(s0, s1);
+  const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+  const __m128i projection_mv_clamp_negative =
+      _mm_set1_epi16(-kProjectionMvClamp);
+  const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp);
+  return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i MvProjectionCompoundClip(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t temporal_reference_offsets[2],
+    const int reference_offsets[2]) {
+  const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+  const __m128i temporal_mv = LoadLo8(tmvs);
+  const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv);
+  __m128i mvs[2], denominators[2];
+  mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0);
+  mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0);
+  denominators[0] = _mm_set1_epi32(
+      kProjectionMvDivisionLookup[temporal_reference_offsets[0]]);
+  denominators[1] = _mm_set1_epi32(
+      kProjectionMvDivisionLookup[temporal_reference_offsets[1]]);
+  const __m128i offsets = LoadLo8(reference_offsets);
+  const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets);
+  return MvProjectionClip(mvs, denominators, numerator);
+}
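+
+// Note that _mm_cvtepu16_epi32 above leaves each 16-bit component in the low
+// half of a 32-bit lane with zeros above it, so _mm_madd_epi16 inside
+// MvProjection() multiplies the signed component by the 16-bit divisor and
+// adds 0 * 0, yielding the full 32-bit product in a single instruction.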
+
+inline __m128i MvProjectionSingleClip(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offset) {
+  const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+  const __m128i temporal_mv = LoadAligned16(tmvs);
+  __m128i lookup = _mm_cvtsi32_si128(
+      kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]);
+  lookup = _mm_insert_epi32(
+      lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]],
+      1);
+  lookup = _mm_insert_epi32(
+      lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]],
+      2);
+  lookup = _mm_insert_epi32(
+      lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]],
+      3);
+  __m128i mvs[2], denominators[2];
+  mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128());
+  mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128());
+  denominators[0] = _mm_unpacklo_epi32(lookup, lookup);
+  denominators[1] = _mm_unpackhi_epi32(lookup, lookup);
+  const __m128i numerator = _mm_set1_epi32(reference_offset);
+  return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline void LowPrecision(const __m128i mv, void* const candidate_mvs) {
+  const __m128i kRoundDownMask = _mm_set1_epi16(~1);
+  const __m128i sign = _mm_srai_epi16(mv, 15);
+  const __m128i sub_sign = _mm_sub_epi16(mv, sign);
+  const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask);
+  StoreAligned16(candidate_mvs, d);
+}
+
+inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
+  const __m128i kRoundDownMask = _mm_set1_epi16(~7);
+  const __m128i sign = _mm_srai_epi16(mv, 15);
+  const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3));
+  const __m128i mv2 = _mm_sub_epi16(mv1, sign);
+  const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask);
+  StoreAligned16(candidate_mvs, mv3);
+}
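+
+// Scalar sketches (illustration only) of the two helpers above, per int16_t
+// 1/8-pel component |mv| (mv >> 15 is 0 for non-negative, -1 for negative):
+//   LowPrecision:  out = (mv - (mv >> 15)) & ~1;  // towards 0, keeps 1/4 pel
+//   ForceInteger:  out = (mv + 3 - (mv >> 15)) & ~7;  // nearest pel, ties to 0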
+
+void MvProjectionCompoundLowPrecision_SSE4_1(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionCompoundClip(
+        temporal_mvs + i, temporal_reference_offsets + i, offsets);
+    LowPrecision(mv, candidate_mvs + i);
+    i += 2;
+  } while (i < count);
+}
+
+void MvProjectionCompoundForceInteger_SSE4_1(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionCompoundClip(
+        temporal_mvs + i, temporal_reference_offsets + i, offsets);
+    ForceInteger(mv, candidate_mvs + i);
+    i += 2;
+  } while (i < count);
+}
+
+void MvProjectionCompoundHighPrecision_SSE4_1(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionCompoundClip(
+        temporal_mvs + i, temporal_reference_offsets + i, offsets);
+    StoreAligned16(candidate_mvs + i, mv);
+    i += 2;
+  } while (i < count);
+}
+
+void MvProjectionSingleLowPrecision_SSE4_1(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // Up to three more elements could be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionSingleClip(
+        temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+    LowPrecision(mv, candidate_mvs + i);
+    i += 4;
+  } while (i < count);
+}
+
+void MvProjectionSingleForceInteger_SSE4_1(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // Up to three more elements could be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionSingleClip(
+        temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+    ForceInteger(mv, candidate_mvs + i);
+    i += 4;
+  } while (i < count);
+}
+
+void MvProjectionSingleHighPrecision_SSE4_1(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // Up to three more elements could be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionSingleClip(
+        temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+    StoreAligned16(candidate_mvs + i, mv);
+    i += 4;
+  } while (i < count);
+}
+
+}  // namespace
+
+void MotionVectorSearchInit_SSE4_1() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+  dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+  dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+  dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+  dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+  dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/motion_vector_search_sse4.h b/src/dsp/x86/motion_vector_search_sse4.h
new file mode 100644 (file)
index 0000000..d65b392
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionVectorSearch
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc
new file mode 100644 (file)
index 0000000..f068ff3
--- /dev/null
@@ -0,0 +1,584 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
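+// Every 8bpp blend below computes, per pixel, with weight m taken from
+// kObmcMask (values are at most 64):
+//   pred[x] = (pred[x] * m + obmc_pred[x] * (64 - m) + 32) >> 6
+// _mm_maddubs_epi16 evaluates both products and their sum in one step on
+// byte-interleaved inputs.
+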
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+    uint8_t* LIBGAV1_RESTRICT const prediction,
+    const ptrdiff_t prediction_stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_prediction_stride = 2;
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+  const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
+  // 64 - mask
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+  int y = height;
+  do {
+    const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
+    const __m128i obmc_pred_val = Load4(obmc_pred);
+
+    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+    const __m128i packed_result = _mm_packus_epi16(result, result);
+    Store2(pred, packed_result);
+    pred += prediction_stride;
+    const int16_t second_row_result = _mm_extract_epi16(packed_result, 1);
+    memcpy(pred, &second_row_result, sizeof(second_row_result));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
+    y -= 2;
+  } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+    uint8_t* LIBGAV1_RESTRICT const prediction,
+    const ptrdiff_t prediction_stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_prediction_stride = 4;
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+  const __m128i mask_val = Load4(kObmcMask + 2);
+  // 64 - mask
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  // Duplicate first half of vector.
+  const __m128i masks =
+      _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44);
+  int y = height;
+  do {
+    const __m128i pred_val0 = Load4(pred);
+    pred += prediction_stride;
+
+    // Place the second row of each source in the second four bytes.
+    const __m128i pred_val =
+        _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+    const __m128i packed_result = _mm_packus_epi16(result, result);
+    Store4(pred - prediction_stride, packed_result);
+    const int second_row_result = _mm_extract_epi32(packed_result, 1);
+    memcpy(pred, &second_row_result, sizeof(second_row_result));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
+    y -= 2;
+  } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft8xH_SSE4_1(
+    uint8_t* LIBGAV1_RESTRICT const prediction,
+    const ptrdiff_t prediction_stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_prediction_stride = 8;
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const __m128i mask_val = LoadLo8(kObmcMask + 6);
+  // 64 - mask
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+  int y = height;
+  do {
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+    const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+    const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result_lo =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
+
+    const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+    const __m128i result_hi =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
+
+    const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+    StoreLo8(pred, result);
+    pred += prediction_stride;
+    StoreHi8(pred, result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
+    y -= 2;
+  } while (y != 0);
+}
+
+void OverlapBlendFromLeft_SSE4_1(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  assert(width >= 2);
+  assert(height >= 4);
+
+  if (width == 2) {
+    OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
+    return;
+  }
+  if (width == 8) {
+    OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const uint8_t* mask = kObmcMask + width - 2;
+  int x = 0;
+  do {
+    pred = static_cast<uint8_t*>(prediction) + x;
+    obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+    const __m128i mask_val = LoadUnaligned16(mask + x);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val);
+
+    int y = 0;
+    do {
+      const __m128i pred_val = LoadUnaligned16(pred);
+      const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+      const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+      const __m128i result_lo =
+          RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6);
+      const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+      const __m128i result_hi =
+          RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6);
+      StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi));
+
+      pred += prediction_stride;
+      obmc_pred += obmc_prediction_stride;
+    } while (++y < height);
+    x += 16;
+  } while (x < width);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(
+    uint8_t* LIBGAV1_RESTRICT const prediction,
+    const ptrdiff_t prediction_stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_prediction_stride = 4;
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+  const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+
+  const uint8_t* mask = kObmcMask + height - 2;
+  const int compute_height = height - (height >> 2);
+  int y = 0;
+  do {
+    // First mask in the first half, second mask in the second half.
+    const __m128i mask_val = _mm_shuffle_epi8(
+        _mm_cvtsi32_si128(*reinterpret_cast<const uint16_t*>(mask + y)),
+        mask_shuffler);
+    const __m128i masks =
+        _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+    const __m128i pred_val0 = Load4(pred);
+
+    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+    pred += prediction_stride;
+    const __m128i pred_val =
+        _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+    const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+    const __m128i packed_result = _mm_packus_epi16(result, result);
+    Store4(pred - prediction_stride, packed_result);
+    Store4(pred, _mm_srli_si128(packed_result, 4));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
+    y += 2;
+  } while (y < compute_height);
+}
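+
+// The preinverter trick above: _mm_sign_epi8 negates every second mask byte,
+// so subtracting from the (64, 0) byte pattern in |mask_inverter| yields the
+// interleaved (64 - m, m) weights that _mm_maddubs_epi16 needs for
+// obmc_pred * (64 - m) + pred * m.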
+
+inline void OverlapBlendFromTop8xH_SSE4_1(
+    uint8_t* LIBGAV1_RESTRICT const prediction,
+    const ptrdiff_t prediction_stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_prediction_stride = 8;
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const uint8_t* mask = kObmcMask + height - 2;
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const int compute_height = height - (height >> 2);
+  int y = compute_height;
+  do {
+    const __m128i mask_val0 = _mm_set1_epi8(mask[compute_height - y]);
+    // 64 - mask
+    const __m128i obmc_mask_val0 = _mm_sub_epi8(mask_inverter, mask_val0);
+    const __m128i masks0 = _mm_unpacklo_epi8(mask_val0, obmc_mask_val0);
+
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+    const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+    const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result_lo =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks0), 6);
+
+    --y;
+    const __m128i mask_val1 = _mm_set1_epi8(mask[compute_height - y]);
+    // 64 - mask
+    const __m128i obmc_mask_val1 = _mm_sub_epi8(mask_inverter, mask_val1);
+    const __m128i masks1 = _mm_unpacklo_epi8(mask_val1, obmc_mask_val1);
+
+    const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+    const __m128i result_hi =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks1), 6);
+
+    const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+    StoreLo8(pred, result);
+    pred += prediction_stride;
+    StoreHi8(pred, result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
+  } while (--y > 0);
+}
+
+void OverlapBlendFromTop_SSE4_1(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  assert(width >= 4);
+  assert(height >= 2);
+
+  if (width == 4) {
+    OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
+    return;
+  }
+  if (width == 8) {
+    OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
+    return;
+  }
+
+  // Stop when mask value becomes 64.
+  const int compute_height = height - (height >> 2);
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  int y = 0;
+  const uint8_t* mask = kObmcMask + height - 2;
+  do {
+    const __m128i mask_val = _mm_set1_epi8(mask[y]);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    int x = 0;
+    do {
+      const __m128i pred_val = LoadUnaligned16(pred + x);
+      const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
+      const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+      const __m128i result_lo =
+          RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
+      const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+      const __m128i result_hi =
+          RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
+      StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi));
+      x += 16;
+    } while (x < width);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+  } while (++y < compute_height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical)
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal)
+  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+constexpr int kRoundBitsObmcBlend = 6;
+
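+// The 10bpp paths below use the same weighting as the 8bpp code but on
+// 16-bit pixels: _mm_madd_epi16 on word-interleaved (pred, obmc_pred) and
+// (m, 64 - m) inputs produces pred * m + obmc_pred * (64 - m) per 32-bit
+// lane, which is rounded by kRoundBitsObmcBlend and saturated to uint16_t by
+// _mm_packus_epi32.
+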
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+    uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
+    const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_pred_stride = 2;
+  uint16_t* pred = prediction;
+  const uint16_t* obmc_pred = obmc_prediction;
+  const ptrdiff_t pred_stride2 = pred_stride << 1;
+  const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+  const __m128i mask_val = _mm_shufflelo_epi16(Load2(kObmcMask), 0x00);
+  // 64 - mask.
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks =
+      _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+  int y = height;
+  do {
+    const __m128i pred_val = Load4x2(pred, pred + pred_stride);
+    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+    const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+    const __m128i result = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend);
+    const __m128i packed_result = _mm_packus_epi32(result, result);
+    Store4(pred, packed_result);
+    Store4(pred + pred_stride, _mm_srli_si128(packed_result, 4));
+    pred += pred_stride2;
+    obmc_pred += obmc_pred_stride2;
+    y -= 2;
+  } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+    uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
+    const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_pred_stride = 4;
+  uint16_t* pred = prediction;
+  const uint16_t* obmc_pred = obmc_prediction;
+  const ptrdiff_t pred_stride2 = pred_stride << 1;
+  const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+  const __m128i mask_val = Load4(kObmcMask + 2);
+  // 64 - mask.
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks =
+      _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+  int y = height;
+  do {
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+    const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+    const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+    const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+    const __m128i result_lo = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_lo, masks), kRoundBitsObmcBlend);
+    const __m128i result_hi = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_hi, masks), kRoundBitsObmcBlend);
+    const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+    StoreLo8(pred, packed_result);
+    StoreHi8(pred + pred_stride, packed_result);
+    pred += pred_stride2;
+    obmc_pred += obmc_pred_stride2;
+    y -= 2;
+  } while (y != 0);
+}
+
+void OverlapBlendFromLeft10bpp_SSE4_1(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint16_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+  const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+  const ptrdiff_t obmc_pred_stride =
+      obmc_prediction_stride / sizeof(obmc_pred[0]);
+  assert(width >= 2);
+  assert(height >= 4);
+
+  if (width == 2) {
+    OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi8(64);
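+  // kObmcMask stores the mask rows for sizes 2, 4, 8, 16 and 32 back to
+  // back, so the row for size n starts at offset n - 2 (size 2 at 0, 4 at 2,
+  // 8 at 6, 16 at 14, 32 at 30).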
+  const uint8_t* mask = kObmcMask + width - 2;
+  int x = 0;
+  do {
+    pred = static_cast<uint16_t*>(prediction) + x;
+    obmc_pred = static_cast<const uint16_t*>(obmc_prediction) + x;
+    const __m128i mask_val = LoadLo8(mask + x);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+    const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+    int y = height;
+    do {
+      const __m128i pred_val = LoadUnaligned16(pred);
+      const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+      const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+      const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+      const __m128i result_lo = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+      const __m128i result_hi = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+      StoreUnaligned16(pred, _mm_packus_epi32(result_lo, result_hi));
+
+      pred += pred_stride;
+      obmc_pred += obmc_pred_stride;
+    } while (--y != 0);
+    x += 8;
+  } while (x < width);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(
+    uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
+    const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_pred_stride = 4;
+  uint16_t* pred = prediction;
+  const uint16_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+  const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+  const uint8_t* mask = kObmcMask + height - 2;
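+  // The mask is 64 in the bottom quarter of the block, where the blend
+  // (pred * 64 + obmc_pred * 0 + 32) >> 6 leaves |pred| unchanged, so only
+  // the top three quarters of the rows need to be computed.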
+  const int compute_height = height - (height >> 2);
+  const ptrdiff_t pred_stride2 = pred_stride << 1;
+  const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+  int y = 0;
+  do {
+    // First mask in the first half, second mask in the second half.
+    const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler);
+    const __m128i masks =
+        _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+    const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+    const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+    const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+    const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
+    const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
+    const __m128i result_lo = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+    const __m128i result_hi = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+    const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+
+    StoreLo8(pred, packed_result);
+    StoreHi8(pred + pred_stride, packed_result);
+    pred += pred_stride2;
+    obmc_pred += obmc_pred_stride2;
+    y += 2;
+  } while (y < compute_height);
+}
+
+void OverlapBlendFromTop10bpp_SSE4_1(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint16_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+  const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+  const ptrdiff_t obmc_pred_stride =
+      obmc_prediction_stride / sizeof(obmc_pred[0]);
+  assert(width >= 4);
+  assert(height >= 2);
+
+  if (width == 4) {
+    OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
+    return;
+  }
+
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const int compute_height = height - (height >> 2);
+  const uint8_t* mask = kObmcMask + height - 2;
+  pred = static_cast<uint16_t*>(prediction);
+  obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+  int y = 0;
+  do {
+    const __m128i mask_val = _mm_set1_epi8(mask[y]);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+    const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+    int x = 0;
+    do {
+      const __m128i pred_val = LoadUnaligned16(pred + x);
+      const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
+      const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+      const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+      const __m128i result_lo = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+      const __m128i result_hi = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+      StoreUnaligned16(pred + x, _mm_packus_epi32(result_lo, result_hi));
+      x += 8;
+    } while (x < width);
+    pred += pred_stride;
+    obmc_pred += obmc_pred_stride;
+  } while (++y < compute_height);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcVertical)
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop10bpp_SSE4_1;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcHorizontal)
+  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft10bpp_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void ObmcInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void ObmcInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/obmc_sse4.h b/src/dsp/x86/obmc_sse4.h
new file mode 100644 (file)
index 0000000..448d2cf
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend[]. This function is not thread-safe.
+void ObmcInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If SSE4.1 is targeted and a function's baseline macro has not already been
+// set by a higher optimization level, signal that the SSE4.1 implementation
+// should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
+#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
+#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcVertical
+#define LIBGAV1_Dsp10bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal
+#define LIBGAV1_Dsp10bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
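+
+// For example, if an AVX2 version of ObmcVertical were also compiled in and
+// had already defined LIBGAV1_Dsp8bpp_ObmcVertical as LIBGAV1_CPU_AVX2, the
+// #ifndef above would leave that definition alone, the corresponding
+// DSP_ENABLED_8BPP_SSE4_1(ObmcVertical) check would not match
+// LIBGAV1_CPU_SSE4_1, and Init8bpp() would skip the SSE4.1 pointer.
+// (Illustrative scenario only; no AVX2 OBMC implementation is implied.)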
+
+#endif  // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc
new file mode 100644 (file)
index 0000000..458d94e
--- /dev/null
@@ -0,0 +1,323 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
+// Stored negated so that the largest tap (128) fits in int8.
+alignas(16) const int8_t
+    kNegativeUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+        {0, 0, 0, -128, 0, 0, 0, 0},       {0, 0, 1, -128, -2, 1, 0, 0},
+        {0, -1, 3, -127, -4, 2, -1, 0},    {0, -1, 4, -127, -6, 3, -1, 0},
+        {0, -2, 6, -126, -8, 3, -1, 0},    {0, -2, 7, -125, -11, 4, -1, 0},
+        {1, -2, 8, -125, -13, 5, -2, 0},   {1, -3, 9, -124, -15, 6, -2, 0},
+        {1, -3, 10, -123, -18, 6, -2, 1},  {1, -3, 11, -122, -20, 7, -3, 1},
+        {1, -4, 12, -121, -22, 8, -3, 1},  {1, -4, 13, -120, -25, 9, -3, 1},
+        {1, -4, 14, -118, -28, 9, -3, 1},  {1, -4, 15, -117, -30, 10, -4, 1},
+        {1, -5, 16, -116, -32, 11, -4, 1}, {1, -5, 16, -114, -35, 12, -4, 1},
+        {1, -5, 17, -112, -38, 12, -4, 1}, {1, -5, 18, -111, -40, 13, -5, 1},
+        {1, -5, 18, -109, -43, 14, -5, 1}, {1, -6, 19, -107, -45, 14, -5, 1},
+        {1, -6, 19, -105, -48, 15, -5, 1}, {1, -6, 19, -103, -51, 16, -5, 1},
+        {1, -6, 20, -101, -53, 16, -6, 1}, {1, -6, 20, -99, -56, 17, -6, 1},
+        {1, -6, 20, -97, -58, 17, -6, 1},  {1, -6, 20, -95, -61, 18, -6, 1},
+        {2, -7, 20, -93, -64, 18, -6, 2},  {2, -7, 20, -91, -66, 19, -6, 1},
+        {2, -7, 20, -88, -69, 19, -6, 1},  {2, -7, 20, -86, -71, 19, -6, 1},
+        {2, -7, 20, -84, -74, 20, -7, 2},  {2, -7, 20, -81, -76, 20, -7, 1},
+        {2, -7, 20, -79, -79, 20, -7, 2},  {1, -7, 20, -76, -81, 20, -7, 2},
+        {2, -7, 20, -74, -84, 20, -7, 2},  {1, -6, 19, -71, -86, 20, -7, 2},
+        {1, -6, 19, -69, -88, 20, -7, 2},  {1, -6, 19, -66, -91, 20, -7, 2},
+        {2, -6, 18, -64, -93, 20, -7, 2},  {1, -6, 18, -61, -95, 20, -6, 1},
+        {1, -6, 17, -58, -97, 20, -6, 1},  {1, -6, 17, -56, -99, 20, -6, 1},
+        {1, -6, 16, -53, -101, 20, -6, 1}, {1, -5, 16, -51, -103, 19, -6, 1},
+        {1, -5, 15, -48, -105, 19, -6, 1}, {1, -5, 14, -45, -107, 19, -6, 1},
+        {1, -5, 14, -43, -109, 18, -5, 1}, {1, -5, 13, -40, -111, 18, -5, 1},
+        {1, -4, 12, -38, -112, 17, -5, 1}, {1, -4, 12, -35, -114, 16, -5, 1},
+        {1, -4, 11, -32, -116, 16, -5, 1}, {1, -4, 10, -30, -117, 15, -4, 1},
+        {1, -3, 9, -28, -118, 14, -4, 1},  {1, -3, 9, -25, -120, 13, -4, 1},
+        {1, -3, 8, -22, -121, 12, -4, 1},  {1, -3, 7, -20, -122, 11, -3, 1},
+        {1, -2, 6, -18, -123, 10, -3, 1},  {0, -2, 6, -15, -124, 9, -3, 1},
+        {0, -2, 5, -13, -125, 8, -2, 1},   {0, -1, 4, -11, -125, 7, -2, 0},
+        {0, -1, 3, -8, -126, 6, -2, 0},    {0, -1, 3, -6, -127, 4, -1, 0},
+        {0, -1, 2, -4, -127, 3, -1, 0},    {0, 0, 1, -2, -128, 1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+                                 const int initial_subpixel_x, const int step,
+                                 void* const coefficients) {
+  auto* dst = static_cast<uint8_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 4);
+  do {
+    for (int i = 0; i < 8; ++i, dst += 16) {
+      int remainder = subpixel_x & kSuperResScaleMask;
+      __m128i filter =
+          LoadLo8(kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      remainder = subpixel_x & kSuperResScaleMask;
+      filter = LoadHi8(filter,
+                       kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      StoreAligned16(dst, filter);
+    }
+  } while (--x != 0);
+}
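+
+// Illustrative helper showing the phase selection performed above (a sketch
+// for reference only; the library never calls it). |subpixel_x| is a fixed
+// point position with kSuperResScaleBits fractional bits; the top bits of
+// the fraction pick one of the kSuperResFilterShifts filter phases.
+inline int SuperResFilterPhase(const int subpixel_x) {
+  return (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
+}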
+
+void SuperRes_SSE4_1(const void* LIBGAV1_RESTRICT const coefficients,
+                     void* LIBGAV1_RESTRICT const source,
+                     const ptrdiff_t source_stride, const int height,
+                     const int downscaled_width, const int upscaled_width,
+                     const int initial_subpixel_x, const int step,
+                     void* LIBGAV1_RESTRICT const dest,
+                     const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint8_t*>(coefficients);
+    uint8_t* dst_ptr = dst;
+    ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                        kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    int subpixel_x = initial_subpixel_x;
+    // The code below calculates up to 15 extra upscaled pixels, which will
+    // over-read up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalPadding keeps this over-read within the padded
+    // buffer, protecting against segmentation faults and data races.
+    int x = RightShiftWithCeiling(upscaled_width, 4);
+    do {
+      __m128i weighted_src[8];
+      for (int i = 0; i < 8; ++i, filter += 16) {
+        // TODO(b/178652672): Remove Msan loads when hadd bug is resolved.
+        // It's fine to write uninitialized bytes outside the frame, but the
+        // inside-frame pixels are incorrectly labeled uninitialized if
+        // uninitialized values go through the hadd intrinsics.
+        // |src| is offset 4 pixels to the left, and there are 4 extended border
+        // pixels, so a difference of 0 from |downscaled_width| indicates 8 good
+        // bytes. A difference of 1 indicates 7 good bytes.
+        const int msan_bytes_lo =
+            (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+        __m128i s =
+            LoadLo8Msan(&src[subpixel_x >> kSuperResScaleBits], msan_bytes_lo);
+        subpixel_x += step;
+        const int msan_bytes_hi =
+            (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+        s = LoadHi8Msan(s, &src[subpixel_x >> kSuperResScaleBits],
+                        msan_bytes_hi);
+        subpixel_x += step;
+        const __m128i f = LoadAligned16(filter);
+        weighted_src[i] = _mm_maddubs_epi16(s, f);
+      }
+
+      __m128i a[4];
+      a[0] = _mm_hadd_epi16(weighted_src[0], weighted_src[1]);
+      a[1] = _mm_hadd_epi16(weighted_src[2], weighted_src[3]);
+      a[2] = _mm_hadd_epi16(weighted_src[4], weighted_src[5]);
+      a[3] = _mm_hadd_epi16(weighted_src[6], weighted_src[7]);
+      Transpose2x16_U16(a, a);
+      a[0] = _mm_adds_epi16(a[0], a[1]);
+      a[1] = _mm_adds_epi16(a[2], a[3]);
+      const __m128i rounding = _mm_set1_epi16(1 << (kFilterBits - 1));
+      a[0] = _mm_subs_epi16(rounding, a[0]);
+      a[1] = _mm_subs_epi16(rounding, a[1]);
+      a[0] = _mm_srai_epi16(a[0], kFilterBits);
+      a[1] = _mm_srai_epi16(a[1], kFilterBits);
+      StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1]));
+      dst_ptr += 16;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
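+
+// Illustrative scalar reference for a single upscaled pixel (a sketch for
+// reference only; the library never calls it). |src| follows the convention
+// above: it points DivideBy2(kSuperResFilterTaps) pixels before the first
+// downscaled pixel. The SIMD path negates the taps and subtracts from the
+// rounding constant; arithmetically that matches this positive-tap form.
+inline uint8_t SuperResPixelScalar(const uint8_t* const src,
+                                   const int subpixel_x) {
+  const int phase = (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
+  int sum = 0;
+  for (int k = 0; k < kSuperResFilterTaps; ++k) {
+    sum -= kNegativeUpscaleFilter[phase][k] *
+           src[(subpixel_x >> kSuperResScaleBits) + k];
+  }
+  return static_cast<uint8_t>(
+      Clip3(RightShiftWithRounding(sum, kFilterBits), 0, 255));
+}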
+
+void Init8bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+#if DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
+  dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#endif  // DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
+#if DSP_ENABLED_8BPP_SSE4_1(SuperRes)
+  dsp->super_res = SuperRes_SSE4_1;
+#endif  // DSP_ENABLED_8BPP_SSE4_1(SuperRes)
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
+alignas(16) const int16_t
+    kUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+        {0, 0, 0, 128, 0, 0, 0, 0},        {0, 0, -1, 128, 2, -1, 0, 0},
+        {0, 1, -3, 127, 4, -2, 1, 0},      {0, 1, -4, 127, 6, -3, 1, 0},
+        {0, 2, -6, 126, 8, -3, 1, 0},      {0, 2, -7, 125, 11, -4, 1, 0},
+        {-1, 2, -8, 125, 13, -5, 2, 0},    {-1, 3, -9, 124, 15, -6, 2, 0},
+        {-1, 3, -10, 123, 18, -6, 2, -1},  {-1, 3, -11, 122, 20, -7, 3, -1},
+        {-1, 4, -12, 121, 22, -8, 3, -1},  {-1, 4, -13, 120, 25, -9, 3, -1},
+        {-1, 4, -14, 118, 28, -9, 3, -1},  {-1, 4, -15, 117, 30, -10, 4, -1},
+        {-1, 5, -16, 116, 32, -11, 4, -1}, {-1, 5, -16, 114, 35, -12, 4, -1},
+        {-1, 5, -17, 112, 38, -12, 4, -1}, {-1, 5, -18, 111, 40, -13, 5, -1},
+        {-1, 5, -18, 109, 43, -14, 5, -1}, {-1, 6, -19, 107, 45, -14, 5, -1},
+        {-1, 6, -19, 105, 48, -15, 5, -1}, {-1, 6, -19, 103, 51, -16, 5, -1},
+        {-1, 6, -20, 101, 53, -16, 6, -1}, {-1, 6, -20, 99, 56, -17, 6, -1},
+        {-1, 6, -20, 97, 58, -17, 6, -1},  {-1, 6, -20, 95, 61, -18, 6, -1},
+        {-2, 7, -20, 93, 64, -18, 6, -2},  {-2, 7, -20, 91, 66, -19, 6, -1},
+        {-2, 7, -20, 88, 69, -19, 6, -1},  {-2, 7, -20, 86, 71, -19, 6, -1},
+        {-2, 7, -20, 84, 74, -20, 7, -2},  {-2, 7, -20, 81, 76, -20, 7, -1},
+        {-2, 7, -20, 79, 79, -20, 7, -2},  {-1, 7, -20, 76, 81, -20, 7, -2},
+        {-2, 7, -20, 74, 84, -20, 7, -2},  {-1, 6, -19, 71, 86, -20, 7, -2},
+        {-1, 6, -19, 69, 88, -20, 7, -2},  {-1, 6, -19, 66, 91, -20, 7, -2},
+        {-2, 6, -18, 64, 93, -20, 7, -2},  {-1, 6, -18, 61, 95, -20, 6, -1},
+        {-1, 6, -17, 58, 97, -20, 6, -1},  {-1, 6, -17, 56, 99, -20, 6, -1},
+        {-1, 6, -16, 53, 101, -20, 6, -1}, {-1, 5, -16, 51, 103, -19, 6, -1},
+        {-1, 5, -15, 48, 105, -19, 6, -1}, {-1, 5, -14, 45, 107, -19, 6, -1},
+        {-1, 5, -14, 43, 109, -18, 5, -1}, {-1, 5, -13, 40, 111, -18, 5, -1},
+        {-1, 4, -12, 38, 112, -17, 5, -1}, {-1, 4, -12, 35, 114, -16, 5, -1},
+        {-1, 4, -11, 32, 116, -16, 5, -1}, {-1, 4, -10, 30, 117, -15, 4, -1},
+        {-1, 3, -9, 28, 118, -14, 4, -1},  {-1, 3, -9, 25, 120, -13, 4, -1},
+        {-1, 3, -8, 22, 121, -12, 4, -1},  {-1, 3, -7, 20, 122, -11, 3, -1},
+        {-1, 2, -6, 18, 123, -10, 3, -1},  {0, 2, -6, 15, 124, -9, 3, -1},
+        {0, 2, -5, 13, 125, -8, 2, -1},    {0, 1, -4, 11, 125, -7, 2, 0},
+        {0, 1, -3, 8, 126, -6, 2, 0},      {0, 1, -3, 6, 127, -4, 1, 0},
+        {0, 1, -2, 4, 127, -3, 1, 0},      {0, 0, -1, 2, 128, -1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+                                 const int initial_subpixel_x, const int step,
+                                 void* const coefficients) {
+  auto* dst = static_cast<uint16_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 3);
+  do {
+    for (int i = 0; i < 8; ++i, dst += 8) {
+      int remainder = subpixel_x & kSuperResScaleMask;
+      __m128i filter =
+          LoadAligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      StoreAligned16(dst, filter);
+    }
+  } while (--x != 0);
+}
+
+template <int bitdepth>
+void SuperRes_SSE4_1(const void* LIBGAV1_RESTRICT const coefficients,
+                     void* LIBGAV1_RESTRICT const source,
+                     const ptrdiff_t source_stride, const int height,
+                     const int downscaled_width, const int upscaled_width,
+                     const int initial_subpixel_x, const int step,
+                     void* LIBGAV1_RESTRICT const dest,
+                     const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint16_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint16_t*>(coefficients);
+    uint16_t* dst_ptr = dst;
+    ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                         kSuperResHorizontalBorder, kSuperResHorizontalPadding);
+    int subpixel_x = initial_subpixel_x;
+    // The code below calculates up to 7 extra upscaled pixels, which will
+    // over-read up to 7 downscaled pixels at the end of each row.
+    // kSuperResHorizontalPadding accounts for this.
+    int x = RightShiftWithCeiling(upscaled_width, 3);
+    do {
+      __m128i weighted_src[8];
+      for (int i = 0; i < 8; ++i, filter += 8) {
+        const __m128i s =
+            LoadUnaligned16(&src[subpixel_x >> kSuperResScaleBits]);
+        subpixel_x += step;
+        const __m128i f = LoadAligned16(filter);
+        weighted_src[i] = _mm_madd_epi16(s, f);
+      }
+
+      __m128i a[4];
+      a[0] = _mm_hadd_epi32(weighted_src[0], weighted_src[1]);
+      a[1] = _mm_hadd_epi32(weighted_src[2], weighted_src[3]);
+      a[2] = _mm_hadd_epi32(weighted_src[4], weighted_src[5]);
+      a[3] = _mm_hadd_epi32(weighted_src[6], weighted_src[7]);
+
+      a[0] = _mm_hadd_epi32(a[0], a[1]);
+      a[1] = _mm_hadd_epi32(a[2], a[3]);
+      a[0] = RightShiftWithRounding_S32(a[0], kFilterBits);
+      a[1] = RightShiftWithRounding_S32(a[1], kFilterBits);
+
+      // Clip the values to (1 << bitdepth) - 1, i.e. 1023 for 10bpp.
+      // _mm_packus_epi32 already saturates to [0, 65535]; the min against
+      // the maximum pixel value completes the clamp.
+      const __m128i clipped_16 = _mm_min_epi16(
+          _mm_packus_epi32(a[0], a[1]), _mm_set1_epi16((1 << bitdepth) - 1));
+      StoreAligned16(dst_ptr, clipped_16);
+      dst_ptr += 8;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(SuperResCoefficients)
+  dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#else
+  static_cast<void>(SuperResCoefficients_SSE4_1);
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SuperRes)
+  dsp->super_res = SuperRes_SSE4_1<10>;
+#else
+  static_cast<void>(SuperRes_SSE4_1);
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/super_res_sse4.h b/src/dsp/x86/super_res_sse4.h
new file mode 100644 (file)
index 0000000..07a7ef4
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res_coefficients and Dsp::super_res. This function
+// is not thread-safe.
+void SuperResInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_SuperResCoefficients
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
+#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SuperResCoefficients
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SuperRes
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
diff --git a/src/dsp/x86/transpose_sse4.h b/src/dsp/x86/transpose_sse4.h
new file mode 100644 (file)
index 0000000..9726495
--- /dev/null
@@ -0,0 +1,307 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <emmintrin.h>
+
+namespace libgav1 {
+namespace dsp {
+
+LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in,
+                                             __m128i* const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]:  00 01 10 11  20 21 30 31
+  // in[1]:  40 41 50 51  60 61 70 71
+  // in[2]:  80 81 90 91  a0 a1 b0 b1
+  // in[3]:  c0 c1 d0 d1  e0 e1 f0 f1
+  // to:
+  // a0:     00 40 01 41  10 50 11 51
+  // a1:     20 60 21 61  30 70 31 71
+  // a2:     80 c0 81 c1  90 d0 91 d1
+  // a3:     a0 e0 a1 e1  b0 f0 b1 f1
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a3 = _mm_unpackhi_epi16(in[2], in[3]);
+  // b0:     00 20 40 60  01 21 41 61
+  // b1:     10 30 50 70  11 31 51 71
+  // b2:     80 a0 c0 e0  81 a1 c1 e1
+  // b3:     90 b0 d0 f0  91 b1 d1 f1
+  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 80 90 a0 b0  c0 d0 e0 f0
+  // out[3]: 81 91 a1 b1  c1 d1 e1 f1
+  out[0] = _mm_unpacklo_epi16(b0, b1);
+  out[1] = _mm_unpackhi_epi16(b0, b1);
+  out[2] = _mm_unpacklo_epi16(b2, b3);
+  out[3] = _mm_unpackhi_epi16(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) {
+  // Unpack 8 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+  // Unpack 16 bit elements resulting in:
+  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
+  return _mm_unpacklo_epi16(a0, a1);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x8To4x16_U8(const __m128i* const in,
+                                                 __m128i* out) {
+  // Unpack 8 bit elements. Goes from:
+  // in[0]:  00 01 02 03 04 05 06 07
+  // in[1]:  10 11 12 13 14 15 16 17
+  // in[2]:  20 21 22 23 24 25 26 27
+  // in[3]:  30 31 32 33 34 35 36 37
+  // in[4]:  40 41 42 43 44 45 46 47
+  // in[5]:  50 51 52 53 54 55 56 57
+  // in[6]:  60 61 62 63 64 65 66 67
+  // in[7]:  70 71 72 73 74 75 76 77
+  // to:
+  // a0:     00 10 01 11  02 12 03 13  04 14 05 15  06 16 07 17
+  // a1:     20 30 21 31  22 32 23 33  24 34 25 35  26 36 27 37
+  // a2:     40 50 41 51  42 52 43 53  44 54 45 55  46 56 47 57
+  // a3:     60 70 61 71  62 72 63 73  64 74 65 75  66 76 67 77
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+  // b0:     00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
+  // b1:     40 50 60 70  41 51 61 71  42 52 62 72  43 53 63 73
+  // b2:     04 14 24 34  05 15 25 35  06 16 26 36  07 17 27 37
+  // b3:     44 54 64 74  45 55 65 75  46 56 66 76  47 57 67 77
+  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi16(a2, a3);
+  const __m128i b2 = _mm_unpackhi_epi16(a0, a1);
+  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+  // out[0]: 00 10 20 30  40 50 60 70  01 11 21 31  41 51 61 71
+  // out[1]: 02 12 22 32  42 52 62 72  03 13 23 33  43 53 63 73
+  // out[2]: 04 14 24 34  44 54 64 74  05 15 25 35  45 55 65 75
+  // out[3]: 06 16 26 36  46 56 66 76  07 17 27 37  47 57 67 77
+  out[0] = _mm_unpacklo_epi32(b0, b1);
+  out[1] = _mm_unpackhi_epi32(b0, b1);
+  out[2] = _mm_unpacklo_epi32(b2, b3);
+  out[3] = _mm_unpackhi_epi32(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4_U16(const __m128i* in, __m128i* out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  XX XX XX XX
+  // in[1]: 10 11 12 13  XX XX XX XX
+  // in[2]: 20 21 22 23  XX XX XX XX
+  // in[3]: 30 31 32 33  XX XX XX XX
+  // to:
+  // ba:    00 10 01 11  02 12 03 13
+  // dc:    20 30 21 31  22 32 23 33
+  const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]);
+  // Unpack 32 bit elements resulting in:
+  // dcba_lo: 00 10 20 30  01 11 21 31
+  // dcba_hi: 02 12 22 32  03 13 23 33
+  const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc);
+  const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc);
+  // Assign or shift right by 8 bytes resulting in:
+  // out[0]: 00 10 20 30  01 11 21 31
+  // out[1]: 01 11 21 31  XX XX XX XX
+  // out[2]: 02 12 22 32  03 13 23 33
+  // out[3]: 03 13 23 33  XX XX XX XX
+  out[0] = dcba_lo;
+  out[1] = _mm_srli_si128(dcba_lo, 8);
+  out[2] = dcba_hi;
+  out[3] = _mm_srli_si128(dcba_hi, 8);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4_U16(const __m128i* in,
+                                                 __m128i* out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  XX XX XX XX
+  // in[1]: 10 11 12 13  XX XX XX XX
+  // in[2]: 20 21 22 23  XX XX XX XX
+  // in[3]: 30 31 32 33  XX XX XX XX
+  // in[4]: 40 41 42 43  XX XX XX XX
+  // in[5]: 50 51 52 53  XX XX XX XX
+  // in[6]: 60 61 62 63  XX XX XX XX
+  // in[7]: 70 71 72 73  XX XX XX XX
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 02 12 22 32  03 13 23 33
+  // b3: 42 52 62 72  43 53 63 73
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 02 12 22 32  42 52 62 72
+  // out[3]: 03 13 23 33  43 53 63 73
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b2, b3);
+  out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8_U16(const __m128i* in,
+                                                 __m128i* out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  04 05 06 07
+  // in[1]: 10 11 12 13  14 15 16 17
+  // in[2]: 20 21 22 23  24 25 26 27
+  // in[3]: 30 31 32 33  34 35 36 37
+
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a4:    04 14 05 15  06 16 07 17
+  // a5:    24 34 25 35  26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b2: 04 14 24 34  05 15 25 35
+  // b4: 02 12 22 32  03 13 23 33
+  // b6: 06 16 26 36  07 17 27 37
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  XX XX XX XX
+  // out[1]: 01 11 21 31  XX XX XX XX
+  // out[2]: 02 12 22 32  XX XX XX XX
+  // out[3]: 03 13 23 33  XX XX XX XX
+  // out[4]: 04 14 24 34  XX XX XX XX
+  // out[5]: 05 15 25 35  XX XX XX XX
+  // out[6]: 06 16 26 36  XX XX XX XX
+  // out[7]: 07 17 27 37  XX XX XX XX
+  const __m128i zeros = _mm_setzero_si128();
+  out[0] = _mm_unpacklo_epi64(b0, zeros);
+  out[1] = _mm_unpackhi_epi64(b0, zeros);
+  out[2] = _mm_unpacklo_epi64(b4, zeros);
+  out[3] = _mm_unpackhi_epi64(b4, zeros);
+  out[4] = _mm_unpacklo_epi64(b2, zeros);
+  out[5] = _mm_unpackhi_epi64(b2, zeros);
+  out[6] = _mm_unpacklo_epi64(b6, zeros);
+  out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x8_U16(const __m128i* const in,
+                                            __m128i* const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  04 05 06 07
+  // in[1]: 10 11 12 13  14 15 16 17
+  // in[2]: 20 21 22 23  24 25 26 27
+  // in[3]: 30 31 32 33  34 35 36 37
+  // in[4]: 40 41 42 43  44 45 46 47
+  // in[5]: 50 51 52 53  54 55 56 57
+  // in[6]: 60 61 62 63  64 65 66 67
+  // in[7]: 70 71 72 73  74 75 76 77
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  // a4:    04 14 05 15  06 16 07 17
+  // a5:    24 34 25 35  26 36 27 37
+  // a6:    44 54 45 55  46 56 47 57
+  // a7:    64 74 65 75  66 76 67 77
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 04 14 24 34  05 15 25 35
+  // b3: 44 54 64 74  45 55 65 75
+  // b4: 02 12 22 32  03 13 23 33
+  // b5: 42 52 62 72  43 53 63 73
+  // b6: 06 16 26 36  07 17 27 37
+  // b7: 46 56 66 76  47 57 67 77
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 02 12 22 32  42 52 62 72
+  // out[3]: 03 13 23 33  43 53 63 73
+  // out[4]: 04 14 24 34  44 54 64 74
+  // out[5]: 05 15 25 35  45 55 65 75
+  // out[6]: 06 16 26 36  46 56 66 76
+  // out[7]: 07 17 27 37  47 57 67 77
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b4, b5);
+  out[3] = _mm_unpackhi_epi64(b4, b5);
+  out[4] = _mm_unpacklo_epi64(b2, b3);
+  out[5] = _mm_unpackhi_epi64(b2, b3);
+  out[6] = _mm_unpacklo_epi64(b6, b7);
+  out[7] = _mm_unpackhi_epi64(b6, b7);
+}
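+
+// Illustrative usage (a sketch for reference only): transposing an 8x8 block
+// of uint16_t rows in place. In-place calls are safe because every input
+// register is consumed before any output is written; warp_sse4.cc relies on
+// this with Transpose8x8_U16(filter, filter). |block| is a hypothetical
+// uint16_t[64], and LoadUnaligned16/StoreUnaligned16 are the helpers from
+// common_sse4.h.
+//   __m128i rows[8];
+//   for (int i = 0; i < 8; ++i) rows[i] = LoadUnaligned16(&block[i * 8]);
+//   Transpose8x8_U16(rows, rows);
+//   for (int i = 0; i < 8; ++i) StoreUnaligned16(&block[i * 8], rows[i]);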
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+#endif  // LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc
new file mode 100644 (file)
index 0000000..5498052
--- /dev/null
@@ -0,0 +1,535 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+// This assumes the two filters contain filter[x] and filter[x+2].
+inline __m128i AccumulateFilter(const __m128i sum, const __m128i filter_0,
+                                const __m128i filter_1,
+                                const __m128i& src_window) {
+  const __m128i filter_taps = _mm_unpacklo_epi8(filter_0, filter_1);
+  const __m128i src =
+      _mm_unpacklo_epi8(src_window, _mm_srli_si128(src_window, 2));
+  return _mm_add_epi16(sum, _mm_maddubs_epi16(src, filter_taps));
+}
+
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+    (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
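+// With kInterRoundBitsHorizontal == 3 this is (16384 >> 3) * 128 == 262144:
+// the horizontal pass biases each intermediate sample by -(16384 >> 3), the
+// vertical taps sum to 128, so adding kOffsetRemoval before the vertical
+// rounding shift cancels the bias exactly.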
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+inline void HorizontalFilter(const int sx4, const int16_t alpha,
+                             const __m128i src_row,
+                             int16_t intermediate_result_row[8]) {
+  int sx = sx4 - MultiplyBy4(alpha);
+  __m128i filter[8];
+  for (__m128i& f : filter) {
+    const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+                       kWarpedPixelPrecisionShifts;
+    f = LoadLo8(kWarpedFilters8[offset]);
+    sx += alpha;
+  }
+  Transpose8x8To4x16_U8(filter, filter);
+  // |filter| now contains two filters per register.
+  // Staggered combinations allow us to take advantage of _mm_maddubs_epi16
+  // without overflowing the sign bit. The sign bit is hit only where two taps
+  // paired in a single madd add up to more than 128. This is only possible with
+  // two adjacent "inner" taps. Therefore, pairing odd with odd and even with
+  // even guarantees safety. |sum| is given a negative offset to allow for large
+  // intermediate values.
+  // k = 0, 2.
+  __m128i src_row_window = src_row;
+  __m128i sum = _mm_set1_epi16(-kFirstPassOffset);
+  sum = AccumulateFilter(sum, filter[0], filter[1], src_row_window);
+
+  // k = 1, 3.
+  src_row_window = _mm_srli_si128(src_row_window, 1);
+  sum = AccumulateFilter(sum, _mm_srli_si128(filter[0], 8),
+                         _mm_srli_si128(filter[1], 8), src_row_window);
+  // k = 4, 6.
+  src_row_window = _mm_srli_si128(src_row_window, 3);
+  sum = AccumulateFilter(sum, filter[2], filter[3], src_row_window);
+
+  // k = 5, 7.
+  src_row_window = _mm_srli_si128(src_row_window, 1);
+  sum = AccumulateFilter(sum, _mm_srli_si128(filter[2], 8),
+                         _mm_srli_si128(filter[3], 8), src_row_window);
+
+  sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal);
+  StoreUnaligned16(intermediate_result_row, sum);
+}
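+
+// Pairing used above, written out for one output lane (a sketch): with s[]
+// denoting source bytes and f[] that lane's 8 filter taps, the four
+// AccumulateFilter() calls compute
+//   (s[0] * f[0] + s[2] * f[2]) + (s[1] * f[1] + s[3] * f[3]) +
+//   (s[4] * f[4] + s[6] * f[6]) + (s[5] * f[5] + s[7] * f[7]),
+// i.e. the full 8-tap sum, with even taps paired with even and odd with odd
+// so that no single _mm_maddubs_epi16 pair can overflow int16.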
+
+template <bool is_compound>
+inline void WriteVerticalFilter(const __m128i filter[8],
+                                const int16_t intermediate_result[15][8], int y,
+                                void* LIBGAV1_RESTRICT dst_row) {
+  constexpr int kRoundBitsVertical =
+      is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+  __m128i sum_low = _mm_set1_epi32(kOffsetRemoval);
+  __m128i sum_high = sum_low;
+  for (int k = 0; k < 8; k += 2) {
+    const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
+    const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
+    const __m128i intermediate_0 = LoadUnaligned16(intermediate_result[y + k]);
+    const __m128i intermediate_1 =
+        LoadUnaligned16(intermediate_result[y + k + 1]);
+    const __m128i intermediate_low =
+        _mm_unpacklo_epi16(intermediate_0, intermediate_1);
+    const __m128i intermediate_high =
+        _mm_unpackhi_epi16(intermediate_0, intermediate_1);
+
+    const __m128i product_low = _mm_madd_epi16(filters_low, intermediate_low);
+    const __m128i product_high =
+        _mm_madd_epi16(filters_high, intermediate_high);
+    sum_low = _mm_add_epi32(sum_low, product_low);
+    sum_high = _mm_add_epi32(sum_high, product_high);
+  }
+  sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
+  sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
+  if (is_compound) {
+    const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
+    StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
+  } else {
+    const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
+    StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+  }
+}
+
+template <bool is_compound>
+inline void WriteVerticalFilter(const __m128i filter[8],
+                                const int16_t* LIBGAV1_RESTRICT
+                                    intermediate_result_column,
+                                void* LIBGAV1_RESTRICT dst_row) {
+  constexpr int kRoundBitsVertical =
+      is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+  __m128i sum_low = _mm_setzero_si128();
+  __m128i sum_high = _mm_setzero_si128();
+  for (int k = 0; k < 8; k += 2) {
+    const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
+    const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
+    // Equivalent to unpacking two vectors made by duplicating int16_t values.
+    const __m128i intermediate =
+        _mm_set1_epi32((intermediate_result_column[k + 1] << 16) |
+                       intermediate_result_column[k]);
+    const __m128i product_low = _mm_madd_epi16(filters_low, intermediate);
+    const __m128i product_high = _mm_madd_epi16(filters_high, intermediate);
+    sum_low = _mm_add_epi32(sum_low, product_low);
+    sum_high = _mm_add_epi32(sum_high, product_high);
+  }
+  sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
+  sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
+  if (is_compound) {
+    const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
+    StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
+  } else {
+    const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
+    StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+  }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t source[15][8], int64_t y4, int gamma,
+                           int delta, DestType* LIBGAV1_RESTRICT dest_row,
+                           ptrdiff_t dest_stride) {
+  int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+  for (int y = 0; y < 8; ++y) {
+    int sy = sy4 - MultiplyBy4(gamma);
+    __m128i filter[8];
+    for (__m128i& f : filter) {
+      const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                         kWarpedPixelPrecisionShifts;
+      f = LoadUnaligned16(kWarpedFilters[offset]);
+      sy += gamma;
+    }
+    Transpose8x8_U16(filter, filter);
+    WriteVerticalFilter<is_compound>(filter, source, y, dest_row);
+    dest_row += dest_stride;
+    sy4 += delta;
+  }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols,
+                           int64_t y4, int gamma, int delta,
+                           DestType* LIBGAV1_RESTRICT dest_row,
+                           ptrdiff_t dest_stride) {
+  int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+  for (int y = 0; y < 8; ++y) {
+    int sy = sy4 - MultiplyBy4(gamma);
+    __m128i filter[8];
+    for (__m128i& f : filter) {
+      const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                         kWarpedPixelPrecisionShifts;
+      f = LoadUnaligned16(kWarpedFilters[offset]);
+      sy += gamma;
+    }
+    Transpose8x8_U16(filter, filter);
+    WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row);
+    dest_row += dest_stride;
+    sy4 += delta;
+  }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src,
+                        ptrdiff_t source_stride, int source_width,
+                        int source_height, int ix4, int iy4,
+                        DestType* LIBGAV1_RESTRICT dst_row,
+                        ptrdiff_t dest_stride) {
+  // Region 1
+  // Points to the left or right border of the first row of |src|.
+  const uint8_t* first_row_border =
+      (ix4 + 7 <= 0) ? src : src + source_width - 1;
+  // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+  //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+  // In two special cases, iy4 + y is clipped to either 0 or
+  // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+  // bounded and we can avoid clipping iy4 + y by relying on a reference
+  // frame's boundary extension on the top and bottom.
+  // Region 1.
+  // Every sample used to calculate the prediction block has the same
+  // value. So the whole prediction block has the same value.
+  const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+  const uint8_t row_border_pixel = first_row_border[row * source_stride];
+
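+  // For a constant input p, the full two-pass filter (taps summing to 128,
+  // first-pass offsets canceling) reduces to (p * 128 * 128) >>
+  // (kInterRoundBitsHorizontal + vertical rounding): p << 4 for the compound
+  // path (vertical rounding == kInterRoundBitsCompoundVertical == 7) and p
+  // for the single path (== kInterRoundBitsVertical == 11). The two branches
+  // below store those values directly.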
+  if (is_compound) {
+    const __m128i sum =
+        _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical -
+                                            kInterRoundBitsCompoundVertical));
+    StoreUnaligned16(dst_row, sum);
+  } else {
+    memset(dst_row, row_border_pixel, 8);
+  }
+  const DestType* const first_dst_row = dst_row;
+  dst_row += dest_stride;
+  for (int y = 1; y < 8; ++y) {
+    memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
+    dst_row += dest_stride;
+  }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
+                        ptrdiff_t source_stride, int source_width, int64_t y4,
+                        int ix4, int iy4, int gamma, int delta,
+                        int16_t intermediate_result_column[15],
+                        DestType* LIBGAV1_RESTRICT dst_row,
+                        ptrdiff_t dest_stride) {
+  // Region 2.
+  // Points to the left or right border of the first row of |src|.
+  const uint8_t* first_row_border =
+      (ix4 + 7 <= 0) ? src : src + source_width - 1;
+  // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+  //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+  // In two special cases, iy4 + y is clipped to either 0 or
+  // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+  // bounded and we can avoid clipping iy4 + y by relying on a reference
+  // frame's boundary extension on the top and bottom.
+
+  // Region 2.
+  // Horizontal filter.
+  // The input values in this region are generated by extending the border
+  // which makes them identical in the horizontal direction. This
+  // computation could be inlined in the vertical pass but most
+  // implementations will need a transpose of some sort.
+  // It is not necessary to use the offset values here because the
+  // horizontal pass is a simple shift and the vertical pass will always
+  // require using 32 bits.
+  for (int y = -7; y < 8; ++y) {
+    // We may over-read up to 13 pixels above the top source row, or up
+    // to 13 pixels below the bottom source row. This is proved in
+    // warp.cc.
+    const int row = iy4 + y;
+    int sum = first_row_border[row * source_stride];
+    sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+    intermediate_result_column[y + 7] = sum;
+  }
+  // Region 2 vertical filter.
+  VerticalFilter<is_compound, DestType>(intermediate_result_column, y4, gamma,
+                                        delta, dst_row, dest_stride);
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
+                        ptrdiff_t source_stride, int source_height, int alpha,
+                        int beta, int64_t x4, int ix4, int iy4,
+                        int16_t intermediate_result[15][8]) {
+  // Region 3
+  // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+  // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+  //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+  // In two special cases, iy4 + y is clipped to either 0 or
+  // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+  // bounded and we can avoid clipping iy4 + y by relying on a reference
+  // frame's boundary extension on the top and bottom.
+  // Horizontal filter.
+  const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+  const uint8_t* const src_row = src + row * source_stride;
+  // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+  // read but is ignored.
+  //
+  // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+  // bytes after src_row[source_width - 1]. We assume the source frame
+  // has left and right borders of at least 13 bytes that extend the
+  // frame boundary pixels. We also assume there is at least one extra
+  // padding byte after the right border of the last source row.
+  const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+  int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+  for (int y = -7; y < 8; ++y) {
+    HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+    sx4 += beta;
+  }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src,
+                        ptrdiff_t source_stride, int alpha, int beta,
+                        int64_t x4, int ix4, int iy4,
+                        int16_t intermediate_result[15][8]) {
+  // Region 4.
+  // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+  // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+  //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+  // In two special cases, iy4 + y is clipped to either 0 or
+  // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+  // bounded and we can avoid clipping iy4 + y by relying on a reference
+  // frame's boundary extension on the top and bottom.
+  // Horizontal filter.
+  int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+  for (int y = -7; y < 8; ++y) {
+    // We may over-read up to 13 pixels above the top source row, or up
+    // to 13 pixels below the bottom source row. This is proved in
+    // warp.cc.
+    const int row = iy4 + y;
+    const uint8_t* const src_row = src + row * source_stride;
+    // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+    // read but is ignored.
+    //
+    // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+    // bytes after src_row[source_width - 1]. We assume the source frame
+    // has left and right borders of at least 13 bytes that extend the
+    // frame boundary pixels. We also assume there is at least one extra
+    // padding byte after the right border of the last source row.
+    const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+    HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+    sx4 += beta;
+  }
+}
+
+template <bool is_compound, typename DestType>
+inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
+                            ptrdiff_t source_stride, int source_width,
+                            int source_height,
+                            const int* LIBGAV1_RESTRICT warp_params,
+                            int subsampling_x, int subsampling_y, int src_x,
+                            int src_y, int16_t alpha, int16_t beta,
+                            int16_t gamma, int16_t delta,
+                            DestType* LIBGAV1_RESTRICT dst_row,
+                            ptrdiff_t dest_stride) {
+  union {
+    // |intermediate_result| is the output of the horizontal filtering and
+    // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+    // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+    // type so that we can start with a negative offset and restore it on the
+    // final filter sum.
+    int16_t intermediate_result[15][8];  // 15 rows, 8 columns.
+    // In the simple special cases where the samples in each row are all the
+    // same, store one sample per row in a column vector.
+    int16_t intermediate_result_column[15];
+  };
+
+  const WarpFilterParams filter_params = GetWarpFilterParams(
+      src_x, src_y, subsampling_x, subsampling_y, warp_params);
+  // A prediction block may fall outside the frame's boundaries. If a
+  // prediction block is calculated using only samples outside the frame's
+  // boundary, the filtering can be simplified. We can divide the plane
+  // into several regions and handle them differently.
+  //
+  //                |           |
+  //            1   |     3     |   1
+  //                |           |
+  //         -------+-----------+-------
+  //                |***********|
+  //            2   |*****4*****|   2
+  //                |***********|
+  //         -------+-----------+-------
+  //                |           |
+  //            1   |     3     |   1
+  //                |           |
+  //
+  // At the center, region 4 represents the frame and is the general case.
+  //
+  // In regions 1 and 2, the prediction block is outside the frame's
+  // boundary horizontally. Therefore the horizontal filtering can be
+  // simplified. Furthermore, in the region 1 (at the four corners), the
+  // prediction is outside the frame's boundary both horizontally and
+  // vertically, so we get a constant prediction block.
+  //
+  // In region 3, the prediction block is outside the frame's boundary
+  // vertically. Unfortunately because we apply the horizontal filters
+  // first, by the time we apply the vertical filters, they no longer see
+  // simple inputs. So the only simplification is that all the rows are
+  // the same, but we still need to apply all the horizontal and vertical
+  // filters.
+
+  // Check for two simple special cases, where the horizontal filter can
+  // be significantly simplified.
+  //
+  // In general, for each row, the horizontal filter is calculated as
+  // follows:
+  //   for (int x = -4; x < 4; ++x) {
+  //     const int offset = ...;
+  //     int sum = first_pass_offset;
+  //     for (int k = 0; k < 8; ++k) {
+  //       const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+  //       sum += kWarpedFilters[offset][k] * src_row[column];
+  //     }
+  //     ...
+  //   }
+  // The column index before clipping, ix4 + x + k - 3, varies in the range
+  // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+  // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+  // border index (source_width - 1 or 0, respectively). Then for each x,
+  // the inner for loop of the horizontal filter is reduced to multiplying
+  // the border pixel by the sum of the filter coefficients.
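+  // As a sketch of that reduction (the warped filter taps sum to
+  // 1 << kFilterBits), if ix4 + 7 <= 0 every column index clips to 0 and
+  // the inner loop collapses to:
+  //   sum = first_pass_offset + src_row[0] * (1 << kFilterBits);
+  // and symmetrically to src_row[source_width - 1] on the right border.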
+  if (filter_params.ix4 - 7 >= source_width - 1 || filter_params.ix4 + 7 <= 0) {
+    if ((filter_params.iy4 - 7 >= source_height - 1 ||
+         filter_params.iy4 + 7 <= 0)) {
+      // Outside the frame in both directions. One repeated value.
+      WarpRegion1<is_compound, DestType>(
+          src, source_stride, source_width, source_height, filter_params.ix4,
+          filter_params.iy4, dst_row, dest_stride);
+      return;
+    }
+    // Outside the frame horizontally. Rows repeated.
+    WarpRegion2<is_compound, DestType>(
+        src, source_stride, source_width, filter_params.y4, filter_params.ix4,
+        filter_params.iy4, gamma, delta, intermediate_result_column, dst_row,
+        dest_stride);
+    return;
+  }
+
+  if ((filter_params.iy4 - 7 >= source_height - 1 ||
+       filter_params.iy4 + 7 <= 0)) {
+    // Outside the frame vertically.
+    WarpRegion3<is_compound, DestType>(
+        src, source_stride, source_height, alpha, beta, filter_params.x4,
+        filter_params.ix4, filter_params.iy4, intermediate_result);
+  } else {
+    // Inside the frame.
+    WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta,
+                                       filter_params.x4, filter_params.ix4,
+                                       filter_params.iy4, intermediate_result);
+  }
+  // Region 3 and 4 vertical filter.
+  VerticalFilter<is_compound, DestType>(intermediate_result, filter_params.y4,
+                                        gamma, delta, dst_row, dest_stride);
+}
+
+template <bool is_compound>
+void Warp_SSE4_1(const void* LIBGAV1_RESTRICT source, ptrdiff_t source_stride,
+                 int source_width, int source_height,
+                 const int* LIBGAV1_RESTRICT warp_params, int subsampling_x,
+                 int subsampling_y, int block_start_x, int block_start_y,
+                 int block_width, int block_height, int16_t alpha, int16_t beta,
+                 int16_t gamma, int16_t delta, void* LIBGAV1_RESTRICT dest,
+                 ptrdiff_t dest_stride) {
+  const auto* const src = static_cast<const uint8_t*>(source);
+  using DestType =
+      typename std::conditional<is_compound, int16_t, uint8_t>::type;
+  auto* dst = static_cast<DestType*>(dest);
+
+  // The warp process is applied to each 8x8 block.
+  assert(block_width >= 8);
+  assert(block_height >= 8);
+  const int block_end_x = block_start_x + block_width;
+  const int block_end_y = block_start_y + block_height;
+
+  const int start_x = block_start_x;
+  const int start_y = block_start_y;
+  int src_x = (start_x + 4) << subsampling_x;
+  int src_y = (start_y + 4) << subsampling_y;
+  const int end_x = (block_end_x + 4) << subsampling_x;
+  const int end_y = (block_end_y + 4) << subsampling_y;
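+  // Note: the coordinates here are on the luma grid, since the warp model
+  // parameters are defined for the luma plane. For example, with 4:2:0
+  // chroma (subsampling_x = subsampling_y = 1), each 8x8 chroma block
+  // advances src_x and src_y by 16.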
+  do {
+    DestType* dst_row = dst;
+    src_x = (start_x + 4) << subsampling_x;
+    do {
+      HandleWarpBlock<is_compound, DestType>(
+          src, source_stride, source_width, source_height, warp_params,
+          subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta,
+          dst_row, dest_stride);
+      src_x += (8 << subsampling_x);
+      dst_row += 8;
+    } while (src_x < end_x);
+    dst += 8 * dest_stride;
+    src_y += (8 << subsampling_y);
+  } while (src_y < end_y);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->warp = Warp_SSE4_1</*is_compound=*/false>;
+  dsp->warp_compound = Warp_SSE4_1</*is_compound=*/true>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void WarpInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/warp_sse4.h b/src/dsp/x86/warp_sse4.h
new file mode 100644 (file)
index 0000000..a2dc5ca
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp. This function is not thread-safe.
+void WarpInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
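+// The defines below advertise to the portable dsp tables (see
+// src/dsp/dsp.h) that an SSE4.1 version of the listed function and
+// bitdepth is available, so the C fallback assignment can be skipped when
+// SSE4.1 is targeted.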
+#ifndef LIBGAV1_Dsp8bpp_Warp
+#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
+#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc
new file mode 100644 (file)
index 0000000..53a374d
--- /dev/null
@@ -0,0 +1,1011 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/weight_mask_sse4.h"
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kRoundingBits8bpp = 4;
+
+template <bool mask_is_inverse, bool is_store_16>
+inline void WeightMask16_SSE4_1(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                                const int16_t* LIBGAV1_RESTRICT prediction_1,
+                                uint8_t* LIBGAV1_RESTRICT mask,
+                                ptrdiff_t mask_stride) {
+  const __m128i pred_00 = LoadAligned16(prediction_0);
+  const __m128i pred_10 = LoadAligned16(prediction_1);
+  const __m128i difference_0 = RightShiftWithRounding_U16(
+      _mm_abs_epi16(_mm_sub_epi16(pred_00, pred_10)), kRoundingBits8bpp);
+  const __m128i scaled_difference_0 = _mm_srli_epi16(difference_0, 4);
+
+  const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+  const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+  const __m128i difference_1 = RightShiftWithRounding_U16(
+      _mm_abs_epi16(_mm_sub_epi16(pred_01, pred_11)), kRoundingBits8bpp);
+  const __m128i scaled_difference_1 = _mm_srli_epi16(difference_1, 4);
+
+  const __m128i difference_offset = _mm_set1_epi8(38);
+  const __m128i adjusted_difference =
+      _mm_adds_epu8(_mm_packus_epi16(scaled_difference_0, scaled_difference_1),
+                    difference_offset);
+  const __m128i mask_ceiling = _mm_set1_epi8(64);
+  const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling);
+  if (mask_is_inverse) {
+    const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
+    if (is_store_16) {
+      StoreAligned16(mask, inverted_mask_value);
+    } else {
+      StoreLo8(mask, inverted_mask_value);
+      StoreHi8(mask + mask_stride, inverted_mask_value);
+    }
+  } else {
+    if (is_store_16) {
+      StoreAligned16(mask, mask_value);
+    } else {
+      StoreLo8(mask, mask_value);
+      StoreHi8(mask + mask_stride, mask_value);
+    }
+  }
+}
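+
+// For reference, a scalar sketch of the per-pixel mask computed above
+// (illustrative only, mirroring the intrinsics rather than library API):
+//   const int diff = RightShiftWithRounding(
+//       std::abs(pred_0[i] - pred_1[i]), kRoundingBits8bpp);
+//   const int value = std::min(38 + (diff >> 4), 64);
+//   mask[i] = mask_is_inverse ? 64 - value : value;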
+
+#define WEIGHT8_PAIR_WITHOUT_STRIDE \
+  WeightMask16_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
+
+#define WEIGHT8_PAIR_AND_STRIDE \
+  WEIGHT8_PAIR_WITHOUT_STRIDE;  \
+  pred_0 += 8 << 1;             \
+  pred_1 += 8 << 1;             \
+  mask += mask_stride << 1
+
+template <bool mask_is_inverse>
+void WeightMask8x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+
+  WEIGHT8_PAIR_AND_STRIDE;
+  WEIGHT8_PAIR_AND_STRIDE;
+  WEIGHT8_PAIR_AND_STRIDE;
+  WEIGHT8_PAIR_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 3;
+  do {
+    WEIGHT8_PAIR_AND_STRIDE;
+    WEIGHT8_PAIR_AND_STRIDE;
+  } while (--y3 != 0);
+  WEIGHT8_PAIR_AND_STRIDE;
+  WEIGHT8_PAIR_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 5;
+  do {
+    WEIGHT8_PAIR_AND_STRIDE;
+    WEIGHT8_PAIR_AND_STRIDE;
+    WEIGHT8_PAIR_AND_STRIDE;
+  } while (--y5 != 0);
+  WEIGHT8_PAIR_WITHOUT_STRIDE;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
+
+#define WEIGHT16_AND_STRIDE \
+  WEIGHT16_WITHOUT_STRIDE;  \
+  pred_0 += 16;             \
+  pred_1 += 16;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y = 7;
+  do {
+    WEIGHT16_AND_STRIDE;
+  } while (--y != 0);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 5;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (--y3 != 0);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 6;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (--y5 != 0);
+  WEIGHT16_AND_STRIDE;
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 21;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (--y3 != 0);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE                                        \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask,     \
+                                             mask_stride);             \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+                                             mask + 16, mask_stride)
+
+#define WEIGHT32_AND_STRIDE \
+  WEIGHT32_WITHOUT_STRIDE;  \
+  pred_0 += 32;             \
+  pred_1 += 32;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 5;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (--y3 != 0);
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 6;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (--y5 != 0);
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 21;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (--y3 != 0);
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE                                        \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask,     \
+                                             mask_stride);             \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+                                             mask + 16, mask_stride);  \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+                                             mask + 32, mask_stride);  \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+                                             mask + 48, mask_stride)
+
+#define WEIGHT64_AND_STRIDE \
+  WEIGHT64_WITHOUT_STRIDE;  \
+  pred_0 += 64;             \
+  pred_1 += 64;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT64_AND_STRIDE;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 21);
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT mask,
+                             ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 42);
+  WEIGHT64_AND_STRIDE;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT mask,
+                             ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
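+  // Each 128-wide row is processed as two 64-wide halves; after the second
+  // half, advancing the mask pointer by mask_stride - 64 moves it to the
+  // start of the next row.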
+  do {
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (++y3 < 21);
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                              const void* LIBGAV1_RESTRICT prediction_1,
+                              uint8_t* LIBGAV1_RESTRICT mask,
+                              ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+  do {
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (++y3 < 42);
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += adjusted_mask_stride;
+
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
+  dsp->weight_mask[w_index][h_index][0] =                      \
+      WeightMask##width##x##height##_SSE4_1<0>;                \
+  dsp->weight_mask[w_index][h_index][1] =                      \
+      WeightMask##width##x##height##_SSE4_1<1>
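+// For example, INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0) assigns
+// WeightMask8x8_SSE4_1<0> to dsp->weight_mask[0][0][0] and
+// WeightMask8x8_SSE4_1<1> to dsp->weight_mask[0][0][1].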
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0);
+  INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1);
+  INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2);
+  INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0);
+  INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1);
+  INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2);
+  INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3);
+  INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0);
+  INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1);
+  INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2);
+  INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3);
+  INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1);
+  INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2);
+  INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3);
+  INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4);
+  INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3);
+  INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4);
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kRoundingBits10bpp = 6;
+constexpr int kScaledDiffShift = 4;
+
+template <bool mask_is_inverse, bool is_store_16>
+inline void WeightMask16_10bpp_SSE4_1(
+    const uint16_t* LIBGAV1_RESTRICT prediction_0,
+    const uint16_t* LIBGAV1_RESTRICT prediction_1,
+    uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+  const __m128i diff_offset = _mm_set1_epi8(38);
+  const __m128i mask_ceiling = _mm_set1_epi8(64);
+  const __m128i zero = _mm_setzero_si128();
+
+  // Range of prediction: [3988, 61532].
+  const __m128i pred_00 = LoadAligned16(prediction_0);
+  const __m128i pred_10 = LoadAligned16(prediction_1);
+  const __m128i pred_lo_00 = _mm_cvtepu16_epi32(pred_00);
+  const __m128i pred_lo_10 = _mm_cvtepu16_epi32(pred_10);
+  const __m128i diff_lo_0 = RightShiftWithRounding_U32(
+      _mm_abs_epi32(_mm_sub_epi32(pred_lo_00, pred_lo_10)), kRoundingBits10bpp);
+
+  const __m128i pred_hi_00 = _mm_unpackhi_epi16(pred_00, zero);
+  const __m128i pred_hi_10 = _mm_unpackhi_epi16(pred_10, zero);
+  const __m128i diff_hi_0 = RightShiftWithRounding_U32(
+      _mm_abs_epi32(_mm_sub_epi32(pred_hi_00, pred_hi_10)), kRoundingBits10bpp);
+
+  const __m128i diff_0 = _mm_packus_epi32(diff_lo_0, diff_hi_0);
+  const __m128i scaled_diff_0 = _mm_srli_epi16(diff_0, kScaledDiffShift);
+
+  const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+  const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+  const __m128i pred_lo_01 = _mm_cvtepu16_epi32(pred_01);
+  const __m128i pred_lo_11 = _mm_cvtepu16_epi32(pred_11);
+  const __m128i diff_lo_1 = RightShiftWithRounding_U32(
+      _mm_abs_epi32(_mm_sub_epi32(pred_lo_01, pred_lo_11)), kRoundingBits10bpp);
+
+  const __m128i pred_hi_01 = _mm_unpackhi_epi16(pred_01, zero);
+  const __m128i pred_hi_11 = _mm_unpackhi_epi16(pred_11, zero);
+  const __m128i diff_hi_1 = RightShiftWithRounding_U32(
+      _mm_abs_epi32(_mm_sub_epi32(pred_hi_01, pred_hi_11)), kRoundingBits10bpp);
+
+  const __m128i diff_1 = _mm_packus_epi32(diff_lo_1, diff_hi_1);
+  const __m128i scaled_diff_1 = _mm_srli_epi16(diff_1, kScaledDiffShift);
+
+  const __m128i adjusted_diff = _mm_adds_epu8(
+      _mm_packus_epi16(scaled_diff_0, scaled_diff_1), diff_offset);
+  const __m128i mask_value = _mm_min_epi8(adjusted_diff, mask_ceiling);
+
+  if (mask_is_inverse) {
+    const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
+    if (is_store_16) {
+      StoreAligned16(mask, inverted_mask_value);
+    } else {
+      StoreLo8(mask, inverted_mask_value);
+      StoreHi8(mask + mask_stride, inverted_mask_value);
+    }
+  } else {
+    if (is_store_16) {
+      StoreAligned16(mask, mask_value);
+    } else {
+      StoreLo8(mask, mask_value);
+      StoreHi8(mask + mask_stride, mask_value);
+    }
+  }
+}
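+
+// As in the 8bpp path, a scalar sketch of the computation above
+// (illustrative only):
+//   const int diff = RightShiftWithRounding(
+//       std::abs(pred_0[i] - pred_1[i]), kRoundingBits10bpp);
+//   const int value = std::min(38 + (diff >> kScaledDiffShift), 64);
+//   mask[i] = mask_is_inverse ? 64 - value : value;
+// The unpacking to 32 bits is necessary because two uint16_t predictions
+// can differ by more than an int16_t can represent.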
+
+#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP                                 \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, \
+                                                    mask_stride)
+
+#define WEIGHT8_PAIR_AND_STRIDE_10BPP \
+  WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;  \
+  pred_0 += 8 << 1;                   \
+  pred_1 += 8 << 1;                   \
+  mask += mask_stride << 1
+
+template <bool mask_is_inverse>
+void WeightMask8x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                const void* LIBGAV1_RESTRICT prediction_1,
+                                uint8_t* LIBGAV1_RESTRICT mask,
+                                ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+
+  WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 3;
+  do {
+    WEIGHT8_PAIR_AND_STRIDE_10BPP;
+    WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y5 = 5;
+  do {
+    WEIGHT8_PAIR_AND_STRIDE_10BPP;
+    WEIGHT8_PAIR_AND_STRIDE_10BPP;
+    WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  } while (--y5 != 0);
+  WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE_10BPP                                    \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+                                                   mask_stride)
+
+#define WEIGHT16_AND_STRIDE_10BPP \
+  WEIGHT16_WITHOUT_STRIDE_10BPP;  \
+  pred_0 += 16;                   \
+  pred_1 += 16;                   \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y = 7;
+  do {
+    WEIGHT16_AND_STRIDE_10BPP;
+  } while (--y != 0);
+  WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 5;
+  do {
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y5 = 6;
+  do {
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+  } while (--y5 != 0);
+  WEIGHT16_AND_STRIDE_10BPP;
+  WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 21;
+  do {
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE_10BPP                                        \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask,     \
+                                                   mask_stride);             \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+                                                   mask + 16, mask_stride)
+
+#define WEIGHT32_AND_STRIDE_10BPP \
+  WEIGHT32_WITHOUT_STRIDE_10BPP;  \
+  pred_0 += 32;                   \
+  pred_1 += 32;                   \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 5;
+  do {
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y5 = 6;
+  do {
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+  } while (--y5 != 0);
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 21;
+  do {
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE_10BPP                                        \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask,     \
+                                                   mask_stride);             \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+                                                   mask + 16, mask_stride);  \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+                                                   mask + 32, mask_stride);  \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+                                                   mask + 48, mask_stride)
+
+#define WEIGHT64_AND_STRIDE_10BPP \
+  WEIGHT64_WITHOUT_STRIDE_10BPP;  \
+  pred_0 += 64;                   \
+  pred_1 += 64;                   \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 5;
+  do {
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y5 = 6;
+  do {
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+  } while (--y5 != 0);
+  WEIGHT64_AND_STRIDE_10BPP;
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 21;
+  do {
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                   const void* LIBGAV1_RESTRICT prediction_1,
+                                   uint8_t* LIBGAV1_RESTRICT mask,
+                                   ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 42;
+  do {
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT64_AND_STRIDE_10BPP;
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                   const void* LIBGAV1_RESTRICT prediction_1,
+                                   uint8_t* LIBGAV1_RESTRICT mask,
+                                   ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 21;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+  do {
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (--y3 != 0);
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                    const void* LIBGAV1_RESTRICT prediction_1,
+                                    uint8_t* LIBGAV1_RESTRICT mask,
+                                    ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 42;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+  do {
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (--y3 != 0);
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += adjusted_mask_stride;
+
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
+  dsp->weight_mask[w_index][h_index][0] =                       \
+      WeightMask##width##x##height##_10bpp_SSE4_1<0>;           \
+  dsp->weight_mask[w_index][h_index][1] =                       \
+      WeightMask##width##x##height##_10bpp_SSE4_1<1>
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0);
+  INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1);
+  INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2);
+  INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0);
+  INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1);
+  INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2);
+  INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3);
+  INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0);
+  INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1);
+  INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2);
+  INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3);
+  INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1);
+  INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2);
+  INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3);
+  INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4);
+  INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3);
+  INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4);
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void WeightMaskInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void WeightMaskInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/weight_mask_sse4.h b/src/dsp/x86/weight_mask_sse4.h
new file mode 100644 (file)
index 0000000..e5d9d70
--- /dev/null
@@ -0,0 +1,171 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8
+#define LIBGAV1_Dsp10bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16
+#define LIBGAV1_Dsp10bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32
+#define LIBGAV1_Dsp10bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8
+#define LIBGAV1_Dsp10bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16
+#define LIBGAV1_Dsp10bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32
+#define LIBGAV1_Dsp10bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64
+#define LIBGAV1_Dsp10bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8
+#define LIBGAV1_Dsp10bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16
+#define LIBGAV1_Dsp10bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32
+#define LIBGAV1_Dsp10bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64
+#define LIBGAV1_Dsp10bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16
+#define LIBGAV1_Dsp10bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32
+#define LIBGAV1_Dsp10bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64
+#define LIBGAV1_Dsp10bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128
+#define LIBGAV1_Dsp10bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64
+#define LIBGAV1_Dsp10bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128
+#define LIBGAV1_Dsp10bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
diff --git a/src/film_grain.cc b/src/film_grain.cc
new file mode 100644 (file)
index 0000000..44a2543
--- /dev/null
@@ -0,0 +1,831 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+
+namespace {
+
+// The kGaussianSequence array contains random samples from a Gaussian
+// distribution with zero mean and a standard deviation of about 512,
+// clipped to the range [-2048, 2047] (representable by a signed integer
+// using 12 bits of precision) and rounded to the nearest multiple of 4.
+//
+// Note: It is important that every element in the kGaussianSequence array be
+// less than 2040, so that RightShiftWithRounding(kGaussianSequence[i], 4) is
+// less than 128 for bitdepth=8 (GrainType=int8_t).
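+// For example, the largest multiple of 4 below 2040 is 2036, and
+// RightShiftWithRounding(2036, 4) = (2036 + 8) >> 4 = 127, which fits in an
+// int8_t grain sample.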
+constexpr int16_t kGaussianSequence[/*2048*/] = {
+    56,    568,   -180,  172,   124,   -84,   172,   -64,   -900,  24,   820,
+    224,   1248,  996,   272,   -8,    -916,  -388,  -732,  -104,  -188, 800,
+    112,   -652,  -320,  -376,  140,   -252,  492,   -168,  44,    -788, 588,
+    -584,  500,   -228,  12,    680,   272,   -476,  972,   -100,  652,  368,
+    432,   -196,  -720,  -192,  1000,  -332,  652,   -136,  -552,  -604, -4,
+    192,   -220,  -136,  1000,  -52,   372,   -96,   -624,  124,   -24,  396,
+    540,   -12,   -104,  640,   464,   244,   -208,  -84,   368,   -528, -740,
+    248,   -968,  -848,  608,   376,   -60,   -292,  -40,   -156,  252,  -292,
+    248,   224,   -280,  400,   -244,  244,   -60,   76,    -80,   212,  532,
+    340,   128,   -36,   824,   -352,  -60,   -264,  -96,   -612,  416,  -704,
+    220,   -204,  640,   -160,  1220,  -408,  900,   336,   20,    -336, -96,
+    -792,  304,   48,    -28,   -1232, -1172, -448,  104,   -292,  -520, 244,
+    60,    -948,  0,     -708,  268,   108,   356,   -548,  488,   -344, -136,
+    488,   -196,  -224,  656,   -236,  -1128, 60,    4,     140,   276,  -676,
+    -376,  168,   -108,  464,   8,     564,   64,    240,   308,   -300, -400,
+    -456,  -136,  56,    120,   -408,  -116,  436,   504,   -232,  328,  844,
+    -164,  -84,   784,   -168,  232,   -224,  348,   -376,  128,   568,  96,
+    -1244, -288,  276,   848,   832,   -360,  656,   464,   -384,  -332, -356,
+    728,   -388,  160,   -192,  468,   296,   224,   140,   -776,  -100, 280,
+    4,     196,   44,    -36,   -648,  932,   16,    1428,  28,    528,  808,
+    772,   20,    268,   88,    -332,  -284,  124,   -384,  -448,  208,  -228,
+    -1044, -328,  660,   380,   -148,  -300,  588,   240,   540,   28,   136,
+    -88,   -436,  256,   296,   -1000, 1400,  0,     -48,   1056,  -136, 264,
+    -528,  -1108, 632,   -484,  -592,  -344,  796,   124,   -668,  -768, 388,
+    1296,  -232,  -188,  -200,  -288,  -4,    308,   100,   -168,  256,  -500,
+    204,   -508,  648,   -136,  372,   -272,  -120,  -1004, -552,  -548, -384,
+    548,   -296,  428,   -108,  -8,    -912,  -324,  -224,  -88,   -112, -220,
+    -100,  996,   -796,  548,   360,   -216,  180,   428,   -200,  -212, 148,
+    96,    148,   284,   216,   -412,  -320,  120,   -300,  -384,  -604, -572,
+    -332,  -8,    -180,  -176,  696,   116,   -88,   628,   76,    44,   -516,
+    240,   -208,  -40,   100,   -592,  344,   -308,  -452,  -228,  20,   916,
+    -1752, -136,  -340,  -804,  140,   40,    512,   340,   248,   184,  -492,
+    896,   -156,  932,   -628,  328,   -688,  -448,  -616,  -752,  -100, 560,
+    -1020, 180,   -800,  -64,   76,    576,   1068,  396,   660,   552,  -108,
+    -28,   320,   -628,  312,   -92,   -92,   -472,  268,   16,    560,  516,
+    -672,  -52,   492,   -100,  260,   384,   284,   292,   304,   -148, 88,
+    -152,  1012,  1064,  -228,  164,   -376,  -684,  592,   -392,  156,  196,
+    -524,  -64,   -884,  160,   -176,  636,   648,   404,   -396,  -436, 864,
+    424,   -728,  988,   -604,  904,   -592,  296,   -224,  536,   -176, -920,
+    436,   -48,   1176,  -884,  416,   -776,  -824,  -884,  524,   -548, -564,
+    -68,   -164,  -96,   692,   364,   -692,  -1012, -68,   260,   -480, 876,
+    -1116, 452,   -332,  -352,  892,   -1088, 1220,  -676,  12,    -292, 244,
+    496,   372,   -32,   280,   200,   112,   -440,  -96,   24,    -644, -184,
+    56,    -432,  224,   -980,  272,   -260,  144,   -436,  420,   356,  364,
+    -528,  76,    172,   -744,  -368,  404,   -752,  -416,  684,   -688, 72,
+    540,   416,   92,    444,   480,   -72,   -1416, 164,   -1172, -68,  24,
+    424,   264,   1040,  128,   -912,  -524,  -356,  64,    876,   -12,  4,
+    -88,   532,   272,   -524,  320,   276,   -508,  940,   24,    -400, -120,
+    756,   60,    236,   -412,  100,   376,   -484,  400,   -100,  -740, -108,
+    -260,  328,   -268,  224,   -200,  -416,  184,   -604,  -564,  -20,  296,
+    60,    892,   -888,  60,    164,   68,    -760,  216,   -296,  904,  -336,
+    -28,   404,   -356,  -568,  -208,  -1480, -512,  296,   328,   -360, -164,
+    -1560, -776,  1156,  -428,  164,   -504,  -112,  120,   -216,  -148, -264,
+    308,   32,    64,    -72,   72,    116,   176,   -64,   -272,  460,  -536,
+    -784,  -280,  348,   108,   -752,  -132,  524,   -540,  -776,  116,  -296,
+    -1196, -288,  -560,  1040,  -472,  116,   -848,  -1116, 116,   636,  696,
+    284,   -176,  1016,  204,   -864,  -648,  -248,  356,   972,   -584, -204,
+    264,   880,   528,   -24,   -184,  116,   448,   -144,  828,   524,  212,
+    -212,  52,    12,    200,   268,   -488,  -404,  -880,  824,   -672, -40,
+    908,   -248,  500,   716,   -576,  492,   -576,  16,    720,   -108, 384,
+    124,   344,   280,   576,   -500,  252,   104,   -308,  196,   -188, -8,
+    1268,  296,   1032,  -1196, 436,   316,   372,   -432,  -200,  -660, 704,
+    -224,  596,   -132,  268,   32,    -452,  884,   104,   -1008, 424,  -1348,
+    -280,  4,     -1168, 368,   476,   696,   300,   -8,    24,    180,  -592,
+    -196,  388,   304,   500,   724,   -160,  244,   -84,   272,   -256, -420,
+    320,   208,   -144,  -156,  156,   364,   452,   28,    540,   316,  220,
+    -644,  -248,  464,   72,    360,   32,    -388,  496,   -680,  -48,  208,
+    -116,  -408,  60,    -604,  -392,  548,   -840,  784,   -460,  656,  -544,
+    -388,  -264,  908,   -800,  -628,  -612,  -568,  572,   -220,  164,  288,
+    -16,   -308,  308,   -112,  -636,  -760,  280,   -668,  432,   364,  240,
+    -196,  604,   340,   384,   196,   592,   -44,   -500,  432,   -580, -132,
+    636,   -76,   392,   4,     -412,  540,   508,   328,   -356,  -36,  16,
+    -220,  -64,   -248,  -60,   24,    -192,  368,   1040,  92,    -24,  -1044,
+    -32,   40,    104,   148,   192,   -136,  -520,  56,    -816,  -224, 732,
+    392,   356,   212,   -80,   -424,  -1008, -324,  588,   -1496, 576,  460,
+    -816,  -848,  56,    -580,  -92,   -1372, -112,  -496,  200,   364,  52,
+    -140,  48,    -48,   -60,   84,    72,    40,    132,   -356,  -268, -104,
+    -284,  -404,  732,   -520,  164,   -304,  -540,  120,   328,   -76,  -460,
+    756,   388,   588,   236,   -436,  -72,   -176,  -404,  -316,  -148, 716,
+    -604,  404,   -72,   -88,   -888,  -68,   944,   88,    -220,  -344, 960,
+    472,   460,   -232,  704,   120,   832,   -228,  692,   -508,  132,  -476,
+    844,   -748,  -364,  -44,   1116,  -1104, -1056, 76,    428,   552,  -692,
+    60,    356,   96,    -384,  -188,  -612,  -576,  736,   508,   892,  352,
+    -1132, 504,   -24,   -352,  324,   332,   -600,  -312,  292,   508,  -144,
+    -8,    484,   48,    284,   -260,  -240,  256,   -100,  -292,  -204, -44,
+    472,   -204,  908,   -188,  -1000, -256,  92,    1164,  -392,  564,  356,
+    652,   -28,   -884,  256,   484,   -192,  760,   -176,  376,   -524, -452,
+    -436,  860,   -736,  212,   124,   504,   -476,  468,   76,    -472, 552,
+    -692,  -944,  -620,  740,   -240,  400,   132,   20,    192,   -196, 264,
+    -668,  -1012, -60,   296,   -316,  -828,  76,    -156,  284,   -768, -448,
+    -832,  148,   248,   652,   616,   1236,  288,   -328,  -400,  -124, 588,
+    220,   520,   -696,  1032,  768,   -740,  -92,   -272,  296,   448,  -464,
+    412,   -200,  392,   440,   -200,  264,   -152,  -260,  320,   1032, 216,
+    320,   -8,    -64,   156,   -1016, 1084,  1172,  536,   484,   -432, 132,
+    372,   -52,   -256,  84,    116,   -352,  48,    116,   304,   -384, 412,
+    924,   -300,  528,   628,   180,   648,   44,    -980,  -220,  1320, 48,
+    332,   748,   524,   -268,  -720,  540,   -276,  564,   -344,  -208, -196,
+    436,   896,   88,    -392,  132,   80,    -964,  -288,  568,   56,   -48,
+    -456,  888,   8,     552,   -156,  -292,  948,   288,   128,   -716, -292,
+    1192,  -152,  876,   352,   -600,  -260,  -812,  -468,  -28,   -120, -32,
+    -44,   1284,  496,   192,   464,   312,   -76,   -516,  -380,  -456, -1012,
+    -48,   308,   -156,  36,    492,   -156,  -808,  188,   1652,  68,   -120,
+    -116,  316,   160,   -140,  352,   808,   -416,  592,   316,   -480, 56,
+    528,   -204,  -568,  372,   -232,  752,   -344,  744,   -4,    324,  -416,
+    -600,  768,   268,   -248,  -88,   -132,  -420,  -432,  80,    -288, 404,
+    -316,  -1216, -588,  520,   -108,  92,    -320,  368,   -480,  -216, -92,
+    1688,  -300,  180,   1020,  -176,  820,   -68,   -228,  -260,  436,  -904,
+    20,    40,    -508,  440,   -736,  312,   332,   204,   760,   -372, 728,
+    96,    -20,   -632,  -520,  -560,  336,   1076,  -64,   -532,  776,  584,
+    192,   396,   -728,  -520,  276,   -188,  80,    -52,   -612,  -252, -48,
+    648,   212,   -688,  228,   -52,   -260,  428,   -412,  -272,  -404, 180,
+    816,   -796,  48,    152,   484,   -88,   -216,  988,   696,   188,  -528,
+    648,   -116,  -180,  316,   476,   12,    -564,  96,    476,   -252, -364,
+    -376,  -392,  556,   -256,  -576,  260,   -352,  120,   -16,   -136, -260,
+    -492,  72,    556,   660,   580,   616,   772,   436,   424,   -32,  -324,
+    -1268, 416,   -324,  -80,   920,   160,   228,   724,   32,    -516, 64,
+    384,   68,    -128,  136,   240,   248,   -204,  -68,   252,   -932, -120,
+    -480,  -628,  -84,   192,   852,   -404,  -288,  -132,  204,   100,  168,
+    -68,   -196,  -868,  460,   1080,  380,   -80,   244,   0,     484,  -888,
+    64,    184,   352,   600,   460,   164,   604,   -196,  320,   -64,  588,
+    -184,  228,   12,    372,   48,    -848,  -344,  224,   208,   -200, 484,
+    128,   -20,   272,   -468,  -840,  384,   256,   -720,  -520,  -464, -580,
+    112,   -120,  644,   -356,  -208,  -608,  -528,  704,   560,   -424, 392,
+    828,   40,    84,    200,   -152,  0,     -144,  584,   280,   -120, 80,
+    -556,  -972,  -196,  -472,  724,   80,    168,   -32,   88,    160,  -688,
+    0,     160,   356,   372,   -776,  740,   -128,  676,   -248,  -480, 4,
+    -364,  96,    544,   232,   -1032, 956,   236,   356,   20,    -40,  300,
+    24,    -676,  -596,  132,   1120,  -104,  532,   -1096, 568,   648,  444,
+    508,   380,   188,   -376,  -604,  1488,  424,   24,    756,   -220, -192,
+    716,   120,   920,   688,   168,   44,    -460,  568,   284,   1144, 1160,
+    600,   424,   888,   656,   -356,  -320,  220,   316,   -176,  -724, -188,
+    -816,  -628,  -348,  -228,  -380,  1012,  -452,  -660,  736,   928,  404,
+    -696,  -72,   -268,  -892,  128,   184,   -344,  -780,  360,   336,  400,
+    344,   428,   548,   -112,  136,   -228,  -216,  -820,  -516,  340,  92,
+    -136,  116,   -300,  376,   -244,  100,   -316,  -520,  -284,  -12,  824,
+    164,   -548,  -180,  -128,  116,   -924,  -828,  268,   -368,  -580, 620,
+    192,   160,   0,     -1676, 1068,  424,   -56,   -360,  468,   -156, 720,
+    288,   -528,  556,   -364,  548,   -148,  504,   316,   152,   -648, -620,
+    -684,  -24,   -376,  -384,  -108,  -920,  -1032, 768,   180,   -264, -508,
+    -1268, -260,  -60,   300,   -240,  988,   724,   -376,  -576,  -212, -736,
+    556,   192,   1092,  -620,  -880,  376,   -56,   -4,    -216,  -32,  836,
+    268,   396,   1332,  864,   -600,  100,   56,    -412,  -92,   356,  180,
+    884,   -468,  -436,  292,   -388,  -804,  -704,  -840,  368,   -348, 140,
+    -724,  1536,  940,   372,   112,   -372,  436,   -480,  1136,  296,  -32,
+    -228,  132,   -48,   -220,  868,   -1016, -60,   -1044, -464,  328,  916,
+    244,   12,    -736,  -296,  360,   468,   -376,  -108,  -92,   788,  368,
+    -56,   544,   400,   -672,  -420,  728,   16,    320,   44,    -284, -380,
+    -796,  488,   132,   204,   -596,  -372,  88,    -152,  -908,  -636, -572,
+    -624,  -116,  -692,  -200,  -56,   276,   -88,   484,   -324,  948,  864,
+    1000,  -456,  -184,  -276,  292,   -296,  156,   676,   320,   160,  908,
+    -84,   -1236, -288,  -116,  260,   -372,  -644,  732,   -756,  -96,  84,
+    344,   -520,  348,   -688,  240,   -84,   216,   -1044, -136,  -676, -396,
+    -1500, 960,   -40,   176,   168,   1516,  420,   -504,  -344,  -364, -360,
+    1216,  -940,  -380,  -212,  252,   -660,  -708,  484,   -444,  -152, 928,
+    -120,  1112,  476,   -260,  560,   -148,  -344,  108,   -196,  228,  -288,
+    504,   560,   -328,  -88,   288,   -1008, 460,   -228,  468,   -836, -196,
+    76,    388,   232,   412,   -1168, -716,  -644,  756,   -172,  -356, -504,
+    116,   432,   528,   48,    476,   -168,  -608,  448,   160,   -532, -272,
+    28,    -676,  -12,   828,   980,   456,   520,   104,   -104,  256,  -344,
+    -4,    -28,   -368,  -52,   -524,  -572,  -556,  -200,  768,   1124, -208,
+    -512,  176,   232,   248,   -148,  -888,  604,   -600,  -304,  804,  -156,
+    -212,  488,   -192,  -804,  -256,  368,   -360,  -916,  -328,  228,  -240,
+    -448,  -472,  856,   -556,  -364,  572,   -12,   -156,  -368,  -340, 432,
+    252,   -752,  -152,  288,   268,   -580,  -848,  -592,  108,   -76,  244,
+    312,   -716,  592,   -80,   436,   360,   4,     -248,  160,   516,  584,
+    732,   44,    -468,  -280,  -292,  -156,  -588,  28,    308,   912,  24,
+    124,   156,   180,   -252,  944,   -924,  -772,  -520,  -428,  -624, 300,
+    -212,  -1144, 32,    -724,  800,   -1128, -212,  -1288, -848,  180,  -416,
+    440,   192,   -576,  -792,  -76,   -1080, 80,    -532,  -352,  -132, 380,
+    -820,  148,   1112,  128,   164,   456,   700,   -924,  144,   -668, -384,
+    648,   -832,  508,   552,   -52,   -100,  -656,  208,   -568,  748,  -88,
+    680,   232,   300,   192,   -408,  -1012, -152,  -252,  -268,  272,  -876,
+    -664,  -648,  -332,  -136,  16,    12,    1152,  -28,   332,   -536, 320,
+    -672,  -460,  -316,  532,   -260,  228,   -40,   1052,  -816,  180,  88,
+    -496,  -556,  -672,  -368,  428,   92,    356,   404,   -408,  252,  196,
+    -176,  -556,  792,   268,   32,    372,   40,    96,    -332,  328,  120,
+    372,   -900,  -40,   472,   -264,  -592,  952,   128,   656,   112,  664,
+    -232,  420,   4,     -344,  -464,  556,   244,   -416,  -32,   252,  0,
+    -412,  188,   -696,  508,   -476,  324,   -1096, 656,   -312,  560,  264,
+    -136,  304,   160,   -64,   -580,  248,   336,   -720,  560,   -348, -288,
+    -276,  -196,  -500,  852,   -544,  -236,  -1128, -992,  -776,  116,  56,
+    52,    860,   884,   212,   -12,   168,   1020,  512,   -552,  924,  -148,
+    716,   188,   164,   -340,  -520,  -184,  880,   -152,  -680,  -208, -1156,
+    -300,  -528,  -472,  364,   100,   -744,  -1056, -32,   540,   280,  144,
+    -676,  -32,   -232,  -280,  -224,  96,    568,   -76,   172,   148,  148,
+    104,   32,    -296,  -32,   788,   -80,   32,    -16,   280,   288,  944,
+    428,   -484};
+static_assert(sizeof(kGaussianSequence) / sizeof(kGaussianSequence[0]) == 2048,
+              "");
+
+// The number of rows in a contiguous group computed by a single worker thread
+// before checking for the next available group.
+constexpr int kFrameChunkHeight = 8;
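+// e.g., a 1080-row frame is split into 1080 / 8 == 135 chunks per plane; each
+// worker claims one chunk at a time via an atomic job counter.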
+
+// |width| and |height| refer to the plane, not the frame, meaning any
+// subsampling should be applied by the caller.
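+// For example, AddNoise() passes SubsampledValue(width_, subsampling_x_) and
+// SubsampledValue(height_, subsampling_y_) when copying a chroma plane.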
+template <typename Pixel>
+inline void CopyImagePlane(const uint8_t* source_plane, ptrdiff_t source_stride,
+                           int width, int height, uint8_t* dest_plane,
+                           ptrdiff_t dest_stride) {
+  // If it's the same buffer there's nothing to do.
+  if (source_plane == dest_plane) return;
+
+  int y = 0;
+  do {
+    memcpy(dest_plane, source_plane, width * sizeof(Pixel));
+    source_plane += source_stride;
+    dest_plane += dest_stride;
+  } while (++y < height);
+}
+
+}  // namespace
+
+template <int bitdepth>
+FilmGrain<bitdepth>::FilmGrain(const FilmGrainParams& params,
+                               bool is_monochrome,
+                               bool color_matrix_is_identity, int subsampling_x,
+                               int subsampling_y, int width, int height,
+                               ThreadPool* thread_pool)
+    : params_(params),
+      is_monochrome_(is_monochrome),
+      color_matrix_is_identity_(color_matrix_is_identity),
+      subsampling_x_(subsampling_x),
+      subsampling_y_(subsampling_y),
+      width_(width),
+      height_(height),
+      template_uv_width_((subsampling_x != 0) ? kMinChromaWidth
+                                              : kMaxChromaWidth),
+      template_uv_height_((subsampling_y != 0) ? kMinChromaHeight
+                                               : kMaxChromaHeight),
+      thread_pool_(thread_pool) {}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::Init() {
+  // Section 7.18.3.3. Generate grain process.
+  const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
+  // If params_.num_y_points is 0, luma_grain_ will never be read, so we don't
+  // need to generate it.
+  const bool use_luma = params_.num_y_points > 0;
+  if (use_luma) {
+    GenerateLumaGrain(params_, luma_grain_);
+    // If params_.auto_regression_coeff_lag is 0, the filter is the identity
+    // filter and therefore can be skipped.
+    if (params_.auto_regression_coeff_lag > 0) {
+      dsp.film_grain
+          .luma_auto_regression[params_.auto_regression_coeff_lag - 1](
+              params_, luma_grain_);
+    }
+  } else {
+    // Have AddressSanitizer warn if luma_grain_ is used.
+    ASAN_POISON_MEMORY_REGION(luma_grain_, sizeof(luma_grain_));
+  }
+  if (!is_monochrome_) {
+    GenerateChromaGrains(params_, template_uv_width_, template_uv_height_,
+                         u_grain_, v_grain_);
+    if (params_.auto_regression_coeff_lag > 0 || use_luma) {
+      dsp.film_grain.chroma_auto_regression[static_cast<int>(
+          use_luma)][params_.auto_regression_coeff_lag](
+          params_, luma_grain_, subsampling_x_, subsampling_y_, u_grain_,
+          v_grain_);
+    }
+  }
+
+  // Section 7.18.3.4. Scaling lookup initialization process.
+
+  // Initialize scaling_lut_y_. If params_.num_y_points > 0, scaling_lut_y_
+  // is used for the Y plane. If params_.chroma_scaling_from_luma is true,
+  // scaling_lut_u_ and scaling_lut_v_ are the same as scaling_lut_y_ and are
+  // set up as aliases. So we need to initialize scaling_lut_y_ under these
+  // two conditions.
+  //
+  // Note: Although it does not seem to make sense, there are test vectors
+  // with chroma_scaling_from_luma=true and params_.num_y_points=0.
+#if LIBGAV1_MSAN
+  // Quiet film grain / md5 msan warnings.
+  memset(scaling_lut_y_, 0, sizeof(scaling_lut_y_));
+#endif
+  if (use_luma || params_.chroma_scaling_from_luma) {
+    dsp.film_grain.initialize_scaling_lut(
+        params_.num_y_points, params_.point_y_value, params_.point_y_scaling,
+        scaling_lut_y_, kScalingLutLength);
+  } else {
+    ASAN_POISON_MEMORY_REGION(scaling_lut_y_, sizeof(scaling_lut_y_));
+  }
+  if (!is_monochrome_) {
+    if (params_.chroma_scaling_from_luma) {
+      scaling_lut_u_ = scaling_lut_y_;
+      scaling_lut_v_ = scaling_lut_y_;
+    } else if (params_.num_u_points > 0 || params_.num_v_points > 0) {
+      const size_t buffer_size =
+          kScalingLutLength * (static_cast<int>(params_.num_u_points > 0) +
+                               static_cast<int>(params_.num_v_points > 0));
+      scaling_lut_chroma_buffer_.reset(new (std::nothrow) int16_t[buffer_size]);
+      if (scaling_lut_chroma_buffer_ == nullptr) return false;
+
+      int16_t* buffer = scaling_lut_chroma_buffer_.get();
+#if LIBGAV1_MSAN
+      // Quiet film grain / md5 msan warnings.
+      memset(buffer, 0, buffer_size * 2);
+#endif
+      if (params_.num_u_points > 0) {
+        scaling_lut_u_ = buffer;
+        dsp.film_grain.initialize_scaling_lut(
+            params_.num_u_points, params_.point_u_value,
+            params_.point_u_scaling, scaling_lut_u_, kScalingLutLength);
+        buffer += kScalingLutLength;
+      }
+      if (params_.num_v_points > 0) {
+        scaling_lut_v_ = buffer;
+        dsp.film_grain.initialize_scaling_lut(
+            params_.num_v_points, params_.point_v_value,
+            params_.point_v_scaling, scaling_lut_v_, kScalingLutLength);
+      }
+    }
+  }
+  return true;
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::GenerateLumaGrain(const FilmGrainParams& params,
+                                            GrainType* luma_grain) {
+  // If params.num_y_points is equal to 0, Section 7.18.3.3 specifies we set
+  // the luma_grain array to all zeros. But the Note at the end of Section
+  // 7.18.3.3 says luma_grain "will never be read in this case". So we don't
+  // call GenerateLumaGrain if params.num_y_points is equal to 0.
+  assert(params.num_y_points > 0);
+  const int shift = kBitdepth12 - bitdepth + params.grain_scale_shift;
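+  // e.g., bitdepth == 8 with grain_scale_shift == 0 yields shift == 4, since
+  // kGaussianSequence is defined at 12-bit scale.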
+  uint16_t seed = params.grain_seed;
+  GrainType* luma_grain_row = luma_grain;
+  for (int y = 0; y < kLumaHeight; ++y) {
+    for (int x = 0; x < kLumaWidth; ++x) {
+      luma_grain_row[x] = RightShiftWithRounding(
+          kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+    }
+    luma_grain_row += kLumaWidth;
+  }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::GenerateChromaGrains(const FilmGrainParams& params,
+                                               int chroma_width,
+                                               int chroma_height,
+                                               GrainType* u_grain,
+                                               GrainType* v_grain) {
+  const int shift = kBitdepth12 - bitdepth + params.grain_scale_shift;
+  if (params.num_u_points == 0 && !params.chroma_scaling_from_luma) {
+    memset(u_grain, 0, chroma_height * chroma_width * sizeof(*u_grain));
+  } else {
+    uint16_t seed = params.grain_seed ^ 0xb524;
+    GrainType* u_grain_row = u_grain;
+    assert(chroma_width > 0);
+    assert(chroma_height > 0);
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        u_grain_row[x] = RightShiftWithRounding(
+            kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+      } while (++x < chroma_width);
+
+      u_grain_row += chroma_width;
+    } while (++y < chroma_height);
+  }
+  if (params.num_v_points == 0 && !params.chroma_scaling_from_luma) {
+    memset(v_grain, 0, chroma_height * chroma_width * sizeof(*v_grain));
+  } else {
+    GrainType* v_grain_row = v_grain;
+    uint16_t seed = params.grain_seed ^ 0x49d8;
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        v_grain_row[x] = RightShiftWithRounding(
+            kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+      } while (++x < chroma_width);
+
+      v_grain_row += chroma_width;
+    } while (++y < chroma_height);
+  }
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AllocateNoiseStripes() {
+  const int half_height = DivideBy2(height_ + 1);
+  assert(half_height > 0);
+  // ceil(half_height / 16.0)
+  const int max_luma_num = DivideBy16(half_height + 15);
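+  // e.g., height_ == 1080: half_height == 540 and
+  // max_luma_num == (540 + 15) / 16 == 34.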
+  constexpr int kNoiseStripeHeight = 34;
+  size_t noise_buffer_size = kNoiseStripePadding;
+  if (params_.num_y_points > 0) {
+    noise_buffer_size += max_luma_num * kNoiseStripeHeight * width_;
+  }
+  if (!is_monochrome_) {
+    noise_buffer_size += 2 * max_luma_num *
+                         (kNoiseStripeHeight >> subsampling_y_) *
+                         SubsampledValue(width_, subsampling_x_);
+  }
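+  // e.g., a 1920x1080 4:2:0 frame: max_luma_num == 34, so the luma stripes
+  // need 34 * 34 * 1920 entries and the two chroma planes need
+  // 2 * 34 * 17 * 960, plus kNoiseStripePadding.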
+  noise_buffer_.reset(new (std::nothrow) GrainType[noise_buffer_size]);
+  if (noise_buffer_ == nullptr) return false;
+  GrainType* noise_buffer = noise_buffer_.get();
+  if (params_.num_y_points > 0) {
+    noise_stripes_[kPlaneY].Reset(max_luma_num, kNoiseStripeHeight * width_,
+                                  noise_buffer);
+    noise_buffer += max_luma_num * kNoiseStripeHeight * width_;
+  }
+  if (!is_monochrome_) {
+    noise_stripes_[kPlaneU].Reset(max_luma_num,
+                                  (kNoiseStripeHeight >> subsampling_y_) *
+                                      SubsampledValue(width_, subsampling_x_),
+                                  noise_buffer);
+    noise_buffer += max_luma_num * (kNoiseStripeHeight >> subsampling_y_) *
+                    SubsampledValue(width_, subsampling_x_);
+    noise_stripes_[kPlaneV].Reset(max_luma_num,
+                                  (kNoiseStripeHeight >> subsampling_y_) *
+                                      SubsampledValue(width_, subsampling_x_),
+                                  noise_buffer);
+  }
+  return true;
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AllocateNoiseImage() {
+  // When LIBGAV1_MSAN is enabled, zero initialize to quiet optimized film grain
+  // msan warnings.
+  constexpr bool zero_initialize = LIBGAV1_MSAN == 1;
+  if (params_.num_y_points > 0 &&
+      !noise_image_[kPlaneY].Reset(height_, width_ + kNoiseImagePadding,
+                                   zero_initialize)) {
+    return false;
+  }
+  if (!is_monochrome_) {
+    if (!noise_image_[kPlaneU].Reset(
+            (height_ + subsampling_y_) >> subsampling_y_,
+            ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
+            zero_initialize)) {
+      return false;
+    }
+    if (!noise_image_[kPlaneV].Reset(
+            (height_ + subsampling_y_) >> subsampling_y_,
+            ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
+            zero_initialize)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Uses |stripe_start_offset|, which is derived from the overlap flag, to skip
+// rows that are covered by the overlap computation.
+template <int bitdepth>
+void FilmGrain<bitdepth>::ConstructNoiseImage(
+    const Array2DView<GrainType>* noise_stripes, int width, int height,
+    int subsampling_x, int subsampling_y, int stripe_start_offset,
+    Array2D<GrainType>* noise_image) {
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  const int plane_height = (height + subsampling_y) >> subsampling_y;
+  const int stripe_height = 32 >> subsampling_y;
+  const int stripe_mask = stripe_height - 1;
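+  // e.g., with no subsampling, stripe_height == 32 and stripe_mask == 31, so
+  // the full-stripe loop below ends at plane_height rounded down to a
+  // multiple of 32.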
+  int y = 0;
+  // |luma_num| = y >> (5 - |subsampling_y|). Hence |luma_num| == 0 for all y up
+  // to either 16 or 32.
+  const GrainType* first_noise_stripe = (*noise_stripes)[0];
+  do {
+    memcpy((*noise_image)[y], first_noise_stripe + y * plane_width,
+           plane_width * sizeof(first_noise_stripe[0]));
+  } while (++y < std::min(stripe_height, plane_height));
+  // End special iterations for luma_num == 0.
+
+  int luma_num = 1;
+  for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+    const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+    int i = stripe_start_offset;
+    do {
+      memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
+             plane_width * sizeof(noise_stripe[0]));
+    } while (++i < stripe_height);
+  }
+
+  // If there is a partial stripe, copy any rows beyond the overlap rows.
+  const int remaining_height = plane_height - y;
+  if (remaining_height > stripe_start_offset) {
+    assert(luma_num < noise_stripes->rows());
+    const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+    int i = stripe_start_offset;
+    do {
+      memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
+             plane_width * sizeof(noise_stripe[0]));
+    } while (++i < remaining_height);
+  }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::BlendNoiseChromaWorker(
+    const dsp::Dsp& dsp, const Plane* planes, int num_planes,
+    std::atomic<int>* job_counter, int min_value, int max_chroma,
+    const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+    const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+    ptrdiff_t source_stride_uv, uint8_t* dest_plane_u, uint8_t* dest_plane_v,
+    ptrdiff_t dest_stride_uv) {
+  assert(num_planes > 0);
+  const int full_jobs_per_plane = height_ / kFrameChunkHeight;
+  const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
+  const int total_full_jobs = full_jobs_per_plane * num_planes;
+  // If the frame height is not a multiple of kFrameChunkHeight, one job with
+  // a smaller number of rows is necessary at the end of each plane.
+  const int total_jobs =
+      total_full_jobs + ((remainder_job_height == 0) ? 0 : num_planes);
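+  // e.g., height_ == 1080 with both chroma planes active: 1080 / 8 == 135
+  // full jobs per plane and no remainder, so total_jobs == 270.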
+  int job_index;
+  // Each job corresponds to a slice of kFrameChunkHeight rows in the luma
+  // plane. dsp->blend_noise_chroma handles subsampling.
+  // This loop body handles a slice of one plane or the other, depending on
+  // which are active. That way, threads working on consecutive jobs will keep
+  // the same region of luma source in working memory.
+  while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
+         total_jobs) {
+    const Plane plane = planes[job_index % num_planes];
+    const int slice_index = job_index / num_planes;
+    const int start_height = slice_index * kFrameChunkHeight;
+    const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
+
+    const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
+        source_plane_y + start_height * source_stride_y);
+    const int16_t* scaling_lut_uv;
+    const uint8_t* source_plane_uv;
+    uint8_t* dest_plane_uv;
+
+    if (plane == kPlaneU) {
+      scaling_lut_uv = scaling_lut_u_;
+      source_plane_uv = source_plane_u;
+      dest_plane_uv = dest_plane_u;
+    } else {
+      assert(plane == kPlaneV);
+      scaling_lut_uv = scaling_lut_v_;
+      source_plane_uv = source_plane_v;
+      dest_plane_uv = dest_plane_v;
+    }
+    const auto* source_cursor_uv = reinterpret_cast<const Pixel*>(
+        source_plane_uv + (start_height >> subsampling_y_) * source_stride_uv);
+    auto* dest_cursor_uv = reinterpret_cast<Pixel*>(
+        dest_plane_uv + (start_height >> subsampling_y_) * dest_stride_uv);
+    dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+        plane, params_, noise_image_, min_value, max_chroma, width_, job_height,
+        start_height, subsampling_x_, subsampling_y_, scaling_lut_uv,
+        source_cursor_y, source_stride_y, source_cursor_uv, source_stride_uv,
+        dest_cursor_uv, dest_stride_uv);
+  }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::BlendNoiseLumaWorker(
+    const dsp::Dsp& dsp, std::atomic<int>* job_counter, int min_value,
+    int max_luma, const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+    uint8_t* dest_plane_y, ptrdiff_t dest_stride_y) {
+  const int total_full_jobs = height_ / kFrameChunkHeight;
+  const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
+  const int total_jobs =
+      total_full_jobs + static_cast<int>(remainder_job_height > 0);
+  int job_index;
+  // Each job is some number of rows in a plane.
+  while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
+         total_jobs) {
+    const int start_height = job_index * kFrameChunkHeight;
+    const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
+
+    const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
+        source_plane_y + start_height * source_stride_y);
+    auto* dest_cursor_y =
+        reinterpret_cast<Pixel*>(dest_plane_y + start_height * dest_stride_y);
+    dsp.film_grain.blend_noise_luma(
+        noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
+        job_height, start_height, scaling_lut_y_, source_cursor_y,
+        source_stride_y, dest_cursor_y, dest_stride_y);
+  }
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AddNoise(
+    const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+    const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+    ptrdiff_t source_stride_uv, uint8_t* dest_plane_y, ptrdiff_t dest_stride_y,
+    uint8_t* dest_plane_u, uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv) {
+  if (!Init()) {
+    LIBGAV1_DLOG(ERROR, "Init() failed.");
+    return false;
+  }
+  if (!AllocateNoiseStripes()) {
+    LIBGAV1_DLOG(ERROR, "AllocateNoiseStripes() failed.");
+    return false;
+  }
+
+  const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
+  const bool use_luma = params_.num_y_points > 0;
+
+  // Construct noise stripes.
+  if (use_luma) {
+    // The luma plane is never subsampled.
+    dsp.film_grain
+        .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+            luma_grain_, params_.grain_seed, width_, height_,
+            /*subsampling_x=*/0, /*subsampling_y=*/0, &noise_stripes_[kPlaneY]);
+  }
+  if (!is_monochrome_) {
+    dsp.film_grain
+        .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+            u_grain_, params_.grain_seed, width_, height_, subsampling_x_,
+            subsampling_y_, &noise_stripes_[kPlaneU]);
+    dsp.film_grain
+        .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+            v_grain_, params_.grain_seed, width_, height_, subsampling_x_,
+            subsampling_y_, &noise_stripes_[kPlaneV]);
+  }
+
+  if (!AllocateNoiseImage()) {
+    LIBGAV1_DLOG(ERROR, "AllocateNoiseImage() failed.");
+    return false;
+  }
+
+  // Construct noise image.
+  if (use_luma) {
+    ConstructNoiseImage(
+        &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
+        /*subsampling_y=*/0, static_cast<int>(params_.overlap_flag) << 1,
+        &noise_image_[kPlaneY]);
+    if (params_.overlap_flag) {
+      dsp.film_grain.construct_noise_image_overlap(
+          &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
+          /*subsampling_y=*/0, &noise_image_[kPlaneY]);
+    }
+  }
+  if (!is_monochrome_) {
+    ConstructNoiseImage(&noise_stripes_[kPlaneU], width_, height_,
+                        subsampling_x_, subsampling_y_,
+                        static_cast<int>(params_.overlap_flag)
+                            << (1 - subsampling_y_),
+                        &noise_image_[kPlaneU]);
+    ConstructNoiseImage(&noise_stripes_[kPlaneV], width_, height_,
+                        subsampling_x_, subsampling_y_,
+                        static_cast<int>(params_.overlap_flag)
+                            << (1 - subsampling_y_),
+                        &noise_image_[kPlaneV]);
+    if (params_.overlap_flag) {
+      dsp.film_grain.construct_noise_image_overlap(
+          &noise_stripes_[kPlaneU], width_, height_, subsampling_x_,
+          subsampling_y_, &noise_image_[kPlaneU]);
+      dsp.film_grain.construct_noise_image_overlap(
+          &noise_stripes_[kPlaneV], width_, height_, subsampling_x_,
+          subsampling_y_, &noise_image_[kPlaneV]);
+    }
+  }
+
+  // Blend noise image.
+  int min_value;
+  int max_luma;
+  int max_chroma;
+  if (params_.clip_to_restricted_range) {
+    min_value = 16 << (bitdepth - kBitdepth8);
+    max_luma = 235 << (bitdepth - kBitdepth8);
+    if (color_matrix_is_identity_) {
+      max_chroma = max_luma;
+    } else {
+      max_chroma = 240 << (bitdepth - kBitdepth8);
+    }
+  } else {
+    min_value = 0;
+    max_luma = (256 << (bitdepth - kBitdepth8)) - 1;
+    max_chroma = max_luma;
+  }
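+  // e.g., for bitdepth == 10 with clip_to_restricted_range, min_value == 64,
+  // max_luma == 940, and max_chroma == 960 (or 940 when the color matrix is
+  // identity); otherwise the full range [0, 1023] is used.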
+
+  // Handle all chroma planes first because luma source may be altered in place.
+  if (!is_monochrome_) {
+    // A plain array is used here because a Vector cannot be passed by copy to
+    // the lambda capture that spawns the worker threads.
+    Plane planes_to_blend[2];
+    int num_planes = 0;
+    if (params_.chroma_scaling_from_luma) {
+      // Both noise planes are computed from the luma scaling lookup table.
+      planes_to_blend[num_planes++] = kPlaneU;
+      planes_to_blend[num_planes++] = kPlaneV;
+    } else {
+      const int height_uv = SubsampledValue(height_, subsampling_y_);
+      const int width_uv = SubsampledValue(width_, subsampling_x_);
+
+      // Noise is applied according to a lookup table defined by piecewise
+      // linear "points." If the lookup table is empty, that corresponds to
+      // outputting zero noise.
+      if (params_.num_u_points == 0) {
+        CopyImagePlane<Pixel>(source_plane_u, source_stride_uv, width_uv,
+                              height_uv, dest_plane_u, dest_stride_uv);
+      } else {
+        planes_to_blend[num_planes++] = kPlaneU;
+      }
+      if (params_.num_v_points == 0) {
+        CopyImagePlane<Pixel>(source_plane_v, source_stride_uv, width_uv,
+                              height_uv, dest_plane_v, dest_stride_uv);
+      } else {
+        planes_to_blend[num_planes++] = kPlaneV;
+      }
+    }
+    if (thread_pool_ != nullptr && num_planes > 0) {
+      const int num_workers = thread_pool_->num_threads();
+      BlockingCounter pending_workers(num_workers);
+      std::atomic<int> job_counter(0);
+      for (int i = 0; i < num_workers; ++i) {
+        thread_pool_->Schedule([this, dsp, &pending_workers, &planes_to_blend,
+                                num_planes, &job_counter, min_value, max_chroma,
+                                source_plane_y, source_stride_y, source_plane_u,
+                                source_plane_v, source_stride_uv, dest_plane_u,
+                                dest_plane_v, dest_stride_uv]() {
+          BlendNoiseChromaWorker(dsp, planes_to_blend, num_planes, &job_counter,
+                                 min_value, max_chroma, source_plane_y,
+                                 source_stride_y, source_plane_u,
+                                 source_plane_v, source_stride_uv, dest_plane_u,
+                                 dest_plane_v, dest_stride_uv);
+          pending_workers.Decrement();
+        });
+      }
+      BlendNoiseChromaWorker(
+          dsp, planes_to_blend, num_planes, &job_counter, min_value, max_chroma,
+          source_plane_y, source_stride_y, source_plane_u, source_plane_v,
+          source_stride_uv, dest_plane_u, dest_plane_v, dest_stride_uv);
+
+      pending_workers.Wait();
+    } else {
+      // Single threaded.
+      if (params_.num_u_points > 0 || params_.chroma_scaling_from_luma) {
+        dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+            kPlaneU, params_, noise_image_, min_value, max_chroma, width_,
+            height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
+            scaling_lut_u_, source_plane_y, source_stride_y, source_plane_u,
+            source_stride_uv, dest_plane_u, dest_stride_uv);
+      }
+      if (params_.num_v_points > 0 || params_.chroma_scaling_from_luma) {
+        dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+            kPlaneV, params_, noise_image_, min_value, max_chroma, width_,
+            height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
+            scaling_lut_v_, source_plane_y, source_stride_y, source_plane_v,
+            source_stride_uv, dest_plane_v, dest_stride_uv);
+      }
+    }
+  }
+  if (use_luma) {
+    if (thread_pool_ != nullptr) {
+      const int num_workers = thread_pool_->num_threads();
+      BlockingCounter pending_workers(num_workers);
+      std::atomic<int> job_counter(0);
+      for (int i = 0; i < num_workers; ++i) {
+        thread_pool_->Schedule(
+            [this, dsp, &pending_workers, &job_counter, min_value, max_luma,
+             source_plane_y, source_stride_y, dest_plane_y, dest_stride_y]() {
+              BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
+                                   source_plane_y, source_stride_y,
+                                   dest_plane_y, dest_stride_y);
+              pending_workers.Decrement();
+            });
+      }
+
+      BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
+                           source_plane_y, source_stride_y, dest_plane_y,
+                           dest_stride_y);
+      pending_workers.Wait();
+    } else {
+      dsp.film_grain.blend_noise_luma(
+          noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
+          height_, /*start_height=*/0, scaling_lut_y_, source_plane_y,
+          source_stride_y, dest_plane_y, dest_stride_y);
+    }
+  } else {
+    CopyImagePlane<Pixel>(source_plane_y, source_stride_y, width_, height_,
+                          dest_plane_y, dest_stride_y);
+  }
+
+  return true;
+}
+
+// Explicit instantiations.
+template class FilmGrain<kBitdepth8>;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template class FilmGrain<kBitdepth10>;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+template class FilmGrain<kBitdepth12>;
+#endif
+
+}  // namespace libgav1
diff --git a/src/film_grain.h b/src/film_grain.h
new file mode 100644 (file)
index 0000000..bda8458
--- /dev/null
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FILM_GRAIN_H_
+#define LIBGAV1_SRC_FILM_GRAIN_H_
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+// Film grain synthesis function signature. Section 7.18.3.
+// This function generates film grain noise and blends the noise with the
+// decoded frame.
+// |source_plane_y|, |source_plane_u|, and |source_plane_v| are the plane
+// buffers of the decoded frame. They are blended with the film grain noise and
+// written to |dest_plane_y|, |dest_plane_u|, and |dest_plane_v| as final
+// output for display. |source_plane_p| and |dest_plane_p| (where p is y, u, or
+// v) may point to the same buffer, in which case the film grain noise is added
+// in place.
+// |film_grain_params| are parameters read from frame header.
+// If |is_monochrome| is true, only the Y plane needs to be processed.
+// |color_matrix_is_identity| is true if the matrix_coefficients field in the
+// sequence header's color config is MC_IDENTITY.
+// |width| is the upscaled width of the frame.
+// |height| is the frame height.
+// |subsampling_x| and |subsampling_y| are subsamplings for UV planes, not used
+// if |is_monochrome| is true.
+// Returns true on success, or false on failure (e.g., out of memory).
+using FilmGrainSynthesisFunc = bool (*)(
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_u, ptrdiff_t source_stride_u,
+    const void* source_plane_v, ptrdiff_t source_stride_v,
+    const FilmGrainParams& film_grain_params, bool is_monochrome,
+    bool color_matrix_is_identity, int width, int height, int subsampling_x,
+    int subsampling_y, void* dest_plane_y, ptrdiff_t dest_stride_y,
+    void* dest_plane_u, ptrdiff_t dest_stride_u, void* dest_plane_v,
+    ptrdiff_t dest_stride_v);
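+//
+// A minimal usage sketch for a 4:2:0 frame (the variable names below are
+// hypothetical; |synthesize| is any function matching this signature):
+//   const bool ok = synthesize(src_y, stride_y, src_u, stride_uv, src_v,
+//                              stride_uv, film_grain_params,
+//                              /*is_monochrome=*/false,
+//                              /*color_matrix_is_identity=*/false, width,
+//                              height, /*subsampling_x=*/1,
+//                              /*subsampling_y=*/1, dest_y, stride_y, dest_u,
+//                              stride_uv, dest_v, stride_uv);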
+
+// Section 7.18.3.5. Add noise synthesis process.
+template <int bitdepth>
+class FilmGrain {
+ public:
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  FilmGrain(const FilmGrainParams& params, bool is_monochrome,
+            bool color_matrix_is_identity, int subsampling_x, int subsampling_y,
+            int width, int height, ThreadPool* thread_pool);
+
+  // Note: These static methods are declared public so that the unit tests can
+  // call them.
+
+  static void GenerateLumaGrain(const FilmGrainParams& params,
+                                GrainType* luma_grain);
+
+  // Generates the white noise arrays u_grain and v_grain, each chroma_width
+  // samples wide and chroma_height samples high.
+  static void GenerateChromaGrains(const FilmGrainParams& params,
+                                   int chroma_width, int chroma_height,
+                                   GrainType* u_grain, GrainType* v_grain);
+
+  // Copies rows from |noise_stripes| to |noise_image|, skipping rows that are
+  // subject to overlap.
+  static void ConstructNoiseImage(const Array2DView<GrainType>* noise_stripes,
+                                  int width, int height, int subsampling_x,
+                                  int subsampling_y, int stripe_start_offset,
+                                  Array2D<GrainType>* noise_image);
+
+  // Combines the film grain with the image data.
+  bool AddNoise(const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+                const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+                ptrdiff_t source_stride_uv, uint8_t* dest_plane_y,
+                ptrdiff_t dest_stride_y, uint8_t* dest_plane_u,
+                uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv);
+
+ private:
+  using Pixel =
+      typename std::conditional<bitdepth == 8, uint8_t, uint16_t>::type;
+  static constexpr int kScalingLutLength =
+      (bitdepth == 10)
+          ? (kScalingLookupTableSize + kScalingLookupTablePadding) << 2
+          : kScalingLookupTableSize + kScalingLookupTablePadding;
+
+  bool Init();
+
+  // Allocates noise_stripes_.
+  bool AllocateNoiseStripes();
+
+  bool AllocateNoiseImage();
+
+  void BlendNoiseChromaWorker(const dsp::Dsp& dsp, const Plane* planes,
+                              int num_planes, std::atomic<int>* job_counter,
+                              int min_value, int max_chroma,
+                              const uint8_t* source_plane_y,
+                              ptrdiff_t source_stride_y,
+                              const uint8_t* source_plane_u,
+                              const uint8_t* source_plane_v,
+                              ptrdiff_t source_stride_uv, uint8_t* dest_plane_u,
+                              uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv);
+
+  void BlendNoiseLumaWorker(const dsp::Dsp& dsp, std::atomic<int>* job_counter,
+                            int min_value, int max_luma,
+                            const uint8_t* source_plane_y,
+                            ptrdiff_t source_stride_y, uint8_t* dest_plane_y,
+                            ptrdiff_t dest_stride_y);
+
+  const FilmGrainParams& params_;
+  const bool is_monochrome_;
+  const bool color_matrix_is_identity_;
+  const int subsampling_x_;
+  const int subsampling_y_;
+  // Frame width and height.
+  const int width_;
+  const int height_;
+  // Section 7.18.3.3. Dimensions of the noise templates for chroma, which are
+  // known as CbGrain and CrGrain.
+  // These templates are used to construct the noise image for each plane by
+  // copying 32x32 blocks, with pseudorandom offsets, into "noise stripes."
+  // The luma noise template, known as the LumaGrain array, is an 82x73 block.
+  // Under subsampling, the width and height of the chroma templates become 44
+  // and 38, respectively.
+  // For more details see:
+  // A. Norkin and N. Birkbeck, "Film Grain Synthesis for AV1 Video Codec,"
+  // 2018 Data Compression Conference, Snowbird, UT, 2018, pp. 3-12.
+  const int template_uv_width_;
+  const int template_uv_height_;
+  // LumaGrain. The luma_grain array contains white noise generated for luma.
+  // The array size is fixed but subject to further optimization for SIMD.
+  GrainType luma_grain_[kLumaHeight * kLumaWidth];
+  // CbGrain and CrGrain. The maximum size of the u_grain and v_grain arrays is
+  // kMaxChromaHeight * kMaxChromaWidth. The actual size is
+  // template_uv_height_ * template_uv_width_.
+  GrainType u_grain_[kMaxChromaHeight * kMaxChromaWidth];
+  GrainType v_grain_[kMaxChromaHeight * kMaxChromaWidth];
+  // Scaling lookup tables.
+  int16_t scaling_lut_y_[kScalingLutLength];
+  int16_t* scaling_lut_u_ = nullptr;
+  int16_t* scaling_lut_v_ = nullptr;
+  // If allocated, this buffer holds kScalingLutLength values for each chroma
+  // plane that has scaling points, and scaling_lut_u_ and/or scaling_lut_v_
+  // point into it. When chroma scaling is derived from luma, scaling_lut_u_
+  // and scaling_lut_v_ instead point to scaling_lut_y_.
+  std::unique_ptr<int16_t[]> scaling_lut_chroma_buffer_;
+
+  // A two-dimensional array of noise data for each plane, generated for each
+  // 32-luma-sample-high stripe of the image. The first dimension is called
+  // luma_num. The second dimension is the size of one noise stripe.
+  //
+  // Each row of the Array2DView noise_stripes_[plane] is a conceptually
+  // two-dimensional array of |GrainType|s. The two-dimensional array of
+  // |GrainType|s is flattened into a one-dimensional buffer in this
+  // implementation.
+  //
+  // noise_stripes_[kPlaneY][luma_num] is an array that has 34 rows and
+  // |width_| columns and contains noise for the luma component.
+  //
+  // noise_stripes_[kPlaneU][luma_num] or noise_stripes_[kPlaneV][luma_num]
+  // is an array that has (34 >> subsampling_y_) rows and
+  // SubsampledValue(width_, subsampling_x_) columns and contains noise for the
+  // chroma components.
+  Array2DView<GrainType> noise_stripes_[kMaxPlanes];
+  // Owns the memory that the elements of noise_stripes_ point to.
+  std::unique_ptr<GrainType[]> noise_buffer_;
+
+  Array2D<GrainType> noise_image_[kMaxPlanes];
+  ThreadPool* const thread_pool_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_FILM_GRAIN_H_
diff --git a/src/film_grain_test.cc b/src/film_grain_test.cc
new file mode 100644 (file)
index 0000000..fc1f1b1
--- /dev/null
@@ -0,0 +1,2690 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <string>
+#include <tuple>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/film_grain.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+constexpr int kNumSpeedTests = 50;
+constexpr int kNumFilmGrainTestParams = 10;
+constexpr size_t kLumaBlockSize = kLumaWidth * kLumaHeight;
+constexpr size_t kChromaBlockSize = kMaxChromaWidth * kMaxChromaHeight;
+// Dimensions for unit tests that apply grain to the whole frame.
+constexpr size_t kNumTestStripes = 64;
+constexpr int kNoiseStripeHeight = 34;
+constexpr size_t kFrameWidth = 1921;
+constexpr size_t kFrameHeight = (kNumTestStripes - 1) * 32 + 1;
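+// With kNumTestStripes == 64, kFrameHeight == 63 * 32 + 1 == 2017, so the last
+// stripe covers only a single row.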
+
+/*
+  The film grain parameters for 10 frames were generated with the following
+  command line:
+  aomenc --end-usage=q --cq-level=20 --cpu-used=8 -w 1920 -h 1080 \
+    --denoise-noise-level=50 --ivf breaking_bad_21m23s_10frames.1920_1080.yuv \
+    -o breaking_bad_21m23s_10frames.1920_1080.noise50.ivf
+*/
+constexpr FilmGrainParams kFilmGrainParams[10] = {
+    {/*apply_grain=*/true,
+     /*update_grain=*/true,
+     /*chroma_scaling_from_luma=*/false,
+     /*overlap_flag=*/true,
+     /*clip_to_restricted_range=*/false,
+     /*num_y_points=*/7,
+     /*num_u_points=*/8,
+     /*num_v_points=*/8,
+     /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0},
+     /*point_y_scaling=*/{71, 71, 91, 99, 98, 100, 100, 0, 0, 0, 0, 0, 0, 0},
+     /*point_u_value=*/{0, 13, 27, 40, 54, 67, 94, 255, 0, 0},
+     /*point_u_scaling=*/{37, 37, 43, 48, 48, 50, 51, 51, 0, 0},
+     /*point_v_value=*/{0, 13, 27, 40, 54, 67, 107, 255, 0, 0},
+     /*point_v_scaling=*/{48, 48, 43, 33, 32, 33, 34, 34, 0, 0},
+     /*chroma_scaling=*/11,
+     /*auto_regression_coeff_lag=*/3,
+     /*auto_regression_coeff_y=*/{2,   -2,  -2,  10,  3, -2, 1,   -4,
+                                  5,   -1,  -25, -13, 3, -1, 0,   7,
+                                  -20, 103, 26,  -2,  1, 14, -49, 117},
+     /*auto_regression_coeff_u=*/{-2,  1,  -3, 4,   -4, 0,  3,   5,  -5,
+                                  -17, 17, 0,  -10, -5, -3, -30, 14, 70,
+                                  29,  9,  -2, -10, 50, 71, -11},
+     /*auto_regression_coeff_v=*/{3,   -2, -7, 6,   -7, -8, 3,   1,  -12,
+                                  -15, 28, 5,  -11, -2, -7, -27, 32, 62,
+                                  31,  18, -2, -6,  61, 43, 2},
+     /*auto_regression_shift=*/8,
+     /*grain_seed=*/7391,
+     /*reference_index=*/0,
+     /*grain_scale_shift=*/0,
+     /*u_multiplier=*/0,
+     /*u_luma_multiplier=*/64,
+     /*u_offset=*/0,
+     /*v_multiplier=*/0,
+     /*v_luma_multiplier=*/64,
+     /*v_offset=*/0},
+    {/*apply_grain=*/true,
+     /*update_grain=*/true,
+     /*chroma_scaling_from_luma=*/false,
+     /*overlap_flag=*/true,
+     /*clip_to_restricted_range=*/false,
+     /*num_y_points=*/8,
+     /*num_u_points=*/7,
+     /*num_v_points=*/8,
+     /*point_y_value=*/{0, 13, 27, 40, 54, 94, 134, 255, 0, 0, 0, 0, 0, 0},
+     /*point_y_scaling=*/{72, 72, 91, 99, 97, 100, 102, 102, 0, 0, 0, 0, 0, 0},
+     /*point_u_value=*/{0, 13, 40, 54, 67, 134, 255, 0, 0, 0},
+     /*point_u_scaling=*/{38, 38, 50, 49, 51, 53, 53, 0, 0, 0},
+     /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0},
+     /*point_v_scaling=*/{50, 50, 45, 34, 33, 35, 37, 37, 0, 0},
+     /*chroma_scaling=*/11,
+     /*auto_regression_coeff_lag=*/3,
+     /*auto_regression_coeff_y=*/{2,   -2,  -2,  10,  3,  -1, 1,   -3,
+                                  3,   1,   -27, -12, 2,  -1, 1,   7,
+                                  -17, 100, 27,  0,   -1, 13, -50, 116},
+     /*auto_regression_coeff_u=*/{-3,  1,  -2, 3,   -3, -1, 2,   5,  -3,
+                                  -16, 16, -2, -10, -2, -1, -31, 14, 70,
+                                  29,  9,  -1, -10, 47, 70, -11},
+     /*auto_regression_coeff_v=*/{1,   0,  -5, 5,   -6, -6, 2,   1,  -10,
+                                  -14, 26, 4,  -10, -3, -5, -26, 29, 63,
+                                  31,  17, -1, -6,  55, 47, 2},
+     /*auto_regression_shift=*/8,
+     /*grain_seed=*/10772,
+     /*reference_index=*/0,
+     /*grain_scale_shift=*/0,
+     /*u_multiplier=*/0,
+     /*u_luma_multiplier=*/64,
+     /*u_offset=*/0,
+     /*v_multiplier=*/0,
+     /*v_luma_multiplier=*/64,
+     /*v_offset=*/0},
+    {/*apply_grain=*/true,
+     /*update_grain=*/true,
+     /*chroma_scaling_from_luma=*/false,
+     /*overlap_flag=*/true,
+     /*clip_to_restricted_range=*/false,
+     /*num_y_points=*/8,
+     /*num_u_points=*/7,
+     /*num_v_points=*/8,
+     /*point_y_value=*/{0, 13, 27, 40, 54, 94, 134, 255, 0, 0, 0, 0, 0, 0},
+     /*point_y_scaling=*/{71, 71, 91, 99, 98, 101, 103, 103, 0, 0, 0, 0, 0, 0},
+     /*point_u_value=*/{0, 13, 40, 54, 81, 107, 255, 0, 0, 0},
+     /*point_u_scaling=*/{37, 37, 49, 48, 51, 52, 52, 0, 0, 0},
+     /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0},
+     /*point_v_scaling=*/{49, 49, 44, 34, 32, 34, 36, 36, 0, 0},
+     /*chroma_scaling=*/11,
+     /*auto_regression_coeff_lag=*/3,
+     /*auto_regression_coeff_y=*/{1,   -2,  -2,  10,  3, -1, 1,   -4,
+                                  4,   1,   -26, -12, 2, -1, 1,   7,
+                                  -18, 101, 26,  -1,  0, 13, -49, 116},
+     /*auto_regression_coeff_u=*/{-3,  1,  -3, 4,   -3, -1, 2,   5,  -4,
+                                  -16, 17, -2, -10, -3, -2, -31, 15, 70,
+                                  28,  9,  -1, -10, 48, 70, -11},
+     /*auto_regression_coeff_v=*/{1,   -1, -6, 5,   -6, -7, 2,   2,  -11,
+                                  -14, 27, 5,  -11, -3, -6, -26, 30, 62,
+                                  30,  18, -2, -6,  58, 45, 2},
+     /*auto_regression_shift=*/8,
+     /*grain_seed=*/14153,
+     /*reference_index=*/0,
+     /*grain_scale_shift=*/0,
+     /*u_multiplier=*/0,
+     /*u_luma_multiplier=*/64,
+     /*u_offset=*/0,
+     /*v_multiplier=*/0,
+     /*v_luma_multiplier=*/64,
+     /*v_offset=*/0},
+    {/*apply_grain=*/true,
+     /*update_grain=*/true,
+     /*chroma_scaling_from_luma=*/false,
+     /*overlap_flag=*/true,
+     /*clip_to_restricted_range=*/false,
+     /*num_y_points=*/7,
+     /*num_u_points=*/5,
+     /*num_v_points=*/7,
+     /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0},
+     /*point_y_scaling=*/{71, 71, 90, 99, 98, 100, 100, 0, 0, 0, 0, 0, 0, 0},
+     /*point_u_value=*/{0, 13, 40, 107, 255, 0, 0, 0, 0, 0},
+     /*point_u_scaling=*/{37, 37, 48, 51, 51, 0, 0, 0, 0, 0},
+     /*point_v_value=*/{0, 13, 27, 40, 54, 94, 255, 0, 0, 0},
+     /*point_v_scaling=*/{49, 49, 43, 33, 32, 34, 34, 0, 0, 0},
+     /*chroma_scaling=*/11,
+     /*auto_regression_coeff_lag=*/3,
+     /*auto_regression_coeff_y=*/{2,   -2,  -2,  10,  3, -1, 1,   -4,
+                                  6,   0,   -26, -13, 3, -1, 1,   6,
+                                  -20, 103, 26,  -2,  1, 13, -48, 117},
+     /*auto_regression_coeff_u=*/{-3,  1,  -2, 4,   -4, -1, 2,   5,  -5,
+                                  -16, 18, -1, -10, -3, -2, -30, 16, 69,
+                                  28,  9,  -2, -10, 50, 68, -11},
+     /*auto_regression_coeff_v=*/{2,   -1, -6, 5,   -6, -7, 2,   2,  -11,
+                                  -15, 29, 4,  -10, -3, -6, -26, 30, 62,
+                                  31,  18, -3, -6,  59, 45, 3},
+     /*auto_regression_shift=*/8,
+     /*grain_seed=*/17534,
+     /*reference_index=*/0,
+     /*grain_scale_shift=*/0,
+     /*u_multiplier=*/0,
+     /*u_luma_multiplier=*/64,
+     /*u_offset=*/0,
+     /*v_multiplier=*/0,
+     /*v_luma_multiplier=*/64,
+     /*v_offset=*/0},
+    {/*apply_grain=*/true,
+     /*update_grain=*/true,
+     /*chroma_scaling_from_luma=*/false,
+     /*overlap_flag=*/true,
+     /*clip_to_restricted_range=*/false,
+     /*num_y_points=*/8,
+     /*num_u_points=*/7,
+     /*num_v_points=*/7,
+     /*point_y_value=*/{0, 13, 27, 40, 54, 94, 134, 255, 0, 0, 0, 0, 0, 0},
+     /*point_y_scaling=*/{71, 71, 91, 99, 98, 101, 103, 103, 0, 0, 0, 0, 0, 0},
+     /*point_u_value=*/{0, 13, 40, 54, 81, 107, 255, 0, 0, 0},
+     /*point_u_scaling=*/{37, 37, 49, 49, 52, 53, 53, 0, 0, 0},
+     /*point_v_value=*/{0, 13, 27, 40, 54, 94, 255, 0, 0, 0},
+     /*point_v_scaling=*/{50, 50, 44, 34, 33, 36, 37, 0, 0, 0},
+     /*chroma_scaling=*/11,
+     /*auto_regression_coeff_lag=*/3,
+     /*auto_regression_coeff_y=*/{2,   -2,  -2,  10,  3, -1, 1,   -4,
+                                  3,   1,   -26, -12, 2, -1, 1,   7,
+                                  -17, 101, 26,  0,   0, 13, -50, 116},
+     /*auto_regression_coeff_u=*/{-2,  1,  -2, 3,   -3, -1, 2,   5,  -4,
+                                  -16, 16, -2, -10, -3, -1, -31, 14, 70,
+                                  28,  9,  -1, -10, 48, 70, -11},
+     /*auto_regression_coeff_v=*/{1,   0,  -5, 5,   -6, -6, 2,   2,  -10,
+                                  -14, 26, 4,  -10, -3, -5, -26, 29, 63,
+                                  30,  17, -1, -6,  56, 47, 3},
+     /*auto_regression_shift=*/8,
+     /*grain_seed=*/20915,
+     /*reference_index=*/0,
+     /*grain_scale_shift=*/0,
+     /*u_multiplier=*/0,
+     /*u_luma_multiplier=*/64,
+     /*u_offset=*/0,
+     /*v_multiplier=*/0,
+     /*v_luma_multiplier=*/64,
+     /*v_offset=*/0},
+    {/*apply_grain=*/true,
+     /*update_grain=*/true,
+     /*chroma_scaling_from_luma=*/false,
+     /*overlap_flag=*/true,
+     /*clip_to_restricted_range=*/false,
+     /*num_y_points=*/7,
+     /*num_u_points=*/7,
+     /*num_v_points=*/7,
+     /*point_y_value=*/{0, 13, 27, 40, 54, 134, 255, 0, 0, 0, 0, 0, 0, 0},
+     /*point_y_scaling=*/{72, 72, 91, 99, 97, 101, 101, 0, 0, 0, 0, 0, 0, 0},
+     /*point_u_value=*/{0, 13, 40, 54, 67, 107, 255, 0, 0, 0},
+     /*point_u_scaling=*/{38, 38, 51, 50, 52, 53, 54, 0, 0, 0},
+     /*point_v_value=*/{0, 13, 27, 40, 54, 94, 255, 0, 0, 0},
+     /*point_v_scaling=*/{51, 51, 45, 35, 33, 36, 36, 0, 0, 0},
+     /*chroma_scaling=*/11,
+     /*auto_regression_coeff_lag=*/3,
+     /*auto_regression_coeff_y=*/{2,   -2,  -2,  9,   3,  -1, 1,   -3,
+                                  2,   2,   -27, -12, 2,  0,  1,   7,
+                                  -16, 100, 27,  0,   -1, 13, -51, 116},
+     /*auto_regression_coeff_u=*/{-3,  1,  -2, 3,   -3, -1, 1,   4,  -2,
+                                  -17, 14, -3, -10, -2, 0,  -31, 14, 71,
+                                  29,  8,  -2, -10, 45, 71, -11},
+     /*auto_regression_coeff_v=*/{0,   -1, -5, 4,   -6, -5, 2,   1,  -9,
+                                  -14, 24, 3,  -10, -3, -4, -25, 29, 63,
+                                  31,  16, -1, -7,  54, 48, 2},
+     /*auto_regression_shift=*/8,
+     /*grain_seed=*/24296,
+     /*reference_index=*/0,
+     /*grain_scale_shift=*/0,
+     /*u_multiplier=*/0,
+     /*u_luma_multiplier=*/64,
+     /*u_offset=*/0,
+     /*v_multiplier=*/0,
+     /*v_luma_multiplier=*/64,
+     /*v_offset=*/0},
+    {/*apply_grain=*/true,
+     /*update_grain=*/true,
+     /*chroma_scaling_from_luma=*/false,
+     /*overlap_flag=*/true,
+     /*clip_to_restricted_range=*/false,
+     /*num_y_points=*/7,
+     /*num_u_points=*/7,
+     /*num_v_points=*/8,
+     /*point_y_value=*/{0, 13, 27, 40, 54, 134, 255, 0, 0, 0, 0, 0, 0, 0},
+     /*point_y_scaling=*/{72, 72, 91, 99, 97, 101, 101, 0, 0, 0, 0, 0, 0, 0},
+     /*point_u_value=*/{0, 13, 40, 54, 67, 134, 255, 0, 0, 0},
+     /*point_u_scaling=*/{38, 38, 50, 50, 51, 53, 53, 0, 0, 0},
+     /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0},
+     /*point_v_scaling=*/{50, 50, 45, 34, 33, 35, 36, 36, 0, 0},
+     /*chroma_scaling=*/11,
+     /*auto_regression_coeff_lag=*/3,
+     /*auto_regression_coeff_y=*/{2,   -2,  -2,  10,  3,  -1, 1,   -3,
+                                  3,   2,   -27, -12, 2,  0,  1,   7,
+                                  -17, 100, 27,  0,   -1, 13, -51, 116},
+     /*auto_regression_coeff_u=*/{-3,  1,  -2, 3,   -3, -1, 1,   5,  -3,
+                                  -16, 15, -2, -10, -2, -1, -31, 14, 70,
+                                  29,  8,  -1, -10, 46, 71, -11},
+     /*auto_regression_coeff_v=*/{1,   0,  -5, 5,   -6, -5, 2,   1,  -9,
+                                  -14, 25, 4,  -10, -3, -5, -25, 29, 63,
+                                  31,  17, -1, -7,  55, 47, 2},
+     /*auto_regression_shift=*/8,
+     /*grain_seed=*/27677,
+     /*reference_index=*/0,
+     /*grain_scale_shift=*/0,
+     /*u_multiplier=*/0,
+     /*u_luma_multiplier=*/64,
+     /*u_offset=*/0,
+     /*v_multiplier=*/0,
+     /*v_luma_multiplier=*/64,
+     /*v_offset=*/0},
+    {/*apply_grain=*/true,
+     /*update_grain=*/true,
+     /*chroma_scaling_from_luma=*/false,
+     /*overlap_flag=*/true,
+     /*clip_to_restricted_range=*/false,
+     /*num_y_points=*/7,
+     /*num_u_points=*/7,
+     /*num_v_points=*/8,
+     /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0},
+     /*point_y_scaling=*/{72, 72, 92, 99, 97, 101, 101, 0, 0, 0, 0, 0, 0, 0},
+     /*point_u_value=*/{0, 13, 40, 54, 67, 174, 255, 0, 0, 0},
+     /*point_u_scaling=*/{38, 38, 51, 50, 52, 54, 54, 0, 0, 0},
+     /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0},
+     /*point_v_scaling=*/{51, 51, 46, 35, 33, 35, 37, 37, 0, 0},
+     /*chroma_scaling=*/11,
+     /*auto_regression_coeff_lag=*/3,
+     /*auto_regression_coeff_y=*/{1,   -1, -2,  9,   3,  -1, 1,   -3,
+                                  2,   2,  -28, -12, 2,  0,  1,   8,
+                                  -16, 99, 27,  0,   -1, 13, -51, 116},
+     /*auto_regression_coeff_u=*/{-3,  1,  -2, 3,   -3, -1, 2,   4,  -2,
+                                  -16, 14, -3, -10, -2, 0,  -31, 13, 71,
+                                  29,  8,  -2, -11, 44, 72, -11},
+     /*auto_regression_coeff_v=*/{0,   -1, -5, 4,   -6, -4, 2,   1,  -9,
+                                  -13, 23, 3,  -10, -3, -4, -25, 28, 63,
+                                  32,  16, -1, -7,  54, 49, 2},
+     /*auto_regression_shift=*/8,
+     /*grain_seed=*/31058,
+     /*reference_index=*/0,
+     /*grain_scale_shift=*/0,
+     /*u_multiplier=*/0,
+     /*u_luma_multiplier=*/64,
+     /*u_offset=*/0,
+     /*v_multiplier=*/0,
+     /*v_luma_multiplier=*/64,
+     /*v_offset=*/0},
+    {/*apply_grain=*/true,
+     /*update_grain=*/true,
+     /*chroma_scaling_from_luma=*/false,
+     /*overlap_flag=*/true,
+     /*clip_to_restricted_range=*/false,
+     /*num_y_points=*/7,
+     /*num_u_points=*/7,
+     /*num_v_points=*/9,
+     /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0},
+     /*point_y_scaling=*/{72, 72, 92, 99, 98, 100, 98, 0, 0, 0, 0, 0, 0, 0},
+     /*point_u_value=*/{0, 13, 40, 54, 67, 228, 255, 0, 0, 0},
+     /*point_u_scaling=*/{38, 38, 51, 51, 52, 54, 54, 0, 0, 0},
+     /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 201, 255, 0},
+     /*point_v_scaling=*/{51, 51, 46, 35, 34, 35, 37, 37, 37, 0},
+     /*chroma_scaling=*/11,
+     /*auto_regression_coeff_lag=*/3,
+     /*auto_regression_coeff_y=*/{1,   -1, -2,  9,   3,  -1, 1,   -3,
+                                  2,   2,  -28, -12, 2,  0,  1,   8,
+                                  -16, 99, 27,  0,   -1, 13, -52, 116},
+     /*auto_regression_coeff_u=*/{-3,  1,  -2, 3,   -3, -1, 1,   4,  -2,
+                                  -16, 13, -3, -10, -2, 0,  -31, 13, 71,
+                                  29,  8,  -2, -11, 44, 72, -11},
+     /*auto_regression_coeff_v=*/{0,   -1, -5, 4,   -6, -4, 2,   2,  -8,
+                                  -13, 23, 3,  -10, -3, -4, -25, 28, 63,
+                                  32,  16, -1, -7,  54, 49, 2},
+     /*auto_regression_shift=*/8,
+     /*grain_seed=*/34439,
+     /*reference_index=*/0,
+     /*grain_scale_shift=*/0,
+     /*u_multiplier=*/0,
+     /*u_luma_multiplier=*/64,
+     /*u_offset=*/0,
+     /*v_multiplier=*/0,
+     /*v_luma_multiplier=*/64,
+     /*v_offset=*/0},
+    {/*apply_grain=*/true,
+     /*update_grain=*/true,
+     /*chroma_scaling_from_luma=*/false,
+     /*overlap_flag=*/true,
+     /*clip_to_restricted_range=*/false,
+     /*num_y_points=*/7,
+     /*num_u_points=*/7,
+     /*num_v_points=*/9,
+     /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0},
+     /*point_y_scaling=*/{72, 72, 92, 99, 98, 99, 95, 0, 0, 0, 0, 0, 0, 0},
+     /*point_u_value=*/{0, 13, 40, 54, 67, 228, 255, 0, 0, 0},
+     /*point_u_scaling=*/{39, 39, 51, 51, 52, 54, 54, 0, 0, 0},
+     /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 201, 255, 0},
+     /*point_v_scaling=*/{51, 51, 46, 35, 34, 35, 36, 35, 35, 0},
+     /*chroma_scaling=*/11,
+     /*auto_regression_coeff_lag=*/3,
+     /*auto_regression_coeff_y=*/{1,   -1, -2,  9,   3,  -1, 1,   -3,
+                                  2,   2,  -28, -11, 2,  0,  1,   8,
+                                  -16, 99, 27,  0,   -1, 13, -52, 116},
+     /*auto_regression_coeff_u=*/{-3,  1,  -2, 3,   -3, -1, 1,   4,  -2,
+                                  -16, 13, -3, -10, -2, 0,  -30, 13, 71,
+                                  29,  8,  -2, -10, 43, 72, -11},
+     /*auto_regression_coeff_v=*/{0,   -1, -5, 3,   -6, -4, 2,   2,  -8,
+                                  -13, 23, 3,  -10, -3, -4, -25, 28, 64,
+                                  32,  16, -1, -7,  53, 49, 2},
+     /*auto_regression_shift=*/8,
+     /*grain_seed=*/37820,
+     /*reference_index=*/0,
+     /*grain_scale_shift=*/0,
+     /*u_multiplier=*/0,
+     /*u_luma_multiplier=*/64,
+     /*u_offset=*/0,
+     /*v_multiplier=*/0,
+     /*v_luma_multiplier=*/64,
+     /*v_offset=*/0}};
+
+const char* GetTestDigestLuma(int bitdepth, int param_index) {
+  static const char* const kTestDigestsLuma8bpp[10] = {
+      "80da8e849110a10c0a73f9dec0d9a2fb", "54352f02aeda541e17a4c2d208897e2b",
+      "2ad9021124c82aca3e7c9517d00d1236", "f6c5f64513925b09ceba31e92511f8a1",
+      "46c6006578c68c3c8619f7a389c7de45", "fcddbd27545254dc50f1c333c8b7e313",
+      "c6d4dc181bf7f2f93ae099b836685151", "2949ef836748271195914fef9acf4e46",
+      "524e79bb87ed550e123d00a61df94381", "182222470d7b7a80017521d0261e4474",
+  };
+  static const char* const kTestDigestsLuma10bpp[10] = {
+      "27a49a2131fb6d4dd4b8c34da1b7642e", "4ea9134f6831dd398545c85b2a68e31f",
+      "4e12232a18a2b06e958d7ab6b953faad", "0ede12864ddaced2d8062ffa4225ce24",
+      "5fee492c4a430b2417a64aa4920b69e9", "39af842a3f9370d796e8ef047c0c42a8",
+      "0efbad5f9dc07391ad243232b8df1787", "2bd41882cd82960019aa2b87d5fb1fbc",
+      "1c66629c0c4e7b6f9b0a7a6944fbad50", "2c633a50ead62f8e844a409545f46244",
+  };
+  static const char* const kTestDigestsLuma12bpp[10] = {
+      "1dc9b38a93454a85eb924f25346ae369", "5f9d311ee5384a5a902f8e2d1297319e",
+      "cf1a35878720564c7a741f91eef66565", "47a0608fe0f6f7ccae42a5ca05783cbf",
+      "dbc28da0178e3c18a036c3f2203c300f", "04911d2074e3252119ee2d80426b8c01",
+      "df19ab8103c40b726c842ccf7772208b", "39276967eb16710d98f82068c3eeba41",
+      "b83100f18abb2062d9c9969f07182b86", "b39a69515491329698cf66f6d4fa371f",
+  };
+
+  switch (bitdepth) {
+    case 8:
+      return kTestDigestsLuma8bpp[param_index];
+    case 10:
+      return kTestDigestsLuma10bpp[param_index];
+    case 12:
+      return kTestDigestsLuma12bpp[param_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+const char* GetTestDigestChromaU(int bitdepth, int param_index) {
+  static const char* const kTestDigestsChromaU8bpp[10] = {
+      "e56b7bbe9f39bf987770b18aeca59514", "d0b3fd3cf2901dae31b73f20c510d83e",
+      "800c01d58d9fb72136d21ec2bb07899a", "4cd0badba679e8edbcd60a931fce49a1",
+      "cabec236cc17f91f3f08d8cde867aa72", "380a2205cf2d40c6a27152585f61a3b0",
+      "3813526234dc7f90f80f6684772c729a", "97a43a73066d88f9cbd915d56fc9c196",
+      "5b70b27a43dd63b03e23aecd3a935071", "d5cc98685582ffd47a41a97d2e377ac8",
+  };
+  static const char* const kTestDigestsChromaU10bpp[10] = {
+      "9a6d0369ba86317598e65913276dae6d", "2512bdc4c88f21f8185b040b7752d1db",
+      "1e86b779ce6555fcf5bd0ade2af67e73", "5ad463a354ffce522c52b616fb122024",
+      "290d53c22c2143b0882acb887da3fdf1", "54622407d865371d7e70bbf29fdda626",
+      "be306c6a94c55dbd9ef514f0ad4a0011", "904602329b0dec352b3b177b0a2554d2",
+      "58afc9497d968c67fdf2c0cf23b33aa3", "74fee7be6f62724bf901fdd04a733b46",
+  };
+  static const char* const kTestDigestsChromaU12bpp[10] = {
+      "846d608050fe7c19d6cabe2d53cb7821", "2caf4665a26aad50f68497e4b1326417",
+      "ce40f0f8f8c207c7c985464c812fea33", "820de51d07a21da5c00833bab546f1fa",
+      "5e7bedd8933cd274af03babb4dbb94dd", "d137cf584eabea86387460a6d3f62bfe",
+      "f206e0c6ed35b3ab35c6ff37e151e963", "55d87981b7044df225b3b5935185449b",
+      "6a655c8bf4df6af0e80ae6d004a73a25", "6234ae36076cc77161af6e6e3c04449a",
+  };
+
+  switch (bitdepth) {
+    case 8:
+      return kTestDigestsChromaU8bpp[param_index];
+    case 10:
+      return kTestDigestsChromaU10bpp[param_index];
+    case 12:
+      return kTestDigestsChromaU12bpp[param_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+const char* GetTestDigestChromaV(int bitdepth, int param_index) {
+  static const char* const kTestDigestsChromaV8bpp[10] = {
+      "7205ed6c07ed27b7b52d871e0559b8fa", "fad033b1482dba0ed2d450b461fa310e",
+      "6bb39798ec6a0f7bda0b0fcb0a555734", "08c19856e10123ae520ccfc63e2fbe7b",
+      "a7695a6b69fba740a50310dfa6cf1c00", "ac2eac2d13fc5b21c4f2995d5abe14b9",
+      "be35cb30062db628a9e1304fca8b75dc", "f5bfc7a910c76bcd5b32c40772170879",
+      "aca07b37d63f978d76df5cd75d0cea5e", "107c7c56d4ec21f346a1a02206301b0d",
+  };
+  static const char* const kTestDigestsChromaV10bpp[10] = {
+      "910724a77710996c90e272f1c1e9ff8e", "d293f861580770a89f1e266931a012ad",
+      "9e4f0c85fb533e51238586f9c3e68b6e", "a5ff4478d9eeb2168262c2e955e17a4f",
+      "fba6b1e8f28e4e90c836d41f28a0c154", "50b9a93f9a1f3845e6903bff9270a3e6",
+      "7b1624c3543badf5fadaee4d1e602e6b", "3be074e4ca0eec5770748b15661aaadd",
+      "639197401032f272d6c30666a2d08f43", "28075dd34246bf9d5e6197b1944f646a",
+  };
+  static const char* const kTestDigestsChromaV12bpp[10] = {
+      "4957ec919c20707d594fa5c2138c2550", "3f07c65bfb42c81768b1f5ad9611d1ce",
+      "665d9547171c99faba95ac81a35c9a0c", "1b5d032e0cefdb4041ad51796de8a45e",
+      "18fa974579a4f1ff8cd7df664fc339d5", "2ffaa4f143495ff73c06a580a97b6321",
+      "4fd1f562bc47a68dbfaf7c566c7c4da6", "4d37c80c9caf110c1d3d20bd1a1875b3",
+      "8ea29759640962613166dc5154837d14", "5ca4c10f42d0906c72ebee90fae6ce7d",
+  };
+
+  switch (bitdepth) {
+    case 8:
+      return kTestDigestsChromaV8bpp[param_index];
+    case 10:
+      return kTestDigestsChromaV10bpp[param_index];
+    case 12:
+      return kTestDigestsChromaV12bpp[param_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+const char* GetARTestDigestLuma(int bitdepth, int coeff_lag, int param_index) {
+  static const char* const kTestDigestsLuma8bpp[3][kNumFilmGrainTestParams] = {
+      {"a835127918f93478b45f1ba4d20d81bd", "a835127918f93478b45f1ba4d20d81bd",
+       "e5db4da626e214bb17bcc7ecffa76303", "a835127918f93478b45f1ba4d20d81bd",
+       "a835127918f93478b45f1ba4d20d81bd", "e5db4da626e214bb17bcc7ecffa76303",
+       "a835127918f93478b45f1ba4d20d81bd", "1da62b7233de502123a18546b6c97da2",
+       "1da62b7233de502123a18546b6c97da2", "1da62b7233de502123a18546b6c97da2"},
+      {"11464b880de3ecd6e6189c5c4e7f9b28", "dfe411762e283b5f49bece02ec200951",
+       "5c534d92afdf0a5b53dbe4fe7271929c", "2e1a68a18aca96c31320ba7ceab59be9",
+       "584c0323e6b276cb9acb1a294d462d58", "9571eb8f1cbaa96ea3bf64a820a8d9f0",
+       "305285ff0df87aba3c59e3fc0818697d", "0066d35c8818cf20230114dcd3765a4d",
+       "0066d35c8818cf20230114dcd3765a4d", "16d61b046084ef2636eedc5a737cb6f6"},
+      {"0c9e2cf1b6c3cad0f7668026e8ea0516", "7d094855292d0eded9e0d1b5bab1990b",
+       "fbf28860a5f1285dcc6725a45256a86a", "dccb906904160ccabbd2c9a7797a4bf9",
+       "46f645e17f08a3260b1ae70284e5c5b8", "124fdc90bed11a7320a0cbdee8b94400",
+       "8d2978651dddeaef6282191fa146f0a0", "28b4d5aa33f05b3fb7f9323a11936bdc",
+       "6a8ea684f6736a069e3612d1af6391a8", "2781ea40a63704dbfeb3a1ac5db6f2fc"},
+  };
+
+  static const char* const kTestDigestsLuma10bpp[3][kNumFilmGrainTestParams] = {
+      {"5e6bc8444ece2d38420f51d82238d812", "5e6bc8444ece2d38420f51d82238d812",
+       "2bfaec768794af33d60a9771f971f68d", "5e6bc8444ece2d38420f51d82238d812",
+       "5e6bc8444ece2d38420f51d82238d812", "c880807a368c4e82c23bea6f035ad23f",
+       "5e6bc8444ece2d38420f51d82238d812", "c576667da5286183ec3aab9a76f53a2e",
+       "c576667da5286183ec3aab9a76f53a2e", "c576667da5286183ec3aab9a76f53a2e"},
+      {"095c2dd4d4d52aff9696df9bfdb70062", "983d14afa497060792d472a449a380c7",
+       "c5fdc0f7c594b2b36132cec6f45a79bd", "acff232ac5597c1712213150552281d1",
+       "4dd7341923b1d260092853553b6b6246", "0ca8afd71a4f564ea1ce69c4af14e9ab",
+       "9bc7565e5359d09194fcee28e4bf7b94", "6fea7805458b9d149f238a30e2dc3f13",
+       "6fea7805458b9d149f238a30e2dc3f13", "681dff5fc7a7244ba4e4a582ca7ecb14"},
+      {"cb99352c9c6300e7e825188bb4adaee0", "7e40674de0209bd72f8e9c6e39ee6f7c",
+       "3e475572f6b4ecbb2730fd16751ad7ed", "e6e4c63abc9cb112d9d1f23886cd1415",
+       "1a1c953b175c105c604902877e2bab18", "380a53072530223d4ee622e014ee4bdb",
+       "6137394ea1172fb7ea0cbac237ff1703", "85ab0c813e46f97cb9f42542f44c01ad",
+       "68c8ac462f0e28cb35402c538bee32f1", "0038502ffa4760c8feb6f9abd4de7250"},
+  };
+
+  static const char* const kTestDigestsLuma12bpp[3][kNumFilmGrainTestParams] = {
+      {"d618bbb0e337969c91b1805f39561520", "d618bbb0e337969c91b1805f39561520",
+       "678f6e911591daf9eca4e305dabdb2b3", "d618bbb0e337969c91b1805f39561520",
+       "d618bbb0e337969c91b1805f39561520", "3b26f49612fd587c7360790d40adb5de",
+       "d618bbb0e337969c91b1805f39561520", "33f77d3ff50cfc64c6bc9a896b567377",
+       "33f77d3ff50cfc64c6bc9a896b567377", "33f77d3ff50cfc64c6bc9a896b567377"},
+      {"362fd67050fb7abaf57c43a92d993423", "e014ae0eb9e697281015c38905cc46ef",
+       "82b867e57151dc08afba31eccf5ccf69", "a94ba736cdce7bfa0b550285f59e47a9",
+       "3f1b0b7dd3b10e322254d35e4e185b7c", "7929708e5f017d58c53513cb79b35fda",
+       "6d26d31a091cbe642a7070933bd7de5a", "dc29ac40a994c0a760bfbad0bfc15b3a",
+       "dc29ac40a994c0a760bfbad0bfc15b3a", "399b919db5190a5311ce8d166580827b"},
+      {"6116d1f569f5b568eca4dc1fbf255086", "7e9cf31ea74e8ea99ffd12094ce6cd05",
+       "bb982c4c39e82a333d744defd16f4388", "7c6e584b082dc6b97ed0d967def3993f",
+       "fb234695353058f03c8e128f2f8de130", "9218c6ca67bf6a9237f98aa1ce7acdfd",
+       "d1fb834bbb388ed066c5cbc1c79b5bdf", "d6f630daedc08216fcea12012e7408b5",
+       "dd7fe49299e6f113a98debc7411c8db8", "8b89e45a5101a28c24209ae119eafeb8"},
+  };
+
+  switch (bitdepth) {
+    case 8:
+      return kTestDigestsLuma8bpp[coeff_lag - 1][param_index];
+    case 10:
+      return kTestDigestsLuma10bpp[coeff_lag - 1][param_index];
+    case 12:
+      return kTestDigestsLuma12bpp[coeff_lag - 1][param_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+const char* GetARTestDigestChromaU(int bitdepth, int coeff_lag,
+                                   int subsampling_x, int subsampling_y) {
+  static const char* const kTestDigestsChromaU8bpp[12] = {
+      "11ced66de0eaf55c1ff9bad18d7b8ed7", "0c3b77345dd4ab0915ef53693ab93ce4",
+      "b0645044ba080b3ceb8f299e269377d6", "50590ad5d895f0b4bc6694d878e9cd32",
+      "85e1bf3741100135062f5b4abfe7639b", "76955b70dde61ca5c7d079c501b90906",
+      "3f0995e1397fd9efd9fc46b67f7796b3", "0a0d6c3e4e1649eb101395bc97943a07",
+      "1878855ed8db600ccae1d39abac52ec6", "13ab2b28320ed3ac2b820f08fdfd424d",
+      "f3e95544a86ead5387e3dc4e043fd0f0", "ff8f5d2d97a6689e16a7e4f482f69f0b",
+  };
+
+  static const char* const kTestDigestsChromaU10bpp[12] = {
+      "707f2aa5aa7e77bc6e83ab08287d748d", "0bcf40c7fead9ac3a5d71b4cc1e21549",
+      "0c1df27053e5da7cf1276a122a8f4e8b", "782962f7425eb38923a4f87e7ab319d9",
+      "b4a709ae5967afef55530b9ea8ef0062", "70a971a0b9bf06212d510b396f0f9095",
+      "d033b89d6e31f8b13c83d94c840b7d54", "40bbe804bf3f90cee667d3b275e3c964",
+      "90bb2b9d518b945adcfd1b1807f7d170", "4bc34aa157fe5ad4270c611afa75e878",
+      "e2688d7286cd43fe0a3ea734d2ad0f77", "853193c4981bd882912171061327bdf2",
+  };
+
+  static const char* const kTestDigestsChromaU12bpp[12] = {
+      "04c23b01d01c0e3f3247f3741581b383", "9f8ea1d66e44f6fe93d765ce56b2b0f3",
+      "5dda44b128d6c244963f1e8e17cc1d22", "9dd0a79dd2f772310a95762d445bface",
+      "0dbd40d930e4873d72ea72b9e3d62440", "d7d83c207c6b435a164206d5f457931f",
+      "e8d04f6e63ed63838adff965275a1ff1", "fc09a903e941fcff8bad67a84f705775",
+      "9cd706606a2aa40d0957547756f7abd9", "258b37e7b8f48db77dac7ea24073fe69",
+      "80149b8bb05308da09c1383d8b79d3da", "e993f3bffae53204a1942feb1af42074",
+  };
+
+  assert(!(subsampling_x == 0 && subsampling_y == 1));
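+  // Only the 4:4:4 (0, 0), 4:2:2 (1, 0) and 4:2:0 (1, 1) subsampling pairs
+  // occur, so subsampling_x + subsampling_y maps each pair to 0, 1 or 2, and
+  // each coeff_lag in [0, 3] selects a group of three digests.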
+  const int base_index = 3 * coeff_lag + subsampling_x + subsampling_y;
+  switch (bitdepth) {
+    case 8:
+      return kTestDigestsChromaU8bpp[base_index];
+    case 10:
+      return kTestDigestsChromaU10bpp[base_index];
+    case 12:
+      return kTestDigestsChromaU12bpp[base_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+const char* GetARTestDigestChromaV(int bitdepth, int coeff_lag,
+                                   int subsampling_x, int subsampling_y) {
+  static const char* const kTestDigestsChromaV8bpp[12] = {
+      "5c2179f3d93be0a0da75d2bb90347c2f", "79b883847d7eaa7890e1d633b8e34353",
+      "90ade818e55808e8cf58c11debb5ddd1", "1d0f2a14bc4df2b2a1abaf8137029f92",
+      "ac753a57ade140dccb50c14f941ae1fc", "d24ab497558f6896f08dc17bcc3c50c1",
+      "3d74436c63920022a95c85b234db4e33", "061c2d53ed84c830f454e395c362cb16",
+      "05d24869d7fb952e332457a114c8b9b7", "fcee31b87a2ada8028c2a975e094856a",
+      "c019e2c475737abcf9c2b2a52845c646", "9cd994baa7021f8bdf1d1c468c1c8e9c",
+  };
+
+  static const char* const kTestDigestsChromaV10bpp[12] = {
+      "bc9e44454a05cac8571c15af5b720e79", "f0374436698d94e879c03331b1f30df4",
+      "4580dd009abd6eeed59485057c55f63e", "7d1f7aecd45302bb461f4467f2770f72",
+      "1f0d003fce6c5fedc147c6112813f43b", "4771a45c2c1a04c375400619d5536035",
+      "df9cf619a78907c0f6e58bc13d7d5546", "dd3715ce65d905f30070a36977c818e0",
+      "32de5800f76e34c128a1d89146b4010b", "db9d7c70c3f69feb68fae04398efc773",
+      "d3d0912e3fdb956fef416a010bd7b4c2", "a2fca8abd9fd38d2eef3c4495d9eff78",
+  };
+
+  static const char* const kTestDigestsChromaV12bpp[12] = {
+      "0d1890335f4464167de22353678ca9c6", "9e6830aba73139407196f1c811f910bc",
+      "6018f2fb76bd648bef0262471cfeba5c", "78e1ae1b790d709cdb8997621cf0fde3",
+      "5b44ae281d7f9db2f17aa3c24b4741dd", "f931d16991669cb16721de87da9b8067",
+      "5580f2aed349d9cabdafb9fc25a57b1c", "86918cd78bf95e6d4405dd050f5890b8",
+      "13c8b314eeebe35fa60b703d94e1b2c1", "13c6fb75cab3f42e0d4ca31e4d068b0e",
+      "bb9ca0bd6f8cd67e44c8ac2803abf5a5", "0da4ea711ffe557bb66577392b6f148b",
+  };
+
+  assert(!(subsampling_x == 0 && subsampling_y == 1));
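+  // The index layout matches GetARTestDigestChromaU() above.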
+  const int base_index = 3 * coeff_lag + subsampling_x + subsampling_y;
+  switch (bitdepth) {
+    case 8:
+      return kTestDigestsChromaV8bpp[base_index];
+    case 10:
+      return kTestDigestsChromaV10bpp[base_index];
+    case 12:
+      return kTestDigestsChromaV12bpp[base_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+const char* GetGrainGenerationTestDigestLuma(int bitdepth, int param_index) {
+  static const char* const kTestDigestsLuma8bpp[kNumFilmGrainTestParams] = {
+      "c48babd99e5cfcbaa13d8b6e0c12e644", "da4b971d2de19b709e2bc98d2e50caf3",
+      "96c72faac19a79c138afeea8b8ae8c7a", "90a2b9c8304a44d14e83ca51bfd2fe8a",
+      "72bd3aa85c17850acb430afb4183bf1a", "a0acf76349b9efbc9181fc31153d9ef6",
+      "6da74dd631a4ec8b9372c0bbec22e246", "6e11fa230f0e5fbb13084255c22cabf9",
+      "be1d257b762f9880d81680e9325932a2", "37e302075af8130b371de4430e8a22cf",
+  };
+
+  static const char* const kTestDigestsLuma10bpp[kNumFilmGrainTestParams] = {
+      "0a40fd2f261095a6154584a531328142", "9d0c8173a94a0514c769e94b6f254030",
+      "7894e959fdd5545895412e1512c9352d", "6802cad2748cf6db7f66f53807ee46ab",
+      "ea24e962b98351c3d929a8ae41e320e2", "b333dc944274a3a094073889ca6e11d6",
+      "7211d7ac0ff7d11b5ef1538c0d98f43d", "ef9f9cbc101a07da7bfa62637130e331",
+      "85a122e32648fde84b883a1f98947c60", "dee656e3791138285bc5b71e3491a177",
+  };
+
+  static const char* const kTestDigestsLuma12bpp[kNumFilmGrainTestParams] = {
+      "ae359794b5340d073d597117046886ac", "4d4ad3908b4fb0f248a0086537dd6b1e",
+      "672a97e15180cbeeaf76d763992c9f23", "739124d10d16e00a158e833ea92107bc",
+      "4c38c738ff7ffc50adaa4474584d3aae", "ca05ba7e51000a7d10e5cbb2101bbd86",
+      "e207022b916bf03a76ac8742af29853d", "7454bf1859149237ff74f1161156c857",
+      "10fc2a16e663bbc305255b0883cfcd45", "4228abff6899bb33839b579288ab29fe",
+  };
+
+  switch (bitdepth) {
+    case 8:
+      return kTestDigestsLuma8bpp[param_index];
+    case 10:
+      return kTestDigestsLuma10bpp[param_index];
+    case 12:
+      return kTestDigestsLuma12bpp[param_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+const char* GetConstructStripesTestDigest(int bitdepth, int overlap_flag,
+                                          int subsampling_x,
+                                          int subsampling_y) {
+  static const char* const kTestDigests8bpp[6] = {
+      "cd14aaa6fc1728290fa75772730a2155", "13ad4551feadccc3a3a9bd5e25878d2a",
+      "ed6ad9532c96ef0d79ff3228c89a429f", "82f307a7f5fc3308c3ebe268b5169e70",
+      "aed793d525b85349a8c2eb6d40e93969", "311c3deb727621a7d4f18e8defb65de7",
+  };
+
+  static const char* const kTestDigests10bpp[6] = {
+      "4fe2fa1e428737de3595be3a097d0203", "80568c3c3b53bdbbd03b820179092dcd",
+      "bc7b73099961a0739c36e027d6d09ea1", "e5331364e5146a6327fd94e1467f59a3",
+      "125bf18b7787e8f0792ea12f9210de0d", "21cf98cbce17eca77dc150cc9be0e0a0",
+  };
+
+  static const char* const kTestDigests12bpp[6] = {
+      "57f8e17078b6e8935252e918a2562636", "556a7b294a99bf1163b7166b4f68357e",
+      "249bee5572cd7d1cc07182c97adc4ba7", "9bf43ae1998c2a5b2e5f4d8236b58747",
+      "477c08fa26499936e5bb03bde097633e", "fe64b7166ff87ea0711ae4f519cadd59",
+  };
+
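+  // Two overlap_flag settings times three subsampling pairs give the six
+  // digests per bitdepth.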
+  const int base_index = 3 * overlap_flag + subsampling_x + subsampling_y;
+  switch (bitdepth) {
+    case 8:
+      return kTestDigests8bpp[base_index];
+    case 10:
+      return kTestDigests10bpp[base_index];
+    case 12:
+      return kTestDigests12bpp[base_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+const char* GetConstructImageTestDigest(int bitdepth, int overlap_flag,
+                                        int subsampling_x, int subsampling_y) {
+  static const char* const kTestDigests8bpp[6] = {
+      "17030fc692e685557a3717f9334af7e8", "d16ea46147183cd7bc36bcfc2f936a5b",
+      "68152958540dbec885f71e3bcd7aa088", "bb43b420f05a122eb4780aca06055ab1",
+      "87567b04fbdf64f391258c0742de266b", "ce87d556048b3de32570faf6729f4010",
+  };
+
+  static const char* const kTestDigests10bpp[6] = {
+      "5b31b29a5e22126a9bf8cd6a01645777", "2bb94a25164117f2ab18dae18e2c6577",
+      "27e57a4ed6f0c9fe0a763a03f44805e8", "481642ab0b07437b76b169aa4eb82123",
+      "656a9ef056b04565bec9ca7e0873c408", "a70fff81ab28d02d99dd4f142699ba39",
+  };
+
+  static const char* const kTestDigests12bpp[6] = {
+      "146f7ceadaf77e7a3c41e191a58c1d3c", "de18526db39630936733e687cdca189e",
+      "165c96ff63bf3136505ab1d239f7ceae", "a102636662547f84e5f6fb6c3e4ef959",
+      "4cb073fcc783c158a95c0b1ce0d27e9f", "3a734c71d4325a7da53e2a6e00f81647",
+  };
+
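+  // Same index layout as GetConstructStripesTestDigest() above.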
+  const int base_index = 3 * overlap_flag + subsampling_x + subsampling_y;
+  switch (bitdepth) {
+    case 8:
+      return kTestDigests8bpp[base_index];
+    case 10:
+      return kTestDigests10bpp[base_index];
+    case 12:
+      return kTestDigests12bpp[base_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+const char* GetScalingInitTestDigest(int param_index, int bitdepth) {
+  static const char* const kTestDigests8bpp[kNumFilmGrainTestParams] = {
+      "315202ca3bf9c46eac8605e89baffd2a", "640f6408702b07ab7e832e7326cce56f",
+      "f75ee83e3912a3f25949e852d67326cf", "211223f5d6a4b42a8e3c662f921b71c0",
+      "f75ee83e3912a3f25949e852d67326cf", "e7a1de8c5a2cac2145c586ecf1f9051c",
+      "e7a1de8c5a2cac2145c586ecf1f9051c", "276fe5e3b30b2db2a9ff798eb6cb8e00",
+      "ac67f1c3aff2f50ed4b1975bde67ffe3", "8db6145a60d506cc94f07cef8b27c681",
+  };
+
+  static const char* const kTestDigests10bpp[kNumFilmGrainTestParams] = {
+      "c50be59c62b634ff45ddfbe5b978adfc", "7626286109a2a1eaf0a26f6b2bbab9aa",
+      "f2302988140c47a0724fc55ff523b6ec", "5318e33d8a59a526347ffa6a72ba6ebd",
+      "f2302988140c47a0724fc55ff523b6ec", "f435b5fe98e9d8b6c61fa6f457601c2c",
+      "f435b5fe98e9d8b6c61fa6f457601c2c", "ff07a2944dbe094d01e199098764941c",
+      "11b3e256c74cee2b5679f7457793869a", "89fab5c1db09e242d0494d1c696a774a",
+  };
+
+  static const char* const kTestDigests12bpp[kNumFilmGrainTestParams] = {
+      "1554df49a863a851d146213e09d311a4", "84808c3ed3b5495a62c9d2dd9a08cb26",
+      "bb31f083a3bd9ef26587478b8752f280", "34fdfe61d6871e4882e38062a0725c5c",
+      "bb31f083a3bd9ef26587478b8752f280", "e7b8c3e4508ceabe89b78f10a9e160b8",
+      "e7b8c3e4508ceabe89b78f10a9e160b8", "a0ccc9e3d0f0c9d1f08f1249264d92f5",
+      "7992a96883c8a9a35d6ca8961bc4515b", "de906ce2c0fceed6f168215447b21b16",
+  };
+
+  switch (bitdepth) {
+    case 8:
+      return kTestDigests8bpp[param_index];
+    case 10:
+      return kTestDigests10bpp[param_index];
+    case 12:
+      return kTestDigests12bpp[param_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+const char* GetBlendLumaTestDigest(int bitdepth) {
+  static const char* const kTestDigests[] = {
+      "de35b16c702690b1d311cdd0973835d7",
+      "60e9f24dcaaa0207a8db5ab5f3c66608",
+      "8e7d44b620bb7768459074be6bfbca7b",
+  };
+
+  assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
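+  // (bitdepth - 8) / 2 maps bitdepths 8, 10 and 12 to indices 0, 1 and 2.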
+  return kTestDigests[(bitdepth - 8) / 2];
+}
+
+const char* GetBlendChromaUTestDigest(int bitdepth,
+                                      int chroma_scaling_from_luma,
+                                      int subsampling_x, int subsampling_y) {
+  static const char* const kTestDigests8bpp[6] = {
+      "36ca194734d45e75079baba1f3ec9e9e", "182b388061f59fd3e24ef4581c536e67",
+      "2e7843b4c624f03316c3cbe1cc835859", "39e6d9606915da6a41168fbb006b55e4",
+      "3f44a4e252d4823544ac66a900dc7983", "1860f0831841f262d66b23f6a6b5833b",
+  };
+
+  static const char* const kTestDigests10bpp[6] = {
+      "2054665564f55750c9588b505eb01ac0", "4d8b0e248f8a6bfc72516aa164e76b0b",
+      "7e549800a4f9fff6833bb7738e272baf", "8de6f30dcda99a37b359fd815e62d2f7",
+      "9b7958a2278a16bce2b7bc31fdd811f5", "c5c3c8cccf6a2b4e40b4a412a5bf4f08",
+  };
+
+  static const char* const kTestDigests12bpp[6] = {
+      "8fad0cc641da35e0d2d8f178c7ce8394", "793eb9d2e6b4ea2e3bb08e7068236155",
+      "9156bd85ab9493d8867a174f920bb1e6", "6834319b4c88e3e0c96b6f8d7efd08dd",
+      "c40e492790d3803a734efbc6feca46e2", "d884c3b1e2c21d98844ca7639e0599a5",
+  };
+
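+  // Two chroma_scaling_from_luma settings times three subsampling pairs give
+  // the six digests per bitdepth.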
+  const int base_index =
+      3 * chroma_scaling_from_luma + subsampling_x + subsampling_y;
+  switch (bitdepth) {
+    case 8:
+      return kTestDigests8bpp[base_index];
+    case 10:
+      return kTestDigests10bpp[base_index];
+    case 12:
+      return kTestDigests12bpp[base_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+const char* GetBlendChromaVTestDigest(int bitdepth,
+                                      int chroma_scaling_from_luma,
+                                      int subsampling_x, int subsampling_y) {
+  static const char* const kTestDigests8bpp[6] = {
+      "9a353e4f86d7ebaa980f7f6cfc0995ad", "17589b4039ed49ba16f32db9fae724b7",
+      "76ae8bed48a173b548993b6e1824ff67", "c1458ac9bdfbf0b4d6a175343b17b27b",
+      "fa76d1c8e48957537f26af6a5b54ec14", "313fe3c34568b7f9c5ecb09d419d4ba4",
+  };
+
+  static const char* const kTestDigests10bpp[6] = {
+      "8ab5a8e03f07547260033d6a0b689e3c", "275ede58d311e2f5fd76f222f45a64fc",
+      "ce13916e0f7b02087fd0356534d32770", "165bfc8cda0266936a67fa4ec9b215cb",
+      "ed4382caa936acf1158ff8049d18ffac", "942bdd1344c9182dd7572099fb9372db",
+  };
+
+  static const char* const kTestDigests12bpp[6] = {
+      "70704a1e171a3a70d40b7d0037a75fbc", "62549e2afbf36a1ed405a6574d39c542",
+      "e93889927ab77c6e0767ff071d980c02", "a0c1f6ed78874137710fee7418d80959",
+      "f6283e36a25cb867e30bdf0bfdb2124b", "741c2d48898835b9d9e3bd0b6ac6269a",
+  };
+
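+  // Same index layout as GetBlendChromaUTestDigest() above.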
+  const int base_index =
+      3 * chroma_scaling_from_luma + subsampling_x + subsampling_y;
+  switch (bitdepth) {
+    case 8:
+      return kTestDigests8bpp[base_index];
+    case 10:
+      return kTestDigests10bpp[base_index];
+    case 12:
+      return kTestDigests12bpp[base_index];
+    default:
+      assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+      return nullptr;
+  }
+}
+
+// GetFilmGrainRandomNumber() is only invoked with |bits| equal to 11 or 8. Test
+// both values of |bits|.
+TEST(FilmGrainTest, GetFilmGrainRandomNumber) {
+  uint16_t seed = 51968;
+  const struct {
+    int rand;
+    uint16_t seed;
+  } kExpected11[5] = {
+      {812, 25984}, {406, 12992}, {1227, 39264}, {1637, 52400}, {818, 26200},
+  };
+  for (int i = 0; i < 5; ++i) {
+    int rand = GetFilmGrainRandomNumber(11, &seed);
+    EXPECT_EQ(rand, kExpected11[i].rand) << "i = " << i;
+    EXPECT_EQ(seed, kExpected11[i].seed) << "i = " << i;
+  }
+  const struct {
+    int rand;
+    uint16_t seed;
+  } kExpected8[5] = {
+      {179, 45868}, {89, 22934}, {44, 11467}, {150, 38501}, {75, 19250},
+  };
+  for (int i = 0; i < 5; ++i) {
+    int rand = GetFilmGrainRandomNumber(8, &seed);
+    EXPECT_EQ(rand, kExpected8[i].rand) << "i = " << i;
+    EXPECT_EQ(seed, kExpected8[i].seed) << "i = " << i;
+  }
+}
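+
+// Executable documentation: a minimal sketch of the 16-bit LFSR update that
+// GetFilmGrainRandomNumber() performs, per the AV1 spec. The lambda below is
+// illustrative rather than part of the library API; it is cross-checked
+// against the first expected pair above (seed 51968 -> rand 812, seed 25984).
+TEST(FilmGrainTest, GetFilmGrainRandomNumberLfsrSketch) {
+  const auto lfsr = [](int bits, uint16_t* seed) {
+    uint16_t s = *seed;
+    // XOR taps at register bits 0, 1, 3 and 12 form the new high bit.
+    const auto bit =
+        static_cast<uint16_t>((s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1);
+    s = static_cast<uint16_t>((s >> 1) | (bit << 15));
+    *seed = s;
+    // The result is the top |bits| bits of the updated register.
+    return static_cast<int>((s >> (16 - bits)) & ((1 << bits) - 1));
+  };
+  uint16_t seed = 51968;
+  EXPECT_EQ(lfsr(11, &seed), 812);
+  EXPECT_EQ(seed, 25984);
+}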
+
+// In FilmGrainParams, if num_u_points and num_v_points are both 0 and
+// chroma_scaling_from_luma is false, GenerateChromaGrains() should set both
+// the u_grain and v_grain arrays to all zeros.
+TEST(FilmGrainTest, GenerateZeroChromaGrains) {
+  FilmGrainParams film_grain_params = {};
+  film_grain_params.apply_grain = true;
+  film_grain_params.update_grain = true;
+  film_grain_params.chroma_scaling = 8;
+  film_grain_params.auto_regression_shift = 6;
+  film_grain_params.grain_seed = 51968;
+
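+  // 73 * 82 matches the luma grain block size and upper-bounds any chroma
+  // block; the 44 x 38 region exercised below corresponds to 4:2:0
+  // subsampling.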
+  int8_t u_grain[73 * 82];
+  int8_t v_grain[73 * 82];
+  const int chroma_width = 44;
+  const int chroma_height = 38;
+
+  // Initialize u_grain and v_grain with arbitrary nonzero values.
+  memset(u_grain, 1, sizeof(u_grain));
+  memset(v_grain, 2, sizeof(v_grain));
+  for (int y = 0; y < chroma_height; ++y) {
+    for (int x = 0; x < chroma_width; ++x) {
+      EXPECT_NE(u_grain[y * chroma_width + x], 0);
+      EXPECT_NE(v_grain[y * chroma_width + x], 0);
+    }
+  }
+
+  FilmGrain<8>::GenerateChromaGrains(film_grain_params, chroma_width,
+                                     chroma_height, u_grain, v_grain);
+
+  for (int y = 0; y < chroma_height; ++y) {
+    for (int x = 0; x < chroma_width; ++x) {
+      EXPECT_EQ(u_grain[y * chroma_width + x], 0);
+      EXPECT_EQ(v_grain[y * chroma_width + x], 0);
+    }
+  }
+}
+
+// First parameter is coefficient lag. Second parameter is the index into
+// |kFilmGrainParams|.
+template <int bitdepth>
+class AutoRegressionTestLuma
+    : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  AutoRegressionTestLuma() {
+    FilmGrainInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    const int index = std::get<0>(GetParam()) - 1;
+    base_luma_auto_regression_func_ =
+        dsp->film_grain.luma_auto_regression[index];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_luma_auto_regression_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+      FilmGrainInit_NEON();
+#endif
+    }
+    luma_auto_regression_func_ = dsp->film_grain.luma_auto_regression[index];
+  }
+
+ protected:
+  // |compare| determines whether to compare the output blocks from the SIMD
+  // implementation, if used, and the C implementation.
+  // |saturate| determines whether to set the inputs to maximum values. This is
+  // intended primarily as a way to simplify differences in output when
+  // debugging.
+  void TestAutoRegressiveFilterLuma(int coeff_lag, int param_index,
+                                    int num_runs, bool saturate, bool compare);
+  LumaAutoRegressionFunc luma_auto_regression_func_;
+  LumaAutoRegressionFunc base_luma_auto_regression_func_;
+  GrainType luma_block_buffer_[kLumaBlockSize];
+  GrainType base_luma_block_buffer_[kLumaBlockSize];
+};
+
+// First parameter is coefficient lag. Second parameter is the index into
+// |kFilmGrainParams|.
+template <int bitdepth>
+void AutoRegressionTestLuma<bitdepth>::TestAutoRegressiveFilterLuma(
+    int coeff_lag, int param_index, int num_runs, bool saturate, bool compare) {
+  if (luma_auto_regression_func_ == nullptr) return;
+  // Compare is only needed for NEON tests to compare with C output.
+  if (base_luma_auto_regression_func_ == nullptr && compare) return;
+  FilmGrainParams params = kFilmGrainParams[param_index];
+  params.auto_regression_coeff_lag = coeff_lag;
+  const int grain_max = GetGrainMax<bitdepth>();
+  for (int y = 0; y < kLumaHeight; ++y) {
+    for (int x = 0; x < kLumaWidth; ++x) {
+      if (saturate) {
+        luma_block_buffer_[y * kLumaWidth + x] = grain_max;
+      } else {
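+        // Fill with a deterministic diagonal gradient centered on the block,
+        // scaled up for higher bitdepths.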
+        luma_block_buffer_[y * kLumaWidth + x] =
+            std::min(x - (kLumaWidth >> 1), y - (kLumaHeight >> 1)) *
+            (1 << (bitdepth - 8));
+      }
+    }
+  }
+
+  if (saturate) {
+    memset(params.auto_regression_coeff_y, 127,
+           sizeof(params.auto_regression_coeff_y));
+  }
+  if (compare) {
+    memcpy(base_luma_block_buffer_, luma_block_buffer_,
+           sizeof(luma_block_buffer_));
+  }
+
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    luma_auto_regression_func_(params, luma_block_buffer_);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  if (num_runs > 1) {
+    printf("AutoRegressionLuma lag=%d, param_index=%d: %d us\n", coeff_lag,
+           param_index,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+    return;
+  }
+  if (compare) {
+    base_luma_auto_regression_func_(params, base_luma_block_buffer_);
+    EXPECT_TRUE(test_utils::CompareBlocks(
+        luma_block_buffer_, base_luma_block_buffer_, kLumaWidth, kLumaHeight,
+        kLumaWidth, kLumaWidth, false));
+  } else {
+    test_utils::CheckMd5Digest(
+        "FilmGrain",
+        absl::StrFormat("AutoRegressionLuma lag=%d, param_index=%d", coeff_lag,
+                        param_index)
+            .c_str(),
+        GetARTestDigestLuma(bitdepth, coeff_lag, param_index),
+        luma_block_buffer_, sizeof(luma_block_buffer_), elapsed_time);
+  }
+}
+
+using AutoRegressionTestLuma8bpp = AutoRegressionTestLuma<8>;
+
+TEST_P(AutoRegressionTestLuma8bpp, AutoRegressiveFilterLuma) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1, /*saturate=*/false,
+                               /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestLuma8bpp, AutoRegressiveFilterLumaSaturated) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1, /*saturate=*/true,
+                               /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestLuma8bpp, DISABLED_Speed) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1e5,
+                               /*saturate=*/false, /*compare=*/false);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using AutoRegressionTestLuma10bpp = AutoRegressionTestLuma<10>;
+
+TEST_P(AutoRegressionTestLuma10bpp, AutoRegressiveFilterLuma) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1, /*saturate=*/false,
+                               /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestLuma10bpp, AutoRegressiveFilterLumaSaturated) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1, /*saturate=*/true,
+                               /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestLuma10bpp, DISABLED_Speed) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1e5,
+                               /*saturate=*/false, /*compare=*/false);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using AutoRegressionTestLuma12bpp = AutoRegressionTestLuma<12>;
+
+TEST_P(AutoRegressionTestLuma12bpp, AutoRegressiveFilterLuma) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1, /*saturate=*/false,
+                               /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestLuma12bpp, AutoRegressiveFilterLumaSaturated) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1, /*saturate=*/true,
+                               /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestLuma12bpp, DISABLED_Speed) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1e5,
+                               /*saturate=*/false, /*compare=*/false);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AutoRegressionTestLuma8bpp,
+    testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+                     testing::Range(0, 10) /* param_index */));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AutoRegressionTestLuma8bpp,
+    testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+                     testing::Range(0, 10) /* param_index */));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(
+    C, AutoRegressionTestLuma10bpp,
+    testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+                     testing::Range(0, 10) /* param_index */));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AutoRegressionTestLuma10bpp,
+    testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+                     testing::Range(0, 10) /* param_index */));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(
+    C, AutoRegressionTestLuma12bpp,
+    testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+                     testing::Range(0, 10) /* param_index */));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+struct AutoRegressionChromaTestParam {
+  explicit AutoRegressionChromaTestParam(const std::tuple<int, int>& in)
+      : coeff_lag(std::get<0>(in)) {
+    switch (std::get<1>(in)) {
+      case 0:
+        subsampling_x = 0;
+        subsampling_y = 0;
+        break;
+      case 1:
+        subsampling_x = 1;
+        subsampling_y = 0;
+        break;
+      default:
+        assert(std::get<1>(in) == 2);
+        subsampling_x = 1;
+        subsampling_y = 1;
+    }
+  }
+  const int coeff_lag;
+  int subsampling_x;
+  int subsampling_y;
+};
+
+template <int bitdepth>
+class AutoRegressionTestChroma
+    : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  AutoRegressionTestChroma() {
+    AutoRegressionChromaTestParam test_param(GetParam());
+    FilmGrainInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    // This test suite does not cover num_y_points == 0. This should be covered
+    // in the test of the full synthesis process.
+    base_chroma_auto_regression_func_ =
+        dsp->film_grain.chroma_auto_regression[1][test_param.coeff_lag];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_chroma_auto_regression_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+      FilmGrainInit_NEON();
+#endif
+    }
+    chroma_auto_regression_func_ =
+        dsp->film_grain.chroma_auto_regression[1][test_param.coeff_lag];
+  }
+
+  ~AutoRegressionTestChroma() override = default;
+
+ protected:
+  // |compare| determines whether to compare the output blocks from the SIMD
+  // implementation, if used, and the C implementation.
+  // |saturate| determines whether to set the inputs to maximum values. This is
+  // intended primarily as a way to simplify differences in output when
+  // debugging.
+  void TestAutoRegressiveFilterChroma(int coeff_lag, int subsampling_x,
+                                      int subsampling_y, int num_runs,
+                                      bool saturate, bool compare);
+  ChromaAutoRegressionFunc chroma_auto_regression_func_;
+  ChromaAutoRegressionFunc base_chroma_auto_regression_func_;
+  GrainType luma_block_buffer_[kLumaBlockSize];
+  GrainType u_block_buffer_[kChromaBlockSize];
+  GrainType v_block_buffer_[kChromaBlockSize];
+  GrainType base_u_block_buffer_[kChromaBlockSize];
+  GrainType base_v_block_buffer_[kChromaBlockSize];
+};
+
+template <int bitdepth>
+void AutoRegressionTestChroma<bitdepth>::TestAutoRegressiveFilterChroma(
+    int coeff_lag, int subsampling_x, int subsampling_y, int num_runs,
+    bool saturate, bool compare) {
+  if (chroma_auto_regression_func_ == nullptr) return;
+  // Compare is only needed for NEON tests to compare with C output.
+  if (base_chroma_auto_regression_func_ == nullptr && compare) return;
+
+  // This function takes its baseline settings from the first set of sampled
+  // params; the test param generators supply coeff_lag and the subsampling
+  // values for coverage.
+  FilmGrainParams params = kFilmGrainParams[0];
+  params.auto_regression_coeff_lag = coeff_lag;
+  const int grain_max = GetGrainMax<bitdepth>();
+  const int grain_min = GetGrainMin<bitdepth>();
+  const int chroma_width =
+      (subsampling_x != 0) ? kMinChromaWidth : kMaxChromaWidth;
+  const int chroma_height =
+      (subsampling_y != 0) ? kMinChromaHeight : kMaxChromaHeight;
+  if (saturate) {
+    memset(params.auto_regression_coeff_u, 127,
+           sizeof(params.auto_regression_coeff_u));
+    memset(params.auto_regression_coeff_v, 127,
+           sizeof(params.auto_regression_coeff_v));
+    for (int y = 0; y < kLumaHeight; ++y) {
+      for (int x = 0; x < kLumaWidth; ++x) {
+        // This loop relies on the fact that kMaxChromaWidth == kLumaWidth.
+        luma_block_buffer_[y * kLumaWidth + x] = grain_max;
+        u_block_buffer_[y * kLumaWidth + x] = grain_max;
+        v_block_buffer_[y * kLumaWidth + x] = grain_max;
+      }
+    }
+  } else {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    // Allow any valid grain values.
+    const int random_range = grain_max - grain_min + 1;
+    for (int y = 0; y < kLumaHeight; ++y) {
+      for (int x = 0; x < kLumaWidth; ++x) {
+        // This loop relies on the fact that kMaxChromaWidth == kLumaWidth.
+        const int random_y = rnd(random_range);
+        luma_block_buffer_[y * kLumaWidth + x] = random_y + grain_min;
+        const int random_u = rnd(random_range);
+        u_block_buffer_[y * kLumaWidth + x] = random_u + grain_min;
+        const int random_v = rnd(random_range);
+        v_block_buffer_[y * kLumaWidth + x] = random_v + grain_min;
+      }
+    }
+  }
+  if (compare) {
+    memcpy(base_u_block_buffer_, u_block_buffer_, sizeof(u_block_buffer_));
+    memcpy(base_v_block_buffer_, v_block_buffer_, sizeof(v_block_buffer_));
+  }
+
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    chroma_auto_regression_func_(params, luma_block_buffer_, subsampling_x,
+                                 subsampling_y, u_block_buffer_,
+                                 v_block_buffer_);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  if (num_runs > 1) {
+    printf("AutoRegressionChroma lag=%d, sub_x=%d, sub_y=%d: %d us\n",
+           coeff_lag, subsampling_x, subsampling_y,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+    return;
+  }
+  if (compare) {
+    base_chroma_auto_regression_func_(params, luma_block_buffer_, subsampling_x,
+                                      subsampling_y, base_u_block_buffer_,
+                                      base_v_block_buffer_);
+    EXPECT_TRUE(test_utils::CompareBlocks(u_block_buffer_, base_u_block_buffer_,
+                                          chroma_width, chroma_height,
+                                          chroma_width, chroma_width, false));
+    EXPECT_TRUE(test_utils::CompareBlocks(v_block_buffer_, base_v_block_buffer_,
+                                          chroma_width, chroma_height,
+                                          chroma_width, chroma_width, false));
+  } else {
+    test_utils::CheckMd5Digest(
+        "FilmGrain",
+        absl::StrFormat("AutoRegressionChromaU lag=%d, sub_x=%d, sub_y=%d",
+                        coeff_lag, subsampling_x, subsampling_y)
+            .c_str(),
+        GetARTestDigestChromaU(bitdepth, coeff_lag, subsampling_x,
+                               subsampling_y),
+        u_block_buffer_, sizeof(u_block_buffer_), elapsed_time);
+    test_utils::CheckMd5Digest(
+        "FilmGrain",
+        absl::StrFormat("AutoRegressionChromaV lag=%d, sub_x=%d, sub_y=%d",
+                        coeff_lag, subsampling_x, subsampling_y)
+            .c_str(),
+        GetARTestDigestChromaV(bitdepth, coeff_lag, subsampling_x,
+                               subsampling_y),
+        v_block_buffer_, sizeof(v_block_buffer_), elapsed_time);
+  }
+}
+
+using AutoRegressionTestChroma8bpp = AutoRegressionTestChroma<8>;
+
+TEST_P(AutoRegressionTestChroma8bpp, AutoRegressiveFilterChroma) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+                                 test_param.subsampling_y, 1,
+                                 /*saturate=*/false,
+                                 /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestChroma8bpp, AutoRegressiveFilterChromaSaturated) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+                                 test_param.subsampling_y, 1, /*saturate=*/true,
+                                 /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestChroma8bpp, DISABLED_Speed) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(
+      test_param.coeff_lag, test_param.subsampling_x, test_param.subsampling_y,
+      // Subsampling halves each affected dimension of the chroma blocks, so
+      // run twice as many times per subsampled dimension to compensate.
+      1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)),
+      /*saturate=*/false, /*compare=*/false);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using AutoRegressionTestChroma10bpp = AutoRegressionTestChroma<10>;
+
+TEST_P(AutoRegressionTestChroma10bpp, AutoRegressiveFilterChroma) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+                                 test_param.subsampling_y, 1,
+                                 /*saturate=*/false,
+                                 /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestChroma10bpp, AutoRegressiveFilterChromaSaturated) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+                                 test_param.subsampling_y, 1, /*saturate=*/true,
+                                 /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestChroma10bpp, DISABLED_Speed) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(
+      test_param.coeff_lag, test_param.subsampling_x, test_param.subsampling_y,
+      // Subsampling halves each affected dimension of the chroma blocks, so
+      // run twice as many times per subsampled dimension to compensate.
+      1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)),
+      /*saturate=*/false, /*compare=*/false);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using AutoRegressionTestChroma12bpp = AutoRegressionTestChroma<12>;
+
+TEST_P(AutoRegressionTestChroma12bpp, AutoRegressiveFilterChroma) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+                                 test_param.subsampling_y, 1,
+                                 /*saturate=*/false,
+                                 /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestChroma12bpp, AutoRegressiveFilterChromaSaturated) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+                                 test_param.subsampling_y, 1, /*saturate=*/true,
+                                 /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestChroma12bpp, DISABLED_Speed) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(
+      test_param.coeff_lag, test_param.subsampling_x, test_param.subsampling_y,
+      // Subsampling halves each affected dimension of the chroma blocks, so
+      // run twice as many times per subsampled dimension to compensate.
+      1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)),
+      /*saturate=*/false, /*compare=*/false);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(C, AutoRegressionTestChroma8bpp,
+                         testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                                          testing::Range(0,
+                                                         3) /* subsampling */));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, AutoRegressionTestChroma10bpp,
+                         testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                                          testing::Range(0,
+                                                         3) /* subsampling */));
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, AutoRegressionTestChroma12bpp,
+                         testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                                          testing::Range(0,
+                                                         3) /* subsampling */));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AutoRegressionTestChroma8bpp,
+                         testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                                          testing::Range(0,
+                                                         3) /* subsampling */));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(NEON, AutoRegressionTestChroma10bpp,
+                         testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                                          testing::Range(0,
+                                                         3) /* subsampling */));
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+#endif  // LIBGAV1_ENABLE_NEON
+
+template <int bitdepth>
+class GrainGenerationTest : public testing::TestWithParam<int> {
+ protected:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  void TestGenerateGrainLuma(int param_index, int num_runs);
+
+  GrainType luma_block_buffer_[kLumaBlockSize];
+};
+
+template <int bitdepth>
+void GrainGenerationTest<bitdepth>::TestGenerateGrainLuma(int param_index,
+                                                          int num_runs) {
+  FilmGrainParams params = kFilmGrainParams[param_index];
+
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    FilmGrain<bitdepth>::GenerateLumaGrain(params, luma_block_buffer_);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  if (num_runs == 1) {
+    test_utils::CheckMd5Digest(
+        "FilmGrain",
+        absl::StrFormat("GenerateGrainLuma param_index=%d", param_index)
+            .c_str(),
+        GetGrainGenerationTestDigestLuma(bitdepth, param_index),
+        luma_block_buffer_, sizeof(luma_block_buffer_), elapsed_time);
+  } else {
+    printf("GenerateGrainLuma param_index=%d: %d us\n", param_index,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+}
+
+using GrainGenerationTest8bpp = GrainGenerationTest<8>;
+
+TEST_P(GrainGenerationTest8bpp, GenerateGrainLuma) {
+  TestGenerateGrainLuma(GetParam(), 1);
+}
+
+TEST_P(GrainGenerationTest8bpp, DISABLED_LumaSpeed) {
+  TestGenerateGrainLuma(GetParam(), 1e5);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using GrainGenerationTest10bpp = GrainGenerationTest<10>;
+
+TEST_P(GrainGenerationTest10bpp, GenerateGrainLuma) {
+  TestGenerateGrainLuma(GetParam(), 1);
+}
+
+TEST_P(GrainGenerationTest10bpp, DISABLED_LumaSpeed) {
+  TestGenerateGrainLuma(GetParam(), 1e5);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using GrainGenerationTest12bpp = GrainGenerationTest<12>;
+
+TEST_P(GrainGenerationTest12bpp, GenerateGrainLuma) {
+  TestGenerateGrainLuma(GetParam(), 1);
+}
+
+TEST_P(GrainGenerationTest12bpp, DISABLED_LumaSpeed) {
+  TestGenerateGrainLuma(GetParam(), 1e5);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest8bpp,
+                         testing::Range(0, 10) /* param_index */);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest10bpp,
+                         testing::Range(0, 10) /* param_index */);
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest12bpp,
+                         testing::Range(0, 10) /* param_index */);
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+// This param type is used for both ConstructStripesTest and
+// ConstructImageTest.
+struct ConstructNoiseTestParam {
+  explicit ConstructNoiseTestParam(const std::tuple<int, int>& in)
+      : overlap_flag(std::get<0>(in)) {
+    switch (std::get<1>(in)) {
+      case 0:
+        subsampling_x = 0;
+        subsampling_y = 0;
+        break;
+      case 1:
+        subsampling_x = 1;
+        subsampling_y = 0;
+        break;
+      default:
+        assert(std::get<1>(in) == 2);
+        subsampling_x = 1;
+        subsampling_y = 1;
+    }
+  }
+  const int overlap_flag;
+  int subsampling_x;
+  int subsampling_y;
+};
+
+template <int bitdepth>
+class ConstructStripesTest
+    : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  ConstructStripesTest() {
+    FilmGrainInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    base_construct_noise_stripes_func_ =
+        dsp->film_grain.construct_noise_stripes[std::get<0>(GetParam())];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_construct_noise_stripes_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+      FilmGrainInit_NEON();
+#endif
+    }
+    construct_noise_stripes_func_ =
+        dsp->film_grain.construct_noise_stripes[std::get<0>(GetParam())];
+  }
+
+  ~ConstructStripesTest() override = default;
+
+ protected:
+  // |compare| determines whether to compare the output blocks from the SIMD
+  // implementation, if used, and the C implementation.
+  // |saturate| determines whether to set the inputs to maximum values. This is
+  // intended primarily as a way to simplify differences in output when
+  // debugging.
+  void TestConstructNoiseStripes(int overlap_flag, int subsampling_x,
+                                 int subsampling_y, int num_runs, bool saturate,
+                                 bool compare);
+  ConstructNoiseStripesFunc construct_noise_stripes_func_;
+  ConstructNoiseStripesFunc base_construct_noise_stripes_func_;
+  GrainType grain_buffer_[kLumaBlockSize];
+  Array2DView<GrainType> noise_stripes_;
+  // Owns the memory that noise_stripes_ points to.
+  std::unique_ptr<GrainType[]> stripe_buffer_;
+  Array2DView<GrainType> base_noise_stripes_;
+  // Owns the memory that base_stripe_buffer_ points to.
+  std::unique_ptr<GrainType[]> base_stripe_buffer_;
+};
+
+template <int bitdepth>
+void ConstructStripesTest<bitdepth>::TestConstructNoiseStripes(
+    int overlap_flag, int subsampling_x, int subsampling_y, int num_runs,
+    bool saturate, bool compare) {
+  if (construct_noise_stripes_func_ == nullptr) return;
+  // |compare| is only meaningful for the NEON tests, which check the SIMD
+  // output against the C output.
+  if (base_construct_noise_stripes_func_ == nullptr && compare) return;
+
+  const int stripe_width = ((kFrameWidth + subsampling_x) >> subsampling_x);
+  const int stripe_height = kNoiseStripeHeight;
+  const int stripe_size = stripe_height * stripe_width;
+  const int stripe_buffer_size = stripe_size * kNumTestStripes;
+  if (compare) {
+    base_stripe_buffer_.reset(new (
+        std::nothrow) GrainType[stripe_buffer_size + kNoiseStripePadding]());
+    ASSERT_NE(base_stripe_buffer_, nullptr);
+    base_noise_stripes_.Reset(kNumTestStripes, stripe_size,
+                              base_stripe_buffer_.get());
+  }
+  stripe_buffer_.reset(
+      new (std::nothrow) GrainType[stripe_buffer_size + kNoiseStripePadding]());
+  ASSERT_NE(stripe_buffer_, nullptr);
+  noise_stripes_.Reset(kNumTestStripes, stripe_size, stripe_buffer_.get());
+
+  const int grain_max = GetGrainMax<bitdepth>();
+  const int grain_min = GetGrainMin<bitdepth>();
+  if (saturate) {
+    for (int y = 0; y < kLumaHeight; ++y) {
+      for (int x = 0; x < kLumaWidth; ++x) {
+        grain_buffer_[y * kLumaWidth + x] = grain_max;
+      }
+    }
+  } else {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    // Allow any valid grain values.
+    const int random_range = grain_max - grain_min + 1;
+    for (int y = 0; y < kLumaHeight; ++y) {
+      for (int x = 0; x < kLumaWidth; ++x) {
+        grain_buffer_[y * kLumaWidth + x] = grain_min + rnd(random_range);
+      }
+    }
+  }
+
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    construct_noise_stripes_func_(grain_buffer_, 68, kFrameWidth, kFrameHeight,
+                                  subsampling_x, subsampling_y,
+                                  &noise_stripes_);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  if (num_runs > 1) {
+    printf(
+        "ConstructNoiseStripes Speed Test for overlap=%d, sub_x=%d, "
+        "sub_y=%d: %d us\n",
+        overlap_flag, subsampling_x, subsampling_y,
+        static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+    return;
+  }
+  if (compare) {
+    base_construct_noise_stripes_func_(grain_buffer_, 68, kFrameWidth,
+                                       kFrameHeight, subsampling_x,
+                                       subsampling_y, &base_noise_stripes_);
+
+    constexpr int kCompareWidth = 64;
+    for (int stripe = 0; stripe < kNumTestStripes; ++stripe) {
+      EXPECT_TRUE(test_utils::CompareBlocks(
+          noise_stripes_[stripe], base_noise_stripes_[stripe], kCompareWidth,
+          stripe_height, stripe_width, stripe_width, /*check_padding=*/false,
+          /*print_diff=*/false));
+    }
+  } else {
+    test_utils::CheckMd5Digest(
+        "FilmGrain",
+        absl::StrFormat("ConstructNoiseStripes overlap=%d, sub_x=%d, sub_y=%d",
+                        overlap_flag, subsampling_x, subsampling_y)
+            .c_str(),
+        GetConstructStripesTestDigest(bitdepth, overlap_flag, subsampling_x,
+                                      subsampling_y),
+        noise_stripes_[0], stripe_buffer_size, elapsed_time);
+  }
+}
+
+using ConstructStripesTest8bpp = ConstructStripesTest<8>;
+
+TEST_P(ConstructStripesTest8bpp, RandomValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/1,
+                            /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructStripesTest8bpp, SaturatedValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/1,
+                            /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructStripesTest8bpp, DISABLED_Speed) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/500,
+                            /*saturate=*/false, /*compare=*/false);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ConstructStripesTest10bpp = ConstructStripesTest<10>;
+
+TEST_P(ConstructStripesTest10bpp, RandomValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/1,
+                            /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructStripesTest10bpp, SaturatedValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/1,
+                            /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructStripesTest10bpp, DISABLED_Speed) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/500,
+                            /*saturate=*/false, /*compare=*/false);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ConstructStripesTest12bpp = ConstructStripesTest<12>;
+
+TEST_P(ConstructStripesTest12bpp, RandomValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/1,
+                            /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructStripesTest12bpp, SaturatedValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/1,
+                            /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructStripesTest12bpp, DISABLED_Speed) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/500,
+                            /*saturate=*/false, /*compare=*/false);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest8bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest10bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest12bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+template <int bitdepth>
+class ConstructImageTest : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  ConstructImageTest() {
+    FilmGrainInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    base_construct_noise_image_overlap_func_ =
+        dsp->film_grain.construct_noise_image_overlap;
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_construct_noise_image_overlap_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+      FilmGrainInit_NEON();
+#endif
+    }
+    construct_noise_image_overlap_func_ =
+        dsp->film_grain.construct_noise_image_overlap;
+  }
+
+  ~ConstructImageTest() override = default;
+
+ protected:
+  // |compare| determines whether to compare the output blocks of the SIMD
+  // implementation, if one is in use, against the C implementation.
+  // |saturate| determines whether to set all inputs to the maximum value,
+  // which makes output differences easier to interpret when debugging.
+  void TestConstructNoiseImage(int overlap_flag, int subsampling_x,
+                               int subsampling_y, int num_runs, bool saturate,
+                               bool compare);
+  ConstructNoiseImageOverlapFunc construct_noise_image_overlap_func_;
+  ConstructNoiseImageOverlapFunc base_construct_noise_image_overlap_func_;
+  Array2DView<GrainType> noise_stripes_;
+  // Owns the memory that noise_stripes_ points to.
+  std::unique_ptr<GrainType[]> stripe_buffer_;
+  Array2D<GrainType> noise_image_;
+  Array2D<GrainType> base_noise_image_;
+};
+
+template <int bitdepth>
+void ConstructImageTest<bitdepth>::TestConstructNoiseImage(
+    int overlap_flag, int subsampling_x, int subsampling_y, int num_runs,
+    bool saturate, bool compare) {
+  if (construct_noise_image_overlap_func_ == nullptr) return;
+  // |compare| is only meaningful for the NEON tests, which check the SIMD
+  // output against the C output.
+  if (base_construct_noise_image_overlap_func_ == nullptr && compare) return;
+
+  const int image_width = ((kFrameWidth + subsampling_x) >> subsampling_x);
+  const int image_height = ((kFrameHeight + subsampling_y) >> subsampling_y);
+  const int stripe_height =
+      ((kNoiseStripeHeight + subsampling_y) >> subsampling_y);
+  const int image_stride = image_width + kNoiseImagePadding;
+  const int stripe_size = stripe_height * image_width;
+  if (compare) {
+    ASSERT_TRUE(base_noise_image_.Reset(image_height, image_stride,
+                                        /*zero_initialize=*/false));
+  }
+  ASSERT_TRUE(noise_image_.Reset(image_height, image_stride,
+                                 /*zero_initialize=*/false));
+  // Stride between stripe rows is |image_width|. Padding is only at the
+  // end of the final row of the final stripe to protect from overreads.
+  stripe_buffer_.reset(
+      new (std::nothrow)
+          GrainType[kNumTestStripes * stripe_size + kNoiseStripePadding]);
+  ASSERT_NE(stripe_buffer_, nullptr);
+  noise_stripes_.Reset(kNumTestStripes, stripe_size, stripe_buffer_.get());
+
+  const int grain_max = GetGrainMax<bitdepth>();
+  const int grain_min = GetGrainMin<bitdepth>();
+  if (saturate) {
+    for (int i = 0; i < stripe_size; ++i) {
+      noise_stripes_[0][i] = grain_max;
+    }
+    for (int stripe = 1; stripe < kNumTestStripes; ++stripe) {
+      memcpy(noise_stripes_[stripe], noise_stripes_[0],
+             stripe_size * sizeof(noise_stripes_[0][0]));
+    }
+  } else {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    // Allow any valid grain values.
+    const int random_range = grain_max - grain_min + 1;
+    for (int stripe = 0; stripe < kNumTestStripes; ++stripe) {
+      // Assign all allocated memory for this stripe.
+      for (int i = 0; i < stripe_height; ++i) {
+        for (int x = 0; x < image_width; ++x) {
+          noise_stripes_[stripe][i * image_width + x] =
+              grain_min + rnd(random_range);
+        }
+      }
+    }
+  }
+
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    FilmGrain<bitdepth>::ConstructNoiseImage(
+        &noise_stripes_, kFrameWidth, kFrameHeight, subsampling_x,
+        subsampling_y, overlap_flag << (1 - subsampling_y), &noise_image_);
+    if (overlap_flag == 1) {
+      construct_noise_image_overlap_func_(&noise_stripes_, kFrameWidth,
+                                          kFrameHeight, subsampling_x,
+                                          subsampling_y, &noise_image_);
+    }
+  }
+
+  const absl::Duration elapsed_time = absl::Now() - start;
+  if (num_runs > 1) {
+    printf(
+        "ConstructNoiseImage Speed Test for overlap=%d, sub_x=%d, "
+        "sub_y=%d: %d us\n",
+        overlap_flag, subsampling_x, subsampling_y,
+        static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+    return;
+  }
+  if (compare) {
+    FilmGrain<bitdepth>::ConstructNoiseImage(
+        &noise_stripes_, kFrameWidth, kFrameHeight, subsampling_x,
+        subsampling_y, overlap_flag << (1 - subsampling_y), &base_noise_image_);
+    if (overlap_flag == 1) {
+      base_construct_noise_image_overlap_func_(
+          &noise_stripes_, kFrameWidth, kFrameHeight, subsampling_x,
+          subsampling_y, &base_noise_image_);
+    }
+    constexpr int kCompareWidth = 72;
+    constexpr int kCompareHeight = 72;
+    EXPECT_TRUE(test_utils::CompareBlocks(
+        noise_image_[0], base_noise_image_[0], kCompareWidth, kCompareHeight,
+        image_stride, image_stride, /*check_padding=*/false,
+        /*print_diff=*/false));
+  } else {
+    printf("BD%d \"%s\",\n", bitdepth,
+           test_utils::GetMd5Sum(noise_image_[0], image_width, image_height,
+                                 image_stride)
+               .c_str());
+    test_utils::CheckMd5Digest(
+        "FilmGrain",
+        absl::StrFormat("ConstructNoiseImage overlap=%d, sub_x=%d, sub_y=%d",
+                        overlap_flag, subsampling_x, subsampling_y)
+            .c_str(),
+        GetConstructImageTestDigest(bitdepth, overlap_flag, subsampling_x,
+                                    subsampling_y),
+        noise_image_[0], image_width, image_height, image_stride, elapsed_time);
+  }
+}
+
+using ConstructImageTest8bpp = ConstructImageTest<8>;
+
+TEST_P(ConstructImageTest8bpp, RandomValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+                          test_params.subsampling_y, /*num_runs=*/1,
+                          /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructImageTest8bpp, SaturatedValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+                          test_params.subsampling_y, /*num_runs=*/1,
+                          /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructImageTest8bpp, DISABLED_Speed) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+                          test_params.subsampling_y, /*num_runs=*/500,
+                          /*saturate=*/false, /*compare=*/false);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ConstructImageTest10bpp = ConstructImageTest<10>;
+
+TEST_P(ConstructImageTest10bpp, RandomValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+                          test_params.subsampling_y, /*num_runs=*/1,
+                          /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructImageTest10bpp, SaturatedValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+                          test_params.subsampling_y, /*num_runs=*/1,
+                          /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructImageTest10bpp, DISABLED_Speed) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+                          test_params.subsampling_y, /*num_runs=*/500,
+                          /*saturate=*/false, /*compare=*/false);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ConstructImageTest12bpp = ConstructImageTest<12>;
+
+TEST_P(ConstructImageTest12bpp, RandomValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+                          test_params.subsampling_y, /*num_runs=*/1,
+                          /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructImageTest12bpp, SaturatedValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+                          test_params.subsampling_y, /*num_runs=*/1,
+                          /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructImageTest12bpp, DISABLED_Speed) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+                          test_params.subsampling_y, /*num_runs=*/500,
+                          /*saturate=*/false, /*compare=*/false);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest8bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ConstructImageTest8bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest10bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest12bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+template <int bitdepth>
+class ScalingLookupTableTest : public testing::TestWithParam<int> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  ScalingLookupTableTest() {
+    test_utils::ResetDspTable(bitdepth);
+    FilmGrainInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+      FilmGrainInit_NEON();
+#endif
+    }
+    initialize_func_ = dsp->film_grain.initialize_scaling_lut;
+  }
+  ~ScalingLookupTableTest() override = default;
+
+ protected:
+  void TestSpeed(int num_runs);
+  void ZeroPoints();
+
+ private:
+  static constexpr int kScalingLutBufferLength =
+      (kScalingLookupTableSize + kScalingLookupTablePadding) << (bitdepth - 8);
+  dsp::InitializeScalingLutFunc initialize_func_;
+  int16_t scaling_lut_[kScalingLutBufferLength];
+};
+
+template <int bitdepth>
+void ScalingLookupTableTest<bitdepth>::TestSpeed(int num_runs) {
+  if (initialize_func_ == nullptr) return;
+  const int param_index = GetParam();
+  const FilmGrainParams& params = kFilmGrainParams[param_index];
+  // Zero the table before timing so buffer initialization is not counted in
+  // the speed measurement.
+  Memset(scaling_lut_, 0, kScalingLutBufferLength);
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    initialize_func_(params.num_y_points, params.point_y_value,
+                     params.point_y_scaling, scaling_lut_,
+                     kScalingLutBufferLength);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  if (num_runs > 1) {
+    printf("InitializeScalingLut: %d us\n",
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+    return;
+  }
+  test_utils::CheckMd5Digest(
+      "FilmGrain",
+      absl::StrFormat("InitializeScalingLut for param set: %d", param_index)
+          .c_str(),
+      GetScalingInitTestDigest(param_index, bitdepth), scaling_lut_,
+      (sizeof(scaling_lut_[0]) * kScalingLookupTableSize) << (bitdepth - 8),
+      elapsed_time);
+}
+
+template <int bitdepth>
+void ScalingLookupTableTest<bitdepth>::ZeroPoints() {
+  if (initialize_func_ == nullptr) return;
+  const int param_index = GetParam();
+  const FilmGrainParams& params = kFilmGrainParams[param_index];
+  initialize_func_(0, params.point_y_value, params.point_y_scaling,
+                   scaling_lut_, kScalingLookupTableSize);
+  for (int i = 0; i < kScalingLookupTableSize; ++i) {
+    ASSERT_EQ(scaling_lut_[i], 0);
+  }
+}
+
+using ScalingLookupTableTest8bpp = ScalingLookupTableTest<8>;
+
+TEST_P(ScalingLookupTableTest8bpp, ZeroPoints) { ZeroPoints(); }
+
+TEST_P(ScalingLookupTableTest8bpp, Correctness) { TestSpeed(/*num_runs=*/1); }
+
+TEST_P(ScalingLookupTableTest8bpp, DISABLED_Speed) {
+  TestSpeed(/*num_runs=*/1e5);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ScalingLookupTableTest10bpp = ScalingLookupTableTest<10>;
+
+TEST_P(ScalingLookupTableTest10bpp, ZeroPoints) { ZeroPoints(); }
+
+TEST_P(ScalingLookupTableTest10bpp, Correctness) { TestSpeed(/*num_runs=*/1); }
+
+TEST_P(ScalingLookupTableTest10bpp, DISABLED_Speed) {
+  TestSpeed(/*num_runs=*/1e5);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ScalingLookupTableTest12bpp = ScalingLookupTableTest<12>;
+
+TEST_P(ScalingLookupTableTest12bpp, ZeroPoints) { ZeroPoints(); }
+
+TEST_P(ScalingLookupTableTest12bpp, Correctness) { TestSpeed(/*num_runs=*/1); }
+
+TEST_P(ScalingLookupTableTest12bpp, DISABLED_Speed) {
+  TestSpeed(/*num_runs=*/1e5);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest8bpp,
+                         testing::Range(0, kNumFilmGrainTestParams));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ScalingLookupTableTest8bpp,
+                         testing::Range(0, kNumFilmGrainTestParams));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest10bpp,
+                         testing::Range(0, kNumFilmGrainTestParams));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ScalingLookupTableTest10bpp,
+                         testing::Range(0, kNumFilmGrainTestParams));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest12bpp,
+                         testing::Range(0, kNumFilmGrainTestParams));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+struct BlendNoiseTestParam {
+  explicit BlendNoiseTestParam(const std::tuple<int, int>& in)
+      : chroma_scaling_from_luma(std::get<0>(in)) {
+    switch (std::get<1>(in)) {
+      case 0:
+        subsampling_x = 0;
+        subsampling_y = 0;
+        break;
+      case 1:
+        subsampling_x = 1;
+        subsampling_y = 0;
+        break;
+      default:
+        assert(std::get<1>(in) == 2);
+        subsampling_x = 1;
+        subsampling_y = 1;
+    }
+  }
+  const int chroma_scaling_from_luma;
+  int subsampling_x;
+  int subsampling_y;
+};
+
+template <int bitdepth, typename Pixel>
+class BlendNoiseTest : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+  ~BlendNoiseTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    FilmGrainInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+      FilmGrainInit_NEON();
+#endif
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      FilmGrainInit_SSE4_1();
+    }
+    const BlendNoiseTestParam test_param(GetParam());
+    chroma_scaling_from_luma_ = test_param.chroma_scaling_from_luma;
+    blend_luma_func_ = dsp->film_grain.blend_noise_luma;
+    blend_chroma_func_ =
+        dsp->film_grain.blend_noise_chroma[chroma_scaling_from_luma_];
+    subsampling_x_ = test_param.subsampling_x;
+    subsampling_y_ = test_param.subsampling_y;
+
+    uv_width_ = (width_ + subsampling_x_) >> subsampling_x_;
+    uv_height_ = (height_ + subsampling_y_) >> subsampling_y_;
+    uv_stride_ = uv_width_ * sizeof(Pixel);
+    y_stride_ = width_ * sizeof(Pixel);
+    const size_t buffer_size =
+        sizeof(Pixel) * (width_ * height_ + 2 * uv_width_ * uv_height_ +
+                         3 * kBorderPixelsFilmGrain);
+    source_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+    memset(source_buffer_.get(), 0, sizeof(source_buffer_[0]) * buffer_size);
+    dest_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+    memset(dest_buffer_.get(), 0, sizeof(dest_buffer_[0]) * buffer_size);
+    source_plane_y_ = source_buffer_.get();
+    source_plane_u_ =
+        source_plane_y_ + y_stride_ * height_ + kBorderPixelsFilmGrain;
+    source_plane_v_ =
+        source_plane_u_ + uv_stride_ * uv_height_ + kBorderPixelsFilmGrain;
+    dest_plane_y_ = dest_buffer_.get();
+    dest_plane_u_ =
+        dest_plane_y_ + y_stride_ * height_ + kBorderPixelsFilmGrain;
+    dest_plane_v_ =
+        dest_plane_u_ + uv_stride_ * uv_height_ + kBorderPixelsFilmGrain;
+  }
+
+  void TestSpeed(int num_runs);
+
+ private:
+  static constexpr int kScalingLutBufferLength =
+      (kScalingLookupTableSize + kScalingLookupTablePadding) << 2;
+
+  void ConvertScalingLut10bpp(int16_t* scaling_lut_10bpp,
+                              const int16_t* src_scaling_lut);
+  dsp::BlendNoiseWithImageLumaFunc blend_luma_func_;
+  dsp::BlendNoiseWithImageChromaFunc blend_chroma_func_;
+
+  const int width_ = 1921;
+  const int height_ = 1081;
+  int chroma_scaling_from_luma_ = 0;
+  int subsampling_x_ = 0;
+  int subsampling_y_ = 0;
+  int uv_width_ = 0;
+  int uv_height_ = 0;
+  int uv_stride_ = 0;
+  int y_stride_ = 0;
+  // This holds the data that |source_plane_y_|, |source_plane_u_|, and
+  // |source_plane_v_| point to.
+  std::unique_ptr<uint8_t[]> source_buffer_;
+  // This holds the data that |dest_plane_y_|, |dest_plane_u_|, and
+  // |dest_plane_v_| point to.
+  std::unique_ptr<uint8_t[]> dest_buffer_;
+  uint8_t* source_plane_y_ = nullptr;
+  uint8_t* source_plane_u_ = nullptr;
+  uint8_t* source_plane_v_ = nullptr;
+  uint8_t* dest_plane_y_ = nullptr;
+  uint8_t* dest_plane_u_ = nullptr;
+  uint8_t* dest_plane_v_ = nullptr;
+  Array2D<GrainType> noise_image_[kMaxPlanes];
+  int16_t scaling_lut_10bpp_y_[kScalingLutBufferLength];
+  int16_t scaling_lut_10bpp_u_[kScalingLutBufferLength];
+  int16_t scaling_lut_10bpp_v_[kScalingLutBufferLength];
+};
+
+template <int bitdepth, typename Pixel>
+void BlendNoiseTest<bitdepth, Pixel>::ConvertScalingLut10bpp(
+    int16_t* scaling_lut_10bpp, const int16_t* src_scaling_lut) {
+  // The final entry is flat: |end_index| clamps to the last table element.
+  for (int i = 0; i < kScalingLookupTableSize; ++i) {
+    const int x_base = i << 2;
+    const int start = src_scaling_lut[i];
+    const int end_index = std::min(i + 1, kScalingLookupTableSize - 1);
+    const int end = src_scaling_lut[end_index];
+    const int delta = end - start;
+    scaling_lut_10bpp[x_base] = start;
+    scaling_lut_10bpp[x_base + 1] = start + RightShiftWithRounding(delta, 2);
+    scaling_lut_10bpp[x_base + 2] =
+        start + RightShiftWithRounding(2 * delta, 2);
+    scaling_lut_10bpp[x_base + 3] =
+        start + RightShiftWithRounding(3 * delta, 2);
+  }
+}
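+
+// A worked example of the interpolation above: if src_scaling_lut[i] = 100
+// and src_scaling_lut[i + 1] = 104, then delta = 4 and the four expanded
+// entries are 100, 100 + RightShiftWithRounding(4, 2) = 101,
+// 100 + RightShiftWithRounding(8, 2) = 102, and
+// 100 + RightShiftWithRounding(12, 2) = 103.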
+
+template <int bitdepth, typename Pixel>
+void BlendNoiseTest<bitdepth, Pixel>::TestSpeed(const int num_runs) {
+  if (blend_chroma_func_ == nullptr || blend_luma_func_ == nullptr) return;
+  // Allow optimized code to read into the border without generating MSan
+  // warnings. This matches the behavior in FilmGrain::AllocateNoiseImage().
+  constexpr bool zero_initialize = LIBGAV1_MSAN == 1;
+  ASSERT_TRUE(noise_image_[kPlaneY].Reset(height_, width_ + kNoiseImagePadding,
+                                          zero_initialize));
+  ASSERT_TRUE(noise_image_[kPlaneU].Reset(
+      uv_height_, uv_width_ + kNoiseImagePadding, zero_initialize));
+  ASSERT_TRUE(noise_image_[kPlaneV].Reset(
+      uv_height_, uv_width_ + kNoiseImagePadding, zero_initialize));
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  // Allow any valid grain values.
+  const int grain_max = GetGrainMax<bitdepth>();
+  const int grain_min = GetGrainMin<bitdepth>();
+  const int random_range = grain_max - grain_min + 1;
+  auto* src_y = reinterpret_cast<Pixel*>(source_plane_y_);
+  auto* src_u = reinterpret_cast<Pixel*>(source_plane_u_);
+  auto* src_v = reinterpret_cast<Pixel*>(source_plane_v_);
+  for (int y = 0; y < height_; ++y) {
+    for (int x = 0; x < width_; ++x) {
+      const int random_source_y = rnd(random_range);
+      // Populating the luma source ensures the lookup table is tested. Chroma
+      // planes are given identical values. Giving them different values would
+      // artificially differentiate the outputs. It's important that the test
+      // expect that different outputs are caused by the different scaling
+      // lookup tables, rather than by different inputs.
+      const int uv_y_pos = y >> subsampling_y_;
+      const int uv_x_pos = x >> subsampling_x_;
+      src_y[y * width_ + x] = random_source_y;
+      src_u[uv_y_pos * uv_width_ + uv_x_pos] = random_source_y;
+      src_v[uv_y_pos * uv_width_ + uv_x_pos] = random_source_y;
+      const int random_y = rnd(random_range);
+      noise_image_[kPlaneY][y][x] = random_y + grain_min;
+      const int random_u = rnd(random_range);
+      noise_image_[kPlaneU][uv_y_pos][uv_x_pos] = random_u + grain_min;
+      const int random_v = rnd(random_range);
+      noise_image_[kPlaneV][uv_y_pos][uv_x_pos] = random_v + grain_min;
+    }
+  }
+  static constexpr int16_t kTestScalingLutY[kScalingLookupTableSize] = {
+      72,  72,  72,  72,  72,  72,  72,  72,  72,  72,  72,  72,  72,  72,  73,
+      75,  76,  77,  79,  80,  81,  83,  84,  86,  87,  88,  90,  91,  92,  92,
+      93,  93,  94,  95,  95,  96,  97,  97,  98,  98,  99,  99,  99,  99,  98,
+      98,  98,  98,  98,  98,  98,  97,  97,  97,  97,  97,  97,  97,  97,  97,
+      97,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,
+      99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  100, 100,
+      100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+      101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+      101, 101, 101, 101, 101, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+      102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+      102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+      102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+      102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+      102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+      102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+      102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+      102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+      102, 102,
+  };
+  static constexpr int16_t kTestScalingLutU[kScalingLookupTableSize] = {
+      30,  42,  53,  65,  74,  74,  74,  74,  74,  74,  74,  74,  74,  74,  74,
+      75,  76,  78,  79,  81,  82,  83,  85,  86,  88,  89,  91,  92,  93,  93,
+      94,  94,  95,  95,  96,  96,  97,  97,  98,  98,  99,  99,  99,  99,  99,
+      99,  99,  99,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,
+      98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,
+      98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  99,  99,
+      99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,
+      99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,
+      99,  99,  99,  99,  99,  99,  100, 100, 100, 100, 100, 100, 100, 100, 100,
+      100, 100, 100, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,
+      110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110,
+      98,  98,  98,  98,  98,  98,  98,  97,  97,  97,  97,  97,  97,  97,  97,
+      97,  97,  97,  97,  97,  97,  97,  97,  97,  97,  97,  97,  97,  97,  97,
+      97,  97,  97,  97,  97,  97,  97,  97,  97,  97,  96,  96,  96,  96,  96,
+      96,  96,  96,  96,  96,  96,  96,  96,  96,  96,  96,  96,  96,  96,  96,
+      96,  96,  96,  96,  96,  96,  96,  96,  96,  96,  96,  96,  96,  96,  95,
+      95,  95,  95,  95,  95,  95,  95,  95,  95,  95,  95,  95,  95,  95,  95,
+      95,  95,
+  };
+  static constexpr int16_t kTestScalingLutV[kScalingLookupTableSize] = {
+      73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  74,  74,  74,
+      75,  75,  78,  79,  81,  82,  83,  85,  86,  88,  89,  91,  92,  93,  93,
+      94,  94,  95,  95,  96,  96,  97,  97,  98,  98,  99,  99,  99,  99,  98,
+      98,  98,  98,  98,  98,  98,  97,  97,  97,  97,  97,  97,  97,  97,  97,
+      97,  97,  97,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,  98,
+      98,  98,  98,  98,  98,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,
+      99,  99,  99,  99,  99,  99,  100, 100, 100, 100, 100, 100, 100, 100, 100,
+      100, 100, 100, 100, 100, 100, 100, 100, 101, 101, 101, 101, 101, 101, 101,
+      101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+      101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+      101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+      101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+      101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+      101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+      150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150,
+      180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180,
+      200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
+      255, 255,
+  };
+
+  if (bitdepth == 10) {
+    // Each call fills the entire 10bpp table, so convert each plane's table
+    // exactly once.
+    ConvertScalingLut10bpp(scaling_lut_10bpp_y_, kTestScalingLutY);
+    ConvertScalingLut10bpp(scaling_lut_10bpp_u_, kTestScalingLutU);
+    ConvertScalingLut10bpp(scaling_lut_10bpp_v_, kTestScalingLutV);
+  }
+  const FilmGrainParams& params = kFilmGrainParams[0];
+  const int min_value = 16 << (bitdepth - 8);
+  const int max_value = 235 << (bitdepth - 8);
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    if (chroma_scaling_from_luma_) {
+      blend_chroma_func_(
+          kPlaneU, params, noise_image_, min_value, max_value, width_, height_,
+          /*start_height=*/0, subsampling_x_, subsampling_y_,
+          (bitdepth == 10) ? scaling_lut_10bpp_y_ : kTestScalingLutY,
+          source_plane_y_, y_stride_, source_plane_u_, uv_stride_,
+          dest_plane_u_, uv_stride_);
+      blend_chroma_func_(
+          kPlaneV, params, noise_image_, min_value, max_value, width_, height_,
+          /*start_height=*/0, subsampling_x_, subsampling_y_,
+          (bitdepth == 10) ? scaling_lut_10bpp_y_ : kTestScalingLutY,
+          source_plane_y_, y_stride_, source_plane_v_, uv_stride_,
+          dest_plane_v_, uv_stride_);
+    } else {
+      blend_chroma_func_(
+          kPlaneU, params, noise_image_, min_value, max_value, width_, height_,
+          /*start_height=*/0, subsampling_x_, subsampling_y_,
+          (bitdepth == 10) ? scaling_lut_10bpp_u_ : kTestScalingLutU,
+          source_plane_y_, y_stride_, source_plane_u_, uv_stride_,
+          dest_plane_u_, uv_stride_);
+      blend_chroma_func_(
+          kPlaneV, params, noise_image_, min_value, max_value, width_, height_,
+          /*start_height=*/0, subsampling_x_, subsampling_y_,
+          (bitdepth == 10) ? scaling_lut_10bpp_v_ : kTestScalingLutV,
+          source_plane_y_, y_stride_, source_plane_v_, uv_stride_,
+          dest_plane_v_, uv_stride_);
+    }
+    blend_luma_func_(noise_image_, min_value, max_value, params.chroma_scaling,
+                     width_, height_, /*start_height=*/0,
+                     (bitdepth == 10) ? scaling_lut_10bpp_y_ : kTestScalingLutY,
+                     source_plane_y_, y_stride_, dest_plane_y_, y_stride_);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  const char* digest_luma = GetBlendLumaTestDigest(bitdepth);
+  printf("YBD%d \"%s\",\n", bitdepth,
+         test_utils::GetMd5Sum(dest_plane_y_, y_stride_ * height_).c_str());
+  printf("UBD%d \"%s\",\n", bitdepth,
+         test_utils::GetMd5Sum(dest_plane_u_, uv_stride_ * uv_height_).c_str());
+  printf("VBD%d \"%s\",\n", bitdepth,
+         test_utils::GetMd5Sum(dest_plane_v_, uv_stride_ * uv_height_).c_str());
+  test_utils::CheckMd5Digest(
+      "BlendNoiseWithImage",
+      absl::StrFormat("Luma cfl=%d, sub_x=%d, sub_y=%d",
+                      chroma_scaling_from_luma_, subsampling_x_, subsampling_y_)
+          .c_str(),
+      digest_luma, dest_plane_y_, y_stride_ * height_, elapsed_time);
+  const char* digest_chroma_u = GetBlendChromaUTestDigest(
+      bitdepth, chroma_scaling_from_luma_, subsampling_x_, subsampling_y_);
+  test_utils::CheckMd5Digest(
+      "BlendNoiseWithImage",
+      absl::StrFormat("ChromaU cfl=%d, sub_x=%d, sub_y=%d",
+                      chroma_scaling_from_luma_, subsampling_x_, subsampling_y_)
+          .c_str(),
+      digest_chroma_u, dest_plane_u_, uv_stride_ * uv_height_, elapsed_time);
+  const char* digest_chroma_v = GetBlendChromaVTestDigest(
+      bitdepth, chroma_scaling_from_luma_, subsampling_x_, subsampling_y_);
+  test_utils::CheckMd5Digest(
+      "BlendNoiseWithImage",
+      absl::StrFormat("ChromaV cfl=%d, sub_x=%d, sub_y=%d",
+                      chroma_scaling_from_luma_, subsampling_x_, subsampling_y_)
+          .c_str(),
+      digest_chroma_v, dest_plane_v_, uv_stride_ * uv_height_, elapsed_time);
+}
+
+using BlendNoiseTest8bpp = BlendNoiseTest<8, uint8_t>;
+
+TEST_P(BlendNoiseTest8bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(BlendNoiseTest8bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, BlendNoiseTest8bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, BlendNoiseTest8bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, BlendNoiseTest8bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using BlendNoiseTest10bpp = BlendNoiseTest<10, uint16_t>;
+
+TEST_P(BlendNoiseTest10bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(BlendNoiseTest10bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, BlendNoiseTest10bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, BlendNoiseTest10bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, BlendNoiseTest10bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using BlendNoiseTest12bpp = BlendNoiseTest<12, uint16_t>;
+
+TEST_P(BlendNoiseTest12bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(BlendNoiseTest12bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, BlendNoiseTest12bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+template <int bitdepth, typename Pixel>
+class FilmGrainSpeedTest : public testing::TestWithParam<int> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  ~FilmGrainSpeedTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    FilmGrainInit_C();
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+      FilmGrainInit_NEON();
+#endif
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+      FilmGrainInit_SSE4_1();
+    }
+    uv_width_ = (width_ + subsampling_x_) >> subsampling_x_;
+    uv_height_ = (height_ + subsampling_y_) >> subsampling_y_;
+    uv_stride_ = uv_width_ * sizeof(Pixel);
+    y_stride_ = width_ * sizeof(Pixel);
+    const size_t buffer_size =
+        sizeof(Pixel) * (width_ * height_ + 2 * uv_width_ * uv_height_);
+    source_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+    memset(source_buffer_.get(), 0, sizeof(source_buffer_[0]) * buffer_size);
+    dest_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+    memset(dest_buffer_.get(), 0, sizeof(dest_buffer_[0]) * buffer_size);
+    source_plane_y_ = source_buffer_.get();
+    source_plane_u_ = source_plane_y_ + y_stride_ * height_;
+    source_plane_v_ = source_plane_u_ + uv_stride_ * uv_height_;
+    dest_plane_y_ = dest_buffer_.get();
+    dest_plane_u_ = dest_plane_y_ + y_stride_ * height_;
+    dest_plane_v_ = dest_plane_u_ + uv_stride_ * uv_height_;
+    const int num_threads = GetParam();
+    thread_pool_ = ThreadPool::Create(num_threads);
+  }
+
+  void TestSpeed(int num_runs);
+
+ private:
+  const int width_ = 1920;
+  const int height_ = 1080;
+  const int subsampling_x_ = 1;
+  const int subsampling_y_ = 1;
+  int uv_width_ = 0;
+  int uv_height_ = 0;
+  int uv_stride_ = 0;
+  int y_stride_ = 0;
+  std::unique_ptr<uint8_t[]> source_buffer_;
+  std::unique_ptr<uint8_t[]> dest_buffer_;
+  const uint8_t* source_plane_y_ = nullptr;
+  const uint8_t* source_plane_u_ = nullptr;
+  const uint8_t* source_plane_v_ = nullptr;
+  uint8_t* dest_plane_y_ = nullptr;
+  uint8_t* dest_plane_u_ = nullptr;
+  uint8_t* dest_plane_v_ = nullptr;
+  std::unique_ptr<ThreadPool> thread_pool_;
+};
+
+// Each run of the speed test adds film grain noise to 10 dummy frames. The
+// film grain parameters for the 10 frames were generated with aomenc.
+template <int bitdepth, typename Pixel>
+void FilmGrainSpeedTest<bitdepth, Pixel>::TestSpeed(const int num_runs) {
+  const dsp::Dsp* dsp = GetDspTable(bitdepth);
+  if (dsp->film_grain.blend_noise_chroma[0] == nullptr ||
+      dsp->film_grain.blend_noise_luma == nullptr) {
+    return;
+  }
+  for (int k = 0; k < kNumFilmGrainTestParams; ++k) {
+    const FilmGrainParams& params = kFilmGrainParams[k];
+    const absl::Time start = absl::Now();
+    for (int i = 0; i < num_runs; ++i) {
+      FilmGrain<bitdepth> film_grain(params, /*is_monochrome=*/false,
+                                     /*color_matrix_is_identity=*/false,
+                                     subsampling_x_, subsampling_y_, width_,
+                                     height_, thread_pool_.get());
+      EXPECT_TRUE(film_grain.AddNoise(
+          source_plane_y_, y_stride_, source_plane_u_, source_plane_v_,
+          uv_stride_, dest_plane_y_, y_stride_, dest_plane_u_, dest_plane_v_,
+          uv_stride_));
+    }
+    const absl::Duration elapsed_time = absl::Now() - start;
+    const char* digest_luma = GetTestDigestLuma(bitdepth, k);
+    test_utils::CheckMd5Digest(
+        "FilmGrainSynthesisLuma",
+        absl::StrFormat("kFilmGrainParams[%d]", k).c_str(), digest_luma,
+        dest_plane_y_, y_stride_ * height_, elapsed_time);
+    const char* digest_chroma_u = GetTestDigestChromaU(bitdepth, k);
+    test_utils::CheckMd5Digest(
+        "FilmGrainSynthesisChromaU",
+        absl::StrFormat("kFilmGrainParams[%d]", k).c_str(), digest_chroma_u,
+        dest_plane_u_, uv_stride_ * uv_height_, elapsed_time);
+    const char* digest_chroma_v = GetTestDigestChromaV(bitdepth, k);
+    test_utils::CheckMd5Digest(
+        "FilmGrainSynthesisChromaV",
+        absl::StrFormat("kFilmGrainParams[%d]", k).c_str(), digest_chroma_v,
+        dest_plane_v_, uv_stride_ * uv_height_, elapsed_time);
+  }
+}
+
+using FilmGrainSpeedTest8bpp = FilmGrainSpeedTest<8, uint8_t>;
+
+TEST_P(FilmGrainSpeedTest8bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(FilmGrainSpeedTest8bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, FilmGrainSpeedTest8bpp, testing::Values(0, 3, 8));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, FilmGrainSpeedTest8bpp,
+                         testing::Values(0, 3, 8));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilmGrainSpeedTest8bpp,
+                         testing::Values(0, 3, 8));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using FilmGrainSpeedTest10bpp = FilmGrainSpeedTest<10, uint16_t>;
+
+TEST_P(FilmGrainSpeedTest10bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(FilmGrainSpeedTest10bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, FilmGrainSpeedTest10bpp, testing::Values(0, 3, 8));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, FilmGrainSpeedTest10bpp,
+                         testing::Values(0, 3, 8));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilmGrainSpeedTest10bpp,
+                         testing::Values(0, 3, 8));
+#endif
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using FilmGrainSpeedTest12bpp = FilmGrainSpeedTest<12, uint16_t>;
+
+TEST_P(FilmGrainSpeedTest12bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(FilmGrainSpeedTest12bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, FilmGrainSpeedTest12bpp, testing::Values(0, 3, 8));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+}  // namespace film_grain
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/frame_buffer.cc b/src/frame_buffer.cc
new file mode 100644 (file)
index 0000000..50c7756
--- /dev/null
@@ -0,0 +1,151 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/frame_buffer.h"
+
+#include <cstdint>
+
+#include "src/frame_buffer_utils.h"
+#include "src/utils/common.h"
+
+extern "C" {
+
+Libgav1StatusCode Libgav1ComputeFrameBufferInfo(
+    int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+    int left_border, int right_border, int top_border, int bottom_border,
+    int stride_alignment, Libgav1FrameBufferInfo* info) {
+  switch (bitdepth) {
+    case 8:
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    case 10:
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+    case 12:
+#endif
+      break;
+    default:
+      return kLibgav1StatusInvalidArgument;
+  }
+  switch (image_format) {
+    case kLibgav1ImageFormatYuv420:
+    case kLibgav1ImageFormatYuv422:
+    case kLibgav1ImageFormatYuv444:
+    case kLibgav1ImageFormatMonochrome400:
+      break;
+    default:
+      return kLibgav1StatusInvalidArgument;
+  }
+  // All int arguments must be nonnegative. Borders must be multiples of 2.
+  // |stride_alignment| must be a power of 2.
+  if ((width | height | left_border | right_border | top_border |
+       bottom_border | stride_alignment) < 0 ||
+      ((left_border | right_border | top_border | bottom_border) & 1) != 0 ||
+      (stride_alignment & (stride_alignment - 1)) != 0 || info == nullptr) {
+    return kLibgav1StatusInvalidArgument;
+  }
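+
+  // For example, stride_alignment=16 passes the power-of-two check above
+  // (16 & 15 == 0), while stride_alignment=24 fails (24 & 23 == 16).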
+
+  bool is_monochrome;
+  int8_t subsampling_x;
+  int8_t subsampling_y;
+  libgav1::DecomposeImageFormat(image_format, &is_monochrome, &subsampling_x,
+                                &subsampling_y);
+
+  // Calculate y_stride (in bytes). It is padded to a multiple of
+  // |stride_alignment| bytes.
+  int y_stride = width + left_border + right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth > 8) y_stride *= sizeof(uint16_t);
+#endif
+  y_stride = libgav1::Align(y_stride, stride_alignment);
+  // Size of the Y buffer in bytes.
+  const uint64_t y_buffer_size =
+      (height + top_border + bottom_border) * static_cast<uint64_t>(y_stride) +
+      (stride_alignment - 1);
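+
+  // As a worked example (illustrative values: 8-bit, width=1920, height=1080,
+  // all borders 8, stride_alignment=16): y_stride = Align(1936, 16) = 1936
+  // bytes and y_buffer_size = 1096 * 1936 + 15 bytes. The stride_alignment - 1
+  // slack lets Libgav1SetFrameBuffer align the plane pointer with AlignAddr.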
+
+  const int uv_width =
+      is_monochrome ? 0 : libgav1::SubsampledValue(width, subsampling_x);
+  const int uv_height =
+      is_monochrome ? 0 : libgav1::SubsampledValue(height, subsampling_y);
+  const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+  const int uv_right_border = is_monochrome ? 0 : right_border >> subsampling_x;
+  const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+  const int uv_bottom_border =
+      is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+  // Calculate uv_stride (in bytes). It is padded to a multiple of
+  // |stride_alignment| bytes.
+  int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+  uv_stride = libgav1::Align(uv_stride, stride_alignment);
+  // Size of the U or V buffer in bytes.
+  const uint64_t uv_buffer_size =
+      is_monochrome ? 0
+                    : (uv_height + uv_top_border + uv_bottom_border) *
+                              static_cast<uint64_t>(uv_stride) +
+                          (stride_alignment - 1);
+
+  // Check if it is safe to cast y_buffer_size and uv_buffer_size to size_t.
+  if (y_buffer_size > SIZE_MAX || uv_buffer_size > SIZE_MAX) {
+    return kLibgav1StatusInvalidArgument;
+  }
+
+  int left_border_bytes = left_border;
+  int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth > 8) {
+    left_border_bytes *= sizeof(uint16_t);
+    uv_left_border_bytes *= sizeof(uint16_t);
+  }
+#endif
+
+  info->y_stride = y_stride;
+  info->uv_stride = uv_stride;
+  info->y_buffer_size = static_cast<size_t>(y_buffer_size);
+  info->uv_buffer_size = static_cast<size_t>(uv_buffer_size);
+  info->y_plane_offset = top_border * y_stride + left_border_bytes;
+  info->uv_plane_offset = uv_top_border * uv_stride + uv_left_border_bytes;
+  info->stride_alignment = stride_alignment;
+  return kLibgav1StatusOk;
+}
+
+Libgav1StatusCode Libgav1SetFrameBuffer(const Libgav1FrameBufferInfo* info,
+                                        uint8_t* y_buffer, uint8_t* u_buffer,
+                                        uint8_t* v_buffer,
+                                        void* buffer_private_data,
+                                        Libgav1FrameBuffer* frame_buffer) {
+  if (info == nullptr ||
+      (info->uv_buffer_size == 0 &&
+       (u_buffer != nullptr || v_buffer != nullptr)) ||
+      frame_buffer == nullptr) {
+    return kLibgav1StatusInvalidArgument;
+  }
+  if (y_buffer == nullptr || (info->uv_buffer_size != 0 &&
+                              (u_buffer == nullptr || v_buffer == nullptr))) {
+    return kLibgav1StatusOutOfMemory;
+  }
+  frame_buffer->plane[0] = libgav1::AlignAddr(y_buffer + info->y_plane_offset,
+                                              info->stride_alignment);
+  frame_buffer->plane[1] = libgav1::AlignAddr(u_buffer + info->uv_plane_offset,
+                                              info->stride_alignment);
+  frame_buffer->plane[2] = libgav1::AlignAddr(v_buffer + info->uv_plane_offset,
+                                              info->stride_alignment);
+  frame_buffer->stride[0] = info->y_stride;
+  frame_buffer->stride[1] = frame_buffer->stride[2] = info->uv_stride;
+  frame_buffer->private_data = buffer_private_data;
+  return kLibgav1StatusOk;
+}
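+
+// A minimal usage sketch of the two functions above (illustrative only;
+// allocation failures and cleanup are elided):
+//
+//   Libgav1FrameBufferInfo info;
+//   if (Libgav1ComputeFrameBufferInfo(8, kLibgav1ImageFormatYuv420, 1920,
+//                                     1080, 8, 8, 8, 8, 16,
+//                                     &info) != kLibgav1StatusOk) return;
+//   uint8_t* const y = new uint8_t[info.y_buffer_size];
+//   uint8_t* const u = new uint8_t[info.uv_buffer_size];
+//   uint8_t* const v = new uint8_t[info.uv_buffer_size];
+//   Libgav1FrameBuffer frame_buffer;
+//   Libgav1SetFrameBuffer(&info, y, u, v, /*buffer_private_data=*/nullptr,
+//                         &frame_buffer);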
+
+}  // extern "C"
diff --git a/src/frame_buffer_utils.h b/src/frame_buffer_utils.h
new file mode 100644 (file)
index 0000000..d41437e
--- /dev/null
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+#define LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// The following table is from Section 6.4.2 of the spec.
+//
+// subsampling_x  subsampling_y  mono_chrome  Description
+// -----------------------------------------------------------
+// 0              0              0            YUV 4:4:4
+// 1              0              0            YUV 4:2:2
+// 1              1              0            YUV 4:2:0
+// 1              1              1            Monochrome 4:0:0
+
+inline Libgav1ImageFormat ComposeImageFormat(bool is_monochrome,
+                                             int8_t subsampling_x,
+                                             int8_t subsampling_y) {
+  Libgav1ImageFormat image_format;
+  if (subsampling_x == 0) {
+    assert(subsampling_y == 0 && !is_monochrome);
+    image_format = kLibgav1ImageFormatYuv444;
+  } else if (subsampling_y == 0) {
+    assert(!is_monochrome);
+    image_format = kLibgav1ImageFormatYuv422;
+  } else if (!is_monochrome) {
+    image_format = kLibgav1ImageFormatYuv420;
+  } else {
+    image_format = kLibgav1ImageFormatMonochrome400;
+  }
+  return image_format;
+}
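+
+// For example, ComposeImageFormat(/*is_monochrome=*/false, 1, 1) returns
+// kLibgav1ImageFormatYuv420, matching the 4:2:0 row of the table above.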
+
+inline void DecomposeImageFormat(Libgav1ImageFormat image_format,
+                                 bool* is_monochrome, int8_t* subsampling_x,
+                                 int8_t* subsampling_y) {
+  *is_monochrome = false;
+  *subsampling_x = 1;
+  *subsampling_y = 1;
+  switch (image_format) {
+    case kLibgav1ImageFormatYuv420:
+      break;
+    case kLibgav1ImageFormatYuv422:
+      *subsampling_y = 0;
+      break;
+    case kLibgav1ImageFormatYuv444:
+      *subsampling_x = *subsampling_y = 0;
+      break;
+    default:
+      assert(image_format == kLibgav1ImageFormatMonochrome400);
+      *is_monochrome = true;
+      break;
+  }
+}
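+
+// For example, DecomposeImageFormat(kLibgav1ImageFormatYuv422, &mono, &sub_x,
+// &sub_y) yields mono=false, sub_x=1, sub_y=0 per the table above.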
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
diff --git a/src/frame_scratch_buffer.h b/src/frame_scratch_buffer.h
new file mode 100644 (file)
index 0000000..1b0d2e0
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+#define LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+
+#include <array>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <new>
+#include <utility>
+
+#include "src/loop_restoration_info.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/threading_strategy.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/memory.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Buffer used to store the unfiltered pixels that are necessary for decoding
+// the next superblock row (for the intra prediction process).
+using IntraPredictionBuffer =
+    std::array<AlignedDynamicBuffer<uint8_t, kMaxAlignment>, kMaxPlanes>;
+
+// Buffer to facilitate decoding a frame. This struct is used only within
+// DecoderImpl::DecodeTiles().
+// The alignment requirement is due to the SymbolDecoderContext member
+// symbol_decoder_context and the TileScratchBufferPool member
+// tile_scratch_buffer_pool.
+struct FrameScratchBuffer : public MaxAlignedAllocable {
+  LoopRestorationInfo loop_restoration_info;
+  Array2D<int8_t> cdef_index;
+  // Encodes the block skip information as a bitmask for the entire frame which
+  // will be used by the cdef process.
+  //
+  // * The size of this array is rows4x4 / 2 * columns4x4 / 16.
+  // * Each row of the bitmasks array (cdef_skip) stores the bitmask for 2 rows
+  // of 4x4 blocks.
+  // * Each entry in the row will store the skip information for 16 4x4 blocks
+  // (8 bits).
+  // * If any of the four 4x4 blocks in the 8x8 block is not a skip block, then
+  // the corresponding bit (as described below) will be set to 1.
+  // * For the 4x4 block at column4x4 the bit index is (column4x4 >> 1).
+  Array2D<uint8_t> cdef_skip;
+  Array2D<TransformSize> inter_transform_sizes;
+  BlockParametersHolder block_parameters_holder;
+  TemporalMotionField motion_field;
+  SymbolDecoderContext symbol_decoder_context;
+  std::unique_ptr<ResidualBufferPool> residual_buffer_pool;
+  // Buffer used to store the cdef borders. This buffer will store 4 rows for
+  // every 64x64 block (4 rows for every 32x32 for chroma with subsampling). The
+  // indices of the rows that are stored are specified in |kCdefBorderRows|.
+  YuvBuffer cdef_border;
+  AlignedDynamicBuffer<uint8_t, 16> superres_coefficients[kNumPlaneTypes];
+  // Buffer used to temporarily store the input row for applying SuperRes.
+  YuvBuffer superres_line_buffer;
+  // Buffer used to store the loop restoration borders. This buffer will store 4
+  // rows for every 64x64 block (4 rows for every 32x32 for chroma with
+  // subsampling). The indices of the rows that are stored are specified in
+  // |kLoopRestorationBorderRows|.
+  YuvBuffer loop_restoration_border;
+  // The size of this dynamic buffer is |tile_rows|.
+  DynamicBuffer<IntraPredictionBuffer> intra_prediction_buffers;
+  TileScratchBufferPool tile_scratch_buffer_pool;
+  ThreadingStrategy threading_strategy;
+  std::mutex superblock_row_mutex;
+  // The size of this buffer is the number of superblock rows.
+  // |superblock_row_progress[i]| is incremented whenever a tile finishes
+  // decoding superblock row at index i. If the count reaches tile_columns, then
+  // |superblock_row_progress_condvar[i]| is notified.
+  DynamicBuffer<int> superblock_row_progress
+      LIBGAV1_GUARDED_BY(superblock_row_mutex);
+  // The size of this buffer is the number of superblock rows. Used to wait for
+  // |superblock_row_progress[i]| to reach tile_columns.
+  DynamicBuffer<std::condition_variable> superblock_row_progress_condvar;
+  // Used to signal tile decoding failure in the combined multithreading mode.
+  bool tile_decoding_failed LIBGAV1_GUARDED_BY(superblock_row_mutex);
+};
+
+class FrameScratchBufferPool {
+ public:
+  std::unique_ptr<FrameScratchBuffer> Get() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (!buffers_.Empty()) {
+      return buffers_.Pop();
+    }
+    lock.unlock();
+    std::unique_ptr<FrameScratchBuffer> scratch_buffer(new (std::nothrow)
+                                                           FrameScratchBuffer);
+    return scratch_buffer;
+  }
+
+  void Release(std::unique_ptr<FrameScratchBuffer> scratch_buffer) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    buffers_.Push(std::move(scratch_buffer));
+  }
+
+ private:
+  std::mutex mutex_;
+  Stack<std::unique_ptr<FrameScratchBuffer>, kMaxThreads> buffers_
+      LIBGAV1_GUARDED_BY(mutex_);
+};
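+
+// A minimal usage sketch (illustrative; this pool is internal to the decoder).
+// Get() pops a previously released buffer when one is cached and falls back to
+// a heap allocation otherwise, so callers must handle a nullptr result:
+//
+//   FrameScratchBufferPool pool;
+//   std::unique_ptr<FrameScratchBuffer> buffer = pool.Get();
+//   if (buffer == nullptr) return;  // Allocation failure.
+//   // ... use |buffer| while decoding one frame ...
+//   pool.Release(std::move(buffer));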
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
diff --git a/src/gav1/decoder.h b/src/gav1/decoder.h
new file mode 100644 (file)
index 0000000..da08da9
--- /dev/null
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_H_
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif  // defined(__cplusplus)
+
+// IWYU pragma: begin_exports
+#include "gav1/decoder_buffer.h"
+#include "gav1/decoder_settings.h"
+#include "gav1/frame_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+#include "gav1/version.h"
+// IWYU pragma: end_exports
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct Libgav1Decoder;
+typedef struct Libgav1Decoder Libgav1Decoder;
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderCreate(
+    const Libgav1DecoderSettings* settings, Libgav1Decoder** decoder_out);
+
+LIBGAV1_PUBLIC void Libgav1DecoderDestroy(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderEnqueueFrame(
+    Libgav1Decoder* decoder, const uint8_t* data, size_t size,
+    int64_t user_private_data, void* buffer_private_data);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderDequeueFrame(
+    Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr);
+
+LIBGAV1_PUBLIC Libgav1StatusCode
+Libgav1DecoderSignalEOS(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC int Libgav1DecoderGetMaxBitdepth(void);
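+
+// A minimal decode sequence using the C interface (a sketch; |data| and |size|
+// stand in for one compressed frame, and error handling is elided):
+//
+//   Libgav1DecoderSettings settings;
+//   Libgav1DecoderSettingsInitDefault(&settings);
+//   Libgav1Decoder* decoder;
+//   if (Libgav1DecoderCreate(&settings, &decoder) != kLibgav1StatusOk) return;
+//   Libgav1DecoderEnqueueFrame(decoder, data, size, /*user_private_data=*/0,
+//                              /*buffer_private_data=*/NULL);
+//   const Libgav1DecoderBuffer* buffer;
+//   if (Libgav1DecoderDequeueFrame(decoder, &buffer) == kLibgav1StatusOk &&
+//       buffer != NULL) {
+//     /* Consume buffer->plane[0] etc. */
+//   }
+//   Libgav1DecoderSignalEOS(decoder);
+//   Libgav1DecoderDestroy(decoder);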
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+namespace libgav1 {
+
+// Forward declaration.
+class DecoderImpl;
+
+class LIBGAV1_PUBLIC Decoder {
+ public:
+  Decoder();
+  ~Decoder();
+
+  // Init must be called exactly once per instance. Subsequent calls will do
+  // nothing. If |settings| is nullptr, the decoder will be initialized with
+  // default settings. Returns kStatusOk on success, an error status otherwise.
+  StatusCode Init(const DecoderSettings* settings);
+
+  // Enqueues a compressed frame to be decoded.
+  //
+  // This function returns:
+  //   * kStatusOk on success
+  //   * kStatusTryAgain if the decoder queue is full
+  //   * an error status otherwise.
+  //
+  // |user_private_data| may be used to associate application-specific private
+  // data with the compressed frame. It will be copied to the user_private_data
+  // field of the DecoderBuffer returned by the corresponding |DequeueFrame()|
+  // call.
+  //
+  // NOTE: |EnqueueFrame()| does not copy the data. Therefore, after a
+  // successful |EnqueueFrame()| call, the caller must keep the |data| buffer
+  // alive until:
+  // 1) If |settings_.release_input_buffer| is not nullptr, then the |data|
+  // buffer must be kept alive until release_input_buffer is called with the
+  // |buffer_private_data| passed into this EnqueueFrame call.
+  // 2) If |settings_.release_input_buffer| is nullptr, then the |data| buffer
+  // must be kept alive until the corresponding DequeueFrame() call is
+  // completed.
+  //
+  // If the call to |EnqueueFrame()| is not successful, then libgav1 will not
+  // hold any references to the |data| buffer. |settings_.release_input_buffer|
+  // callback will not be called in that case.
+  StatusCode EnqueueFrame(const uint8_t* data, size_t size,
+                          int64_t user_private_data, void* buffer_private_data);
+
+  // Dequeues a decompressed frame. If there are enqueued compressed frames,
+  // decodes one and sets |*out_ptr| to the last displayable frame in the
+  // compressed frame. If there are no displayable frames available, sets
+  // |*out_ptr| to nullptr.
+  //
+  // Returns kStatusOk on success. Returns kStatusNothingToDequeue if there are
+  // no enqueued frames (in this case out_ptr will always be set to nullptr).
+  // Returns one of the other error statuses if there is an error.
+  //
+  // If |settings_.blocking_dequeue| is false and the decoder is operating in
+  // frame parallel mode (|settings_.frame_parallel| is true and the video
+  // stream passes the decoder's heuristics for enabling frame parallel mode),
+  // then this call will return kStatusTryAgain if an enqueued frame is not yet
+  // decoded (it is a non-blocking call in this case). In all other cases, this
+  // call will block until an enqueued frame has been decoded.
+  StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
+
+  // Signals the end of stream.
+  //
+  // In non-frame-parallel mode, this function will release all the frames held
+  // by the decoder. If the frame buffers were allocated by libgav1, then the
+  // pointer obtained by the prior DequeueFrame call will no longer be valid. If
+  // the frame buffers were allocated by the application, then any references
+  // that libgav1 is holding on to will be released.
+  //
+  // Once this function returns successfully, the decoder state will be reset
+  // and the decoder is ready to start decoding a new coded video sequence.
+  StatusCode SignalEOS();
+
+  // Returns the maximum bitdepth that is supported by this decoder.
+  static int GetMaxBitdepth();
+
+ private:
+  DecoderSettings settings_;
+  // The object is initialized if and only if impl_ != nullptr.
+  std::unique_ptr<DecoderImpl> impl_;
+};
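+
+// A minimal decode sequence for the C++ interface (a sketch; |data| and |size|
+// stand in for one compressed frame, and error handling is elided):
+//
+//   libgav1::Decoder decoder;
+//   if (decoder.Init(nullptr) != libgav1::kStatusOk) return;
+//   decoder.EnqueueFrame(data, size, /*user_private_data=*/0,
+//                        /*buffer_private_data=*/nullptr);
+//   const libgav1::DecoderBuffer* buffer;
+//   if (decoder.DequeueFrame(&buffer) == libgav1::kStatusOk &&
+//       buffer != nullptr) {
+//     // Consume buffer->plane[i] for i in [0, buffer->NumPlanes()).
+//   }
+//   decoder.SignalEOS();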
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+
+#endif  // LIBGAV1_SRC_GAV1_DECODER_H_
diff --git a/src/gav1/decoder_buffer.h b/src/gav1/decoder_buffer.h
new file mode 100644 (file)
index 0000000..816eca4
--- /dev/null
@@ -0,0 +1,319 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+
+#if defined(__cplusplus)
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif  // defined(__cplusplus)
+
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI.
+
+// The documentation for the enum values in this file can be found in Section
+// 6.4.2 of the AV1 spec.
+
+typedef enum Libgav1ChromaSamplePosition {
+  kLibgav1ChromaSamplePositionUnknown,
+  kLibgav1ChromaSamplePositionVertical,
+  kLibgav1ChromaSamplePositionColocated,
+  kLibgav1ChromaSamplePositionReserved
+} Libgav1ChromaSamplePosition;
+
+typedef enum Libgav1ImageFormat {
+  kLibgav1ImageFormatYuv420,
+  kLibgav1ImageFormatYuv422,
+  kLibgav1ImageFormatYuv444,
+  kLibgav1ImageFormatMonochrome400
+} Libgav1ImageFormat;
+
+typedef enum Libgav1ColorPrimary {
+  // 0 is reserved.
+  kLibgav1ColorPrimaryBt709 = 1,
+  kLibgav1ColorPrimaryUnspecified,
+  // 3 is reserved.
+  kLibgav1ColorPrimaryBt470M = 4,
+  kLibgav1ColorPrimaryBt470Bg,
+  kLibgav1ColorPrimaryBt601,
+  kLibgav1ColorPrimarySmpte240,
+  kLibgav1ColorPrimaryGenericFilm,
+  kLibgav1ColorPrimaryBt2020,
+  kLibgav1ColorPrimaryXyz,
+  kLibgav1ColorPrimarySmpte431,
+  kLibgav1ColorPrimarySmpte432,
+  // 13-21 are reserved.
+  kLibgav1ColorPrimaryEbu3213 = 22,
+  // 23-254 are reserved.
+  kLibgav1MaxColorPrimaries = 255
+} Libgav1ColorPrimary;
+
+typedef enum Libgav1TransferCharacteristics {
+  // 0 is reserved.
+  kLibgav1TransferCharacteristicsBt709 = 1,
+  kLibgav1TransferCharacteristicsUnspecified,
+  // 3 is reserved.
+  kLibgav1TransferCharacteristicsBt470M = 4,
+  kLibgav1TransferCharacteristicsBt470Bg,
+  kLibgav1TransferCharacteristicsBt601,
+  kLibgav1TransferCharacteristicsSmpte240,
+  kLibgav1TransferCharacteristicsLinear,
+  kLibgav1TransferCharacteristicsLog100,
+  kLibgav1TransferCharacteristicsLog100Sqrt10,
+  kLibgav1TransferCharacteristicsIec61966,
+  kLibgav1TransferCharacteristicsBt1361,
+  kLibgav1TransferCharacteristicsSrgb,
+  kLibgav1TransferCharacteristicsBt2020TenBit,
+  kLibgav1TransferCharacteristicsBt2020TwelveBit,
+  kLibgav1TransferCharacteristicsSmpte2084,
+  kLibgav1TransferCharacteristicsSmpte428,
+  kLibgav1TransferCharacteristicsHlg,
+  // 19-254 are reserved.
+  kLibgav1MaxTransferCharacteristics = 255
+} Libgav1TransferCharacteristics;
+
+typedef enum Libgav1MatrixCoefficients {
+  kLibgav1MatrixCoefficientsIdentity,
+  kLibgav1MatrixCoefficientsBt709,
+  kLibgav1MatrixCoefficientsUnspecified,
+  // 3 is reserved.
+  kLibgav1MatrixCoefficientsFcc = 4,
+  kLibgav1MatrixCoefficientsBt470BG,
+  kLibgav1MatrixCoefficientsBt601,
+  kLibgav1MatrixCoefficientsSmpte240,
+  kLibgav1MatrixCoefficientsSmpteYcgco,
+  kLibgav1MatrixCoefficientsBt2020Ncl,
+  kLibgav1MatrixCoefficientsBt2020Cl,
+  kLibgav1MatrixCoefficientsSmpte2085,
+  kLibgav1MatrixCoefficientsChromatNcl,
+  kLibgav1MatrixCoefficientsChromatCl,
+  kLibgav1MatrixCoefficientsIctcp,
+  // 15-254 are reserved.
+  kLibgav1MaxMatrixCoefficients = 255
+} Libgav1MatrixCoefficients;
+
+typedef enum Libgav1ColorRange {
+  // The color ranges are scaled by value << (bitdepth - 8) for 10 and 12-bit
+  // streams.
+  kLibgav1ColorRangeStudio,  // Y [16..235], UV [16..240]
+  kLibgav1ColorRangeFull     // YUV/RGB [0..255]
+} Libgav1ColorRange;
+
+// Section 6.7.3.
+typedef struct Libgav1ObuMetadataHdrCll {  // NOLINT
+  uint16_t max_cll;                        // Maximum content light level.
+  uint16_t max_fall;                       // Maximum frame-average light level.
+} Libgav1ObuMetadataHdrCll;
+
+// Section 6.7.4.
+typedef struct Libgav1ObuMetadataHdrMdcv {  // NOLINT
+  // 0.16 fixed-point X/Y chromaticity coordinate as defined by CIE 1931 in
+  // R/G/B order.
+  uint16_t primary_chromaticity_x[3];
+  uint16_t primary_chromaticity_y[3];
+  // 0.16 fixed-point X/Y chromaticity coordinate as defined by CIE 1931.
+  uint16_t white_point_chromaticity_x;
+  uint16_t white_point_chromaticity_y;
+  // 24.8 fixed-point maximum luminance, represented in candelas per square
+  // meter.
+  uint32_t luminance_max;
+  // 18.14 fixed-point minimum luminance, represented in candelas per square
+  // meter.
+  uint32_t luminance_min;
+} Libgav1ObuMetadataHdrMdcv;
+
+// Section 6.7.2.
+typedef struct Libgav1ObuMetadataItutT35 {  // NOLINT
+  uint8_t country_code;
+  uint8_t country_code_extension_byte;  // Valid if country_code is 0xFF.
+  uint8_t* payload_bytes;
+  int payload_size;
+} Libgav1ObuMetadataItutT35;
+
+typedef struct Libgav1DecoderBuffer {
+#if defined(__cplusplus)
+  LIBGAV1_PUBLIC int NumPlanes() const {
+    return (image_format == kLibgav1ImageFormatMonochrome400) ? 1 : 3;
+  }
+#endif  // defined(__cplusplus)
+
+  Libgav1ChromaSamplePosition chroma_sample_position;
+  Libgav1ImageFormat image_format;
+  Libgav1ColorRange color_range;
+  Libgav1ColorPrimary color_primary;
+  Libgav1TransferCharacteristics transfer_characteristics;
+  Libgav1MatrixCoefficients matrix_coefficients;
+
+  int bitdepth;  // Stored image bitdepth.
+
+  // Image display dimensions in Y/U/V order.
+  int displayed_width[3];   // Displayed image width.
+  int displayed_height[3];  // Displayed image height.
+
+  // Values are given in Y/U/V order.
+  int stride[3];      // The width in bytes of one row of the |plane| buffer.
+                      // This may include padding bytes for alignment or
+                      // internal use by the decoder.
+  uint8_t* plane[3];  // The reconstructed image plane(s).
+
+  // Spatial id of this frame.
+  int spatial_id;
+  // Temporal id of this frame.
+  int temporal_id;
+
+  Libgav1ObuMetadataHdrCll hdr_cll;
+  int has_hdr_cll;  // 1 if the values in hdr_cll are valid for this frame. 0
+                    // otherwise.
+
+  Libgav1ObuMetadataHdrMdcv hdr_mdcv;
+  int has_hdr_mdcv;  // 1 if the values in hdr_mdcv are valid for this frame. 0
+                     // otherwise.
+
+  Libgav1ObuMetadataItutT35 itut_t35;
+  int has_itut_t35;  // 1 if the values in itut_t35 are valid for this frame. 0
+                     // otherwise.
+
+  // The |user_private_data| argument passed to Decoder::EnqueueFrame().
+  int64_t user_private_data;
+  // The |private_data| field of FrameBuffer. Set by the get frame buffer
+  // callback when it allocates a frame buffer.
+  void* buffer_private_data;
+} Libgav1DecoderBuffer;
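+
+// A sketch of walking the reconstructed planes of a dequeued buffer. This
+// assumes an 8-bit stream; for bitdepth > 8 each row holds 16-bit samples, so
+// the pixel pointers would be reinterpreted as uint16_t:
+//
+//   const int num_planes =
+//       (buffer->image_format == kLibgav1ImageFormatMonochrome400) ? 1 : 3;
+//   for (int i = 0; i < num_planes; ++i) {
+//     const uint8_t* row = buffer->plane[i];
+//     for (int y = 0; y < buffer->displayed_height[i]; ++y) {
+//       /* Consume displayed_width[i] pixels starting at |row|. */
+//       row += buffer->stride[i];
+//     }
+//   }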
+
+#if defined(__cplusplus)
+namespace libgav1 {
+
+using ChromaSamplePosition = Libgav1ChromaSamplePosition;
+constexpr ChromaSamplePosition kChromaSamplePositionUnknown =
+    kLibgav1ChromaSamplePositionUnknown;
+constexpr ChromaSamplePosition kChromaSamplePositionVertical =
+    kLibgav1ChromaSamplePositionVertical;
+constexpr ChromaSamplePosition kChromaSamplePositionColocated =
+    kLibgav1ChromaSamplePositionColocated;
+constexpr ChromaSamplePosition kChromaSamplePositionReserved =
+    kLibgav1ChromaSamplePositionReserved;
+
+using ImageFormat = Libgav1ImageFormat;
+constexpr ImageFormat kImageFormatYuv420 = kLibgav1ImageFormatYuv420;
+constexpr ImageFormat kImageFormatYuv422 = kLibgav1ImageFormatYuv422;
+constexpr ImageFormat kImageFormatYuv444 = kLibgav1ImageFormatYuv444;
+constexpr ImageFormat kImageFormatMonochrome400 =
+    kLibgav1ImageFormatMonochrome400;
+
+using ColorPrimary = Libgav1ColorPrimary;
+constexpr ColorPrimary kColorPrimaryBt709 = kLibgav1ColorPrimaryBt709;
+constexpr ColorPrimary kColorPrimaryUnspecified =
+    kLibgav1ColorPrimaryUnspecified;
+constexpr ColorPrimary kColorPrimaryBt470M = kLibgav1ColorPrimaryBt470M;
+constexpr ColorPrimary kColorPrimaryBt470Bg = kLibgav1ColorPrimaryBt470Bg;
+constexpr ColorPrimary kColorPrimaryBt601 = kLibgav1ColorPrimaryBt601;
+constexpr ColorPrimary kColorPrimarySmpte240 = kLibgav1ColorPrimarySmpte240;
+constexpr ColorPrimary kColorPrimaryGenericFilm =
+    kLibgav1ColorPrimaryGenericFilm;
+constexpr ColorPrimary kColorPrimaryBt2020 = kLibgav1ColorPrimaryBt2020;
+constexpr ColorPrimary kColorPrimaryXyz = kLibgav1ColorPrimaryXyz;
+constexpr ColorPrimary kColorPrimarySmpte431 = kLibgav1ColorPrimarySmpte431;
+constexpr ColorPrimary kColorPrimarySmpte432 = kLibgav1ColorPrimarySmpte432;
+constexpr ColorPrimary kColorPrimaryEbu3213 = kLibgav1ColorPrimaryEbu3213;
+constexpr ColorPrimary kMaxColorPrimaries = kLibgav1MaxColorPrimaries;
+
+using TransferCharacteristics = Libgav1TransferCharacteristics;
+constexpr TransferCharacteristics kTransferCharacteristicsBt709 =
+    kLibgav1TransferCharacteristicsBt709;
+constexpr TransferCharacteristics kTransferCharacteristicsUnspecified =
+    kLibgav1TransferCharacteristicsUnspecified;
+constexpr TransferCharacteristics kTransferCharacteristicsBt470M =
+    kLibgav1TransferCharacteristicsBt470M;
+constexpr TransferCharacteristics kTransferCharacteristicsBt470Bg =
+    kLibgav1TransferCharacteristicsBt470Bg;
+constexpr TransferCharacteristics kTransferCharacteristicsBt601 =
+    kLibgav1TransferCharacteristicsBt601;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte240 =
+    kLibgav1TransferCharacteristicsSmpte240;
+constexpr TransferCharacteristics kTransferCharacteristicsLinear =
+    kLibgav1TransferCharacteristicsLinear;
+constexpr TransferCharacteristics kTransferCharacteristicsLog100 =
+    kLibgav1TransferCharacteristicsLog100;
+constexpr TransferCharacteristics kTransferCharacteristicsLog100Sqrt10 =
+    kLibgav1TransferCharacteristicsLog100Sqrt10;
+constexpr TransferCharacteristics kTransferCharacteristicsIec61966 =
+    kLibgav1TransferCharacteristicsIec61966;
+constexpr TransferCharacteristics kTransferCharacteristicsBt1361 =
+    kLibgav1TransferCharacteristicsBt1361;
+constexpr TransferCharacteristics kTransferCharacteristicsSrgb =
+    kLibgav1TransferCharacteristicsSrgb;
+constexpr TransferCharacteristics kTransferCharacteristicsBt2020TenBit =
+    kLibgav1TransferCharacteristicsBt2020TenBit;
+constexpr TransferCharacteristics kTransferCharacteristicsBt2020TwelveBit =
+    kLibgav1TransferCharacteristicsBt2020TwelveBit;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte2084 =
+    kLibgav1TransferCharacteristicsSmpte2084;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte428 =
+    kLibgav1TransferCharacteristicsSmpte428;
+constexpr TransferCharacteristics kTransferCharacteristicsHlg =
+    kLibgav1TransferCharacteristicsHlg;
+constexpr TransferCharacteristics kMaxTransferCharacteristics =
+    kLibgav1MaxTransferCharacteristics;
+
+using MatrixCoefficients = Libgav1MatrixCoefficients;
+constexpr MatrixCoefficients kMatrixCoefficientsIdentity =
+    kLibgav1MatrixCoefficientsIdentity;
+constexpr MatrixCoefficients kMatrixCoefficientsBt709 =
+    kLibgav1MatrixCoefficientsBt709;
+constexpr MatrixCoefficients kMatrixCoefficientsUnspecified =
+    kLibgav1MatrixCoefficientsUnspecified;
+constexpr MatrixCoefficients kMatrixCoefficientsFcc =
+    kLibgav1MatrixCoefficientsFcc;
+constexpr MatrixCoefficients kMatrixCoefficientsBt470BG =
+    kLibgav1MatrixCoefficientsBt470BG;
+constexpr MatrixCoefficients kMatrixCoefficientsBt601 =
+    kLibgav1MatrixCoefficientsBt601;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpte240 =
+    kLibgav1MatrixCoefficientsSmpte240;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpteYcgco =
+    kLibgav1MatrixCoefficientsSmpteYcgco;
+constexpr MatrixCoefficients kMatrixCoefficientsBt2020Ncl =
+    kLibgav1MatrixCoefficientsBt2020Ncl;
+constexpr MatrixCoefficients kMatrixCoefficientsBt2020Cl =
+    kLibgav1MatrixCoefficientsBt2020Cl;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpte2085 =
+    kLibgav1MatrixCoefficientsSmpte2085;
+constexpr MatrixCoefficients kMatrixCoefficientsChromatNcl =
+    kLibgav1MatrixCoefficientsChromatNcl;
+constexpr MatrixCoefficients kMatrixCoefficientsChromatCl =
+    kLibgav1MatrixCoefficientsChromatCl;
+constexpr MatrixCoefficients kMatrixCoefficientsIctcp =
+    kLibgav1MatrixCoefficientsIctcp;
+constexpr MatrixCoefficients kMaxMatrixCoefficients =
+    kLibgav1MaxMatrixCoefficients;
+
+using ColorRange = Libgav1ColorRange;
+constexpr ColorRange kColorRangeStudio = kLibgav1ColorRangeStudio;
+constexpr ColorRange kColorRangeFull = kLibgav1ColorRangeFull;
+
+using ObuMetadataHdrCll = Libgav1ObuMetadataHdrCll;
+using ObuMetadataHdrMdcv = Libgav1ObuMetadataHdrMdcv;
+using ObuMetadataItutT35 = Libgav1ObuMetadataItutT35;
+
+using DecoderBuffer = Libgav1DecoderBuffer;
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+
+#endif  // LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
diff --git a/src/gav1/decoder_settings.h b/src/gav1/decoder_settings.h
new file mode 100644 (file)
index 0000000..7ee487f
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
+#define LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
+
+#if defined(__cplusplus)
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif  // defined(__cplusplus)
+
+#include "gav1/frame_buffer.h"
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI.
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This callback is invoked by the decoder when it is done using an input frame
+// buffer. When frame_parallel is set to true, this callback must not be
+// nullptr. Otherwise, this callback is optional.
+//
+// |buffer_private_data| is the value passed in the EnqueueFrame() call.
+typedef void (*Libgav1ReleaseInputBufferCallback)(void* callback_private_data,
+                                                  void* buffer_private_data);
+
+typedef struct Libgav1DecoderSettings {
+  // Number of threads to use when decoding. Must be greater than 0. The library
+  // will create at most |threads| new threads. Defaults to 1 (no new threads
+  // will be created).
+  int threads;
+  // A boolean. Indicates to the decoder that frame parallel decoding is
+  // allowed. Note that this is just a request and the decoder will decide the
+  // number of frames to be decoded in parallel based on the video stream being
+  // decoded.
+  int frame_parallel;
+  // A boolean. In frame parallel mode, should Libgav1DecoderDequeueFrame wait
+  // until an enqueued frame is available for dequeueing.
+  //
+  // If frame_parallel is 0, this setting is ignored.
+  int blocking_dequeue;
+  // Called when the first sequence header or a sequence header with a
+  // different frame size (which includes bitdepth, monochrome, subsampling_x,
+  // subsampling_y, maximum frame width, or maximum frame height) is received.
+  Libgav1FrameBufferSizeChangedCallback on_frame_buffer_size_changed;
+  // Get frame buffer callback.
+  Libgav1GetFrameBufferCallback get_frame_buffer;
+  // Release frame buffer callback.
+  Libgav1ReleaseFrameBufferCallback release_frame_buffer;
+  // Release input frame buffer callback. This callback must be set when
+  // |frame_parallel| is true.
+  Libgav1ReleaseInputBufferCallback release_input_buffer;
+  // Passed as the private_data argument to the callbacks.
+  void* callback_private_data;
+  // A boolean. If set to 1, the decoder will output all the spatial and
+  // temporal layers.
+  int output_all_layers;
+  // Index of the operating point to decode.
+  int operating_point;
+  // Mask indicating the post processing filters that need to be applied to the
+  // reconstructed frame. Note this is an advanced setting and does not
+  // typically need to be changed.
+  // From LSB:
+  //   Bit 0: Loop filter (deblocking filter).
+  //   Bit 1: Cdef.
+  //   Bit 2: SuperRes.
+  //   Bit 3: Loop restoration.
+  //   Bit 4: Film grain synthesis.
+  //   All the bits other than the last 5 are ignored.
+  uint8_t post_filter_mask;
+} Libgav1DecoderSettings;
+
+LIBGAV1_PUBLIC void Libgav1DecoderSettingsInitDefault(
+    Libgav1DecoderSettings* settings);
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+namespace libgav1 {
+
+using ReleaseInputBufferCallback = Libgav1ReleaseInputBufferCallback;
+
+// Applications must populate this structure before creating a decoder instance.
+struct DecoderSettings {
+  // Number of threads to use when decoding. Must be greater than 0. The library
+  // will create at most |threads| new threads. Defaults to 1 (no new threads
+  // will be created).
+  int threads = 1;
+  // Indicates to the decoder that frame parallel decoding is allowed. Note
+  // that this is just a request and the decoder will decide the number of
+  // frames to be decoded in parallel based on the video stream being decoded.
+  bool frame_parallel = false;
+  // In frame parallel mode, should DequeueFrame wait until an enqueued frame
+  // is available for dequeueing.
+  //
+  // If frame_parallel is false, this setting is ignored.
+  bool blocking_dequeue = false;
+  // Called when the first sequence header or a sequence header with a
+  // different frame size (which includes bitdepth, monochrome, subsampling_x,
+  // subsampling_y, maximum frame width, or maximum frame height) is received.
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+  // Get frame buffer callback.
+  GetFrameBufferCallback get_frame_buffer = nullptr;
+  // Release frame buffer callback.
+  ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+  // Release input frame buffer callback. This callback must be set when
+  // |frame_parallel| is true.
+  ReleaseInputBufferCallback release_input_buffer = nullptr;
+  // Passed as the private_data argument to the callbacks.
+  void* callback_private_data = nullptr;
+  // If set to true, the decoder will output all the spatial and temporal
+  // layers.
+  bool output_all_layers = false;
+  // Index of the operating point to decode.
+  int operating_point = 0;
+  // Mask indicating the post processing filters that need to be applied to the
+  // reconstructed frame. Note this is an advanced setting and does not
+  // typically need to be changed.
+  // From LSB:
+  //   Bit 0: Loop filter (deblocking filter).
+  //   Bit 1: Cdef.
+  //   Bit 2: SuperRes.
+  //   Bit 3: Loop restoration.
+  //   Bit 4: Film grain synthesis.
+  //   All the bits other than the last 5 are ignored.
+  uint8_t post_filter_mask = 0x1f;
+};
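+
+// A configuration sketch (illustrative): allow two new threads and skip film
+// grain synthesis by clearing bit 4 of the post filter mask.
+//
+//   libgav1::DecoderSettings settings;
+//   settings.threads = 2;
+//   settings.post_filter_mask = 0x0f;  // Bits 0-3 set: everything but bit 4.
+//   libgav1::Decoder decoder;
+//   if (decoder.Init(&settings) != libgav1::kStatusOk) return;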
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+#endif  // LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
diff --git a/src/gav1/frame_buffer.h b/src/gav1/frame_buffer.h
new file mode 100644 (file)
index 0000000..8132b61
--- /dev/null
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+
+// All the declarations in this file are part of the public ABI. This file may
+// be included by both C and C++ files.
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif  // defined(__cplusplus)
+
+#include "gav1/decoder_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+
+// The callback functions use the C linkage conventions.
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This structure represents an allocated frame buffer.
+typedef struct Libgav1FrameBuffer {
+  // In the |plane| and |stride| arrays, the elements at indexes 0, 1, and 2
+  // are for the Y, U, and V planes, respectively.
+  uint8_t* plane[3];   // Pointers to the frame (excluding the borders) in the
+                       // data buffers.
+  int stride[3];       // Row strides in bytes.
+  void* private_data;  // Frame buffer's private data. Available for use by the
+                       // release frame buffer callback. Also copied to the
+                       // |buffer_private_data| field of DecoderBuffer for use
+                       // by the consumer of a DecoderBuffer.
+} Libgav1FrameBuffer;
+
+// This callback is invoked by the decoder to provide information on the
+// subsequent frames in the video, until the next invocation of this callback
+// or the end of the video.
+//
+// |width| and |height| are the maximum frame width and height in pixels.
+// |left_border|, |right_border|, |top_border|, and |bottom_border| are the
+// maximum left, right, top, and bottom border sizes in pixels.
+// |stride_alignment| specifies the alignment of the row stride in bytes.
+//
+// Returns kLibgav1StatusOk on success, an error status on failure.
+//
+// NOTE: This callback may be omitted if the information is not useful to the
+// application.
+typedef Libgav1StatusCode (*Libgav1FrameBufferSizeChangedCallback)(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment);
+
+// This callback is invoked by the decoder to allocate a frame buffer, which
+// consists of three data buffers, for the Y, U, and V planes, respectively.
+//
+// The callback must set |frame_buffer->plane[i]| to point to the data buffers
+// of the planes, and set |frame_buffer->stride[i]| to the row strides of the
+// planes. If |image_format| is kLibgav1ImageFormatMonochrome400, the callback
+// should set |frame_buffer->plane[1]| and |frame_buffer->plane[2]| to a null
+// pointer and set |frame_buffer->stride[1]| and |frame_buffer->stride[2]| to
+// 0. The callback may set |frame_buffer->private_data| to a value that will
+// be useful to the release frame buffer callback and the consumer of a
+// DecoderBuffer.
+//
+// Returns kLibgav1StatusOk on success, an error status on failure.
+
+// |width| and |height| are the frame width and height in pixels.
+// |left_border|, |right_border|, |top_border|, and |bottom_border| are the
+// left, right, top, and bottom border sizes in pixels. |stride_alignment|
+// specifies the alignment of the row stride in bytes.
+typedef Libgav1StatusCode (*Libgav1GetFrameBufferCallback)(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer);
+
+// After a frame buffer is allocated, the decoder starts to write decoded video
+// to the frame buffer. When the frame buffer is ready for consumption, it is
+// made available to the application in a Decoder::DequeueFrame() call.
+// Afterwards, the decoder may continue to use the frame buffer in read-only
+// mode. When the decoder is finished using the frame buffer, it notifies the
+// application by calling the Libgav1ReleaseFrameBufferCallback.
+
+// This callback is invoked by the decoder to release a frame buffer.
+typedef void (*Libgav1ReleaseFrameBufferCallback)(void* callback_private_data,
+                                                  void* buffer_private_data);
+
+// Libgav1ComputeFrameBufferInfo() and Libgav1SetFrameBuffer() are intended to
+// help clients implement frame buffer callbacks using memory buffers. First,
+// call Libgav1ComputeFrameBufferInfo(). If it succeeds, allocate y_buffer of
+// size info.y_buffer_size and allocate u_buffer and v_buffer, both of size
+// info.uv_buffer_size. Finally, pass y_buffer, u_buffer, v_buffer, and
+// buffer_private_data to Libgav1SetFrameBuffer().
+
+// This structure contains information useful for allocating memory for a frame
+// buffer.
+typedef struct Libgav1FrameBufferInfo {
+  size_t y_buffer_size;   // Size in bytes of the Y buffer.
+  size_t uv_buffer_size;  // Size in bytes of the U or V buffer.
+
+  // The following fields are consumed by Libgav1SetFrameBuffer(). Do not use
+  // them directly.
+  int y_stride;            // Row stride in bytes of the Y buffer.
+  int uv_stride;           // Row stride in bytes of the U or V buffer.
+  size_t y_plane_offset;   // Offset in bytes of the frame (excluding the
+                           // borders) in the Y buffer.
+  size_t uv_plane_offset;  // Offset in bytes of the frame (excluding the
+                           // borders) in the U or V buffer.
+  int stride_alignment;    // The stride_alignment argument passed to
+                           // Libgav1ComputeFrameBufferInfo().
+} Libgav1FrameBufferInfo;
+
+// Computes the information useful for allocating memory for a frame buffer.
+// On success, stores the output in |info|.
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1ComputeFrameBufferInfo(
+    int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+    int left_border, int right_border, int top_border, int bottom_border,
+    int stride_alignment, Libgav1FrameBufferInfo* info);
+
+// Sets the |frame_buffer| struct.
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1SetFrameBuffer(
+    const Libgav1FrameBufferInfo* info, uint8_t* y_buffer, uint8_t* u_buffer,
+    uint8_t* v_buffer, void* buffer_private_data,
+    Libgav1FrameBuffer* frame_buffer);
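+
+// A sketch of the sequence described above, written inside a hypothetical get
+// frame buffer callback (the parameter names mirror the callback signature;
+// allocation and error handling are application specific):
+//
+//   Libgav1FrameBufferInfo info;
+//   Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo(
+//       bitdepth, image_format, width, height, left_border, right_border,
+//       top_border, bottom_border, stride_alignment, &info);
+//   if (status != kLibgav1StatusOk) return status;
+//   uint8_t* const y_buffer = (uint8_t*)malloc(info.y_buffer_size);
+//   uint8_t* const u_buffer = (uint8_t*)malloc(info.uv_buffer_size);
+//   uint8_t* const v_buffer = (uint8_t*)malloc(info.uv_buffer_size);
+//   if (!y_buffer || !u_buffer || !v_buffer) return kLibgav1StatusOutOfMemory;
+//   return Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer,
+//                                buffer_private_data, frame_buffer);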
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+// Declare type aliases for C++.
+namespace libgav1 {
+
+using FrameBuffer = Libgav1FrameBuffer;
+using FrameBufferSizeChangedCallback = Libgav1FrameBufferSizeChangedCallback;
+using GetFrameBufferCallback = Libgav1GetFrameBufferCallback;
+using ReleaseFrameBufferCallback = Libgav1ReleaseFrameBufferCallback;
+using FrameBufferInfo = Libgav1FrameBufferInfo;
+
+inline StatusCode ComputeFrameBufferInfo(int bitdepth, ImageFormat image_format,
+                                         int width, int height, int left_border,
+                                         int right_border, int top_border,
+                                         int bottom_border,
+                                         int stride_alignment,
+                                         FrameBufferInfo* info) {
+  return Libgav1ComputeFrameBufferInfo(bitdepth, image_format, width, height,
+                                       left_border, right_border, top_border,
+                                       bottom_border, stride_alignment, info);
+}
+
+inline StatusCode SetFrameBuffer(const FrameBufferInfo* info, uint8_t* y_buffer,
+                                 uint8_t* u_buffer, uint8_t* v_buffer,
+                                 void* buffer_private_data,
+                                 FrameBuffer* frame_buffer) {
+  return Libgav1SetFrameBuffer(info, y_buffer, u_buffer, v_buffer,
+                               buffer_private_data, frame_buffer);
+}
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+
+#endif  // LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
diff --git a/src/gav1/status_code.h b/src/gav1/status_code.h
new file mode 100644 (file)
index 0000000..d7476ca
--- /dev/null
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_STATUS_CODE_H_
+#define LIBGAV1_SRC_GAV1_STATUS_CODE_H_
+
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI. This file may
+// be included by both C and C++ files.
+
+// The Libgav1StatusCode enum type: A libgav1 function may return
+// Libgav1StatusCode to indicate success or the reason for failure.
+typedef enum {
+  // Success.
+  kLibgav1StatusOk = 0,
+
+  // An unknown error. Used as the default error status if error detail is not
+  // available.
+  kLibgav1StatusUnknownError = -1,
+
+  // An invalid function argument.
+  kLibgav1StatusInvalidArgument = -2,
+
+  // Memory allocation failure.
+  kLibgav1StatusOutOfMemory = -3,
+
+  // Ran out of a resource (other than memory).
+  kLibgav1StatusResourceExhausted = -4,
+
+  // The object is not initialized.
+  kLibgav1StatusNotInitialized = -5,
+
+  // An operation that can only be performed once has already been performed.
+  kLibgav1StatusAlready = -6,
+
+  // Not implemented, or not supported.
+  kLibgav1StatusUnimplemented = -7,
+
+  // An internal error in libgav1. Usually this indicates a programming error.
+  kLibgav1StatusInternalError = -8,
+
+  // The bitstream is not encoded correctly or violates a bitstream conformance
+  // requirement.
+  kLibgav1StatusBitstreamError = -9,
+
+  // The operation is not allowed at the moment. This is not a fatal error. Try
+  // again later.
+  kLibgav1StatusTryAgain = -10,
+
+  // Used only by DequeueFrame(). There are no enqueued frames, so there is
+  // nothing to dequeue. This is not a fatal error. Try enqueuing a frame before
+  // trying to dequeue again.
+  kLibgav1StatusNothingToDequeue = -11,
+
+  // An extra enumerator to prevent people from writing code that fails to
+  // compile when a new status code is added.
+  //
+  // Do not reference this enumerator. In particular, if you write code that
+  // switches on Libgav1StatusCode, add a default: case instead of a case that
+  // mentions this enumerator.
+  //
+  // Do not depend on the value (currently -1000) listed here. It may change in
+  // the future.
+  kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_ = -1000
+} Libgav1StatusCode;
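+
+// As the note above suggests, application code that switches on
+// Libgav1StatusCode should use a default case instead of naming the reserved
+// enumerator (a sketch):
+//
+//   switch (status) {
+//     case kLibgav1StatusOk:
+//       break;
+//     case kLibgav1StatusTryAgain:
+//       /* Retry later. */
+//       break;
+//     default:
+//       fprintf(stderr, "libgav1: %s\n", Libgav1GetErrorString(status));
+//       break;
+//   }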
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Returns a human-readable error string in en-US for the status code |status|.
+// Always returns a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetErrorString(Libgav1StatusCode status);
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+namespace libgav1 {
+
+// Declare type aliases for C++.
+using StatusCode = Libgav1StatusCode;
+constexpr StatusCode kStatusOk = kLibgav1StatusOk;
+constexpr StatusCode kStatusUnknownError = kLibgav1StatusUnknownError;
+constexpr StatusCode kStatusInvalidArgument = kLibgav1StatusInvalidArgument;
+constexpr StatusCode kStatusOutOfMemory = kLibgav1StatusOutOfMemory;
+constexpr StatusCode kStatusResourceExhausted = kLibgav1StatusResourceExhausted;
+constexpr StatusCode kStatusNotInitialized = kLibgav1StatusNotInitialized;
+constexpr StatusCode kStatusAlready = kLibgav1StatusAlready;
+constexpr StatusCode kStatusUnimplemented = kLibgav1StatusUnimplemented;
+constexpr StatusCode kStatusInternalError = kLibgav1StatusInternalError;
+constexpr StatusCode kStatusBitstreamError = kLibgav1StatusBitstreamError;
+constexpr StatusCode kStatusTryAgain = kLibgav1StatusTryAgain;
+constexpr StatusCode kStatusNothingToDequeue = kLibgav1StatusNothingToDequeue;
+
+// Returns a human-readable error string in en-US for the status code |status|.
+// Always returns a valid (non-NULL) string.
+inline const char* GetErrorString(StatusCode status) {
+  return Libgav1GetErrorString(status);
+}
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+
+#endif  // LIBGAV1_SRC_GAV1_STATUS_CODE_H_
diff --git a/src/gav1/symbol_visibility.h b/src/gav1/symbol_visibility.h
new file mode 100644 (file)
index 0000000..116a514
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+#define LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+
+// This module defines the LIBGAV1_PUBLIC macro. LIBGAV1_PUBLIC, when combined
+// with the flags -fvisibility=hidden and -fvisibility-inlines-hidden, restricts
+// symbol availability when users use the shared object form of libgav1. The
+// intent is to prevent exposure of libgav1 internals to users of the library,
+// and to avoid ABI compatibility problems that changes to libgav1 internals
+// would cause for users of the libgav1 shared object.
+//
+// Examples:
+//
+// This form makes a class and all of its members part of the public API:
+//
+// class LIBGAV1_PUBLIC A {
+//  public:
+//   A();
+//   ~A();
+//   void Foo();
+//   int Bar();
+// };
+//
+// A::A(), A::~A(), A::Foo(), and A::Bar() are all available to code linking to
+// the shared object when this form is used.
+//
+// This form exposes a single class method as part of the public API:
+//
+// class B {
+//  public:
+//   B();
+//   ~B();
+//   LIBGAV1_PUBLIC int Foo();
+// };
+//
+// In this example only B::Foo() is available to the user of the shared object.
+//
+// Non-class member functions can also be exposed individually:
+//
+// LIBGAV1_PUBLIC void Bar();
+//
+// In this example Bar() would be available to users of the shared object.
+//
+// Much of the above information and more can be found at
+// https://gcc.gnu.org/wiki/Visibility
+//
+// NOTE: A third-party build system for libgav1 can add -DLIBGAV1_PUBLIC= to the
+// compiler command line to override the definition of LIBGAV1_PUBLIC in this
+// header. This can be used to create a libgav1 static library that will not
+// export any symbols when it is linked into a shared library.
+
+#if !defined(LIBGAV1_PUBLIC)
+#if defined(_WIN32)
+#if defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#if defined(__GNUC__)
+#define LIBGAV1_PUBLIC __attribute__((dllexport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllexport)
+#endif  // defined(__GNUC__)
+#elif defined(LIBGAV1_BUILDING_DLL)
+#ifdef __GNUC__
+#define LIBGAV1_PUBLIC __attribute__((dllimport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllimport)
+#endif  // defined(__GNUC__)
+#else
+#define LIBGAV1_PUBLIC
+#endif  // defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#else   // !defined(_WIN32)
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define LIBGAV1_PUBLIC __attribute__((visibility("default")))
+#else
+#define LIBGAV1_PUBLIC
+#endif
+#endif  // defined(_WIN32)
+#endif  // defined(LIBGAV1_PUBLIC)
+
+#endif  // LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
diff --git a/src/gav1/version.h b/src/gav1/version.h
new file mode 100644 (file)
index 0000000..cca2383
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_VERSION_H_
+#define LIBGAV1_SRC_GAV1_VERSION_H_
+
+#include "gav1/symbol_visibility.h"
+
+// This library follows the principles described by Semantic Versioning
+// (https://semver.org).
+
+#define LIBGAV1_MAJOR_VERSION 0
+#define LIBGAV1_MINOR_VERSION 19
+#define LIBGAV1_PATCH_VERSION 0
+
+#define LIBGAV1_VERSION                                           \
+  ((LIBGAV1_MAJOR_VERSION << 16) | (LIBGAV1_MINOR_VERSION << 8) | \
+   LIBGAV1_PATCH_VERSION)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch. e.g., 1.2.3 is 0x010203.
+LIBGAV1_PUBLIC int Libgav1GetVersion(void);
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetVersionString(void);
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetBuildConfiguration(void);
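+
+// A sketch of a runtime check that the loaded library is at least as new as
+// the headers the application was built against (valid because each version
+// component occupies its own 8 bits in the packed value):
+//
+//   if (Libgav1GetVersion() < LIBGAV1_VERSION) {
+//     /* The runtime library is older than the compile-time headers. */
+//   }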
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+namespace libgav1 {
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch. e.g., 1.2.3 is 0x010203.
+inline int GetVersion() { return Libgav1GetVersion(); }
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+inline const char* GetVersionString() { return Libgav1GetVersionString(); }
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
+inline const char* GetBuildConfiguration() {
+  return Libgav1GetBuildConfiguration();
+}
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+
+#endif  // LIBGAV1_SRC_GAV1_VERSION_H_
diff --git a/src/inter_intra_masks.inc b/src/inter_intra_masks.inc
new file mode 100644 (file)
index 0000000..2c15f9c
--- /dev/null
@@ -0,0 +1,581 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the inter intra masks
+// from the code where it is used.
+
+// The tables in this file are computed based on section 7.11.3.13 in the spec.
+
+constexpr uint8_t kInterIntraMaskDc[] = {
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
+
+constexpr uint8_t kInterIntraMaskVertical4x4[] = {
+    60, 60, 60, 60, 19, 19, 19, 19, 6, 6, 6, 6, 2, 2, 2, 2};
+constexpr uint8_t kInterIntraMaskVertical4x8[] = {
+    60, 60, 60, 60, 34, 34, 34, 34, 19, 19, 19, 19, 11, 11, 11, 11,
+    6,  6,  6,  6,  4,  4,  4,  4,  2,  2,  2,  2,  1,  1,  1,  1};
+constexpr uint8_t kInterIntraMaskVertical8x4[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34,
+    19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11};
+constexpr uint8_t kInterIntraMaskVertical8x8[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34,
+    19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11,
+    6,  6,  6,  6,  6,  6,  6,  6,  4,  4,  4,  4,  4,  4,  4,  4,
+    2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1};
+constexpr uint8_t kInterIntraMaskVertical8x16[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34,
+    34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19,
+    19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11, 11, 11, 11, 8,
+    8,  8,  8,  8,  8,  8,  8,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,
+    5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3,
+    3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1};
+constexpr uint8_t kInterIntraMaskVertical16x8[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45,
+    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+    26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+    19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8,  8,
+    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8};
+constexpr uint8_t kInterIntraMaskVertical16x16[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45,
+    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+    26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+    19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8,  8,
+    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  6,  6,  6,  6,  6,
+    6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,
+    5,  5,  5,  5,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+    4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1};
+constexpr uint8_t kInterIntraMaskVertical16x32[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52,
+    52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45,
+    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+    39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+    30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22,
+    22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19,
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17,
+    17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8,  8,  8,  8,
+    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,
+    7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+    6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+    6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1};
+constexpr uint8_t kInterIntraMaskVertical32x16[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52,
+    52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+    52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+    45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+    39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+    30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+    30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22,
+    22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+    22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+    19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+    17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8,  8,  8,  8,  8,  8,  8,  8,
+    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+    8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+    7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7};
+constexpr uint8_t kInterIntraMaskVertical32x32[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52,
+    52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+    52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+    45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+    39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+    30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+    30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22,
+    22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+    22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+    19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+    17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8,  8,  8,  8,  8,  8,  8,  8,
+    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+    8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+    7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,
+    6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+    6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+    6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+    6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+    5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+    4,  4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1};
+
+constexpr uint8_t kInterIntraMaskHorizontal4x4[] = {60, 19, 6, 2, 60, 19, 6, 2,
+                                                    60, 19, 6, 2, 60, 19, 6, 2};
+constexpr uint8_t kInterIntraMaskHorizontal4x8[] = {
+    60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11,
+    60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11};
+constexpr uint8_t kInterIntraMaskHorizontal8x4[] = {
+    60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+    60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1};
+constexpr uint8_t kInterIntraMaskHorizontal8x8[] = {
+    60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+    60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+    60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+    60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1};
+constexpr uint8_t kInterIntraMaskHorizontal8x16[] = {
+    60, 45, 34, 26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11, 8,  60, 45, 34,
+    26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15,
+    11, 8,  60, 45, 34, 26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11, 8,  60,
+    45, 34, 26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11, 8,  60, 45, 34, 26,
+    19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11,
+    8,  60, 45, 34, 26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11, 8,  60, 45,
+    34, 26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11, 8};
+constexpr uint8_t kInterIntraMaskHorizontal16x8[] = {
+    60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34,
+    26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26, 19, 15,
+    11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26, 19, 15, 11, 8,  6,
+    5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,
+    2,  2,  1,  1,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,
+    1,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45,
+    34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1};
+constexpr uint8_t kInterIntraMaskHorizontal16x16[] = {
+    60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34,
+    26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26, 19, 15,
+    11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26, 19, 15, 11, 8,  6,
+    5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,
+    2,  2,  1,  1,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,
+    1,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45,
+    34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26, 19,
+    15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26, 19, 15, 11, 8,
+    6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,
+    3,  2,  2,  1,  1,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,
+    1,  1,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60,
+    45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26,
+    19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26, 19, 15, 11,
+    8,  6,  5,  4,  3,  2,  2,  1,  1};
+constexpr uint8_t kInterIntraMaskHorizontal16x32[] = {
+    60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45,
+    39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30,
+    26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19,
+    17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+    11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+    7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52,
+    45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34,
+    30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22,
+    19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+    13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+    8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60,
+    52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39,
+    34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26,
+    22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+    15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+    10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,
+    60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45,
+    39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30,
+    26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19,
+    17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+    11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+    7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52,
+    45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34,
+    30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22,
+    19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+    13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+    8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7};
+constexpr uint8_t kInterIntraMaskHorizontal32x16[] = {
+    60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,
+    4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30,
+    26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,
+    2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+    11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,
+    1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,
+    5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34,
+    30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,
+    2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+    13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,
+    1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,
+    6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39,
+    34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,
+    2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+    15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,
+    1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,
+    6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45,
+    39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,
+    3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19,
+    17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,
+    1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+    7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52,
+    45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,
+    3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22,
+    19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,
+    1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+    8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1};
+constexpr uint8_t kInterIntraMaskHorizontal32x32[] = {
+    60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,
+    4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30,
+    26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,
+    2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+    11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,
+    1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,
+    5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34,
+    30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,
+    2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+    13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,
+    1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,
+    6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39,
+    34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,
+    2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+    15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,
+    1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,
+    6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45,
+    39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,
+    3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19,
+    17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,
+    1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+    7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52,
+    45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,
+    3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22,
+    19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,
+    1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+    8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60,
+    52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,
+    4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26,
+    22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,
+    2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+    10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,
+    60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,
+    4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30,
+    26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,
+    2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+    11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,
+    1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,
+    5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34,
+    30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,
+    2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+    13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,
+    1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,
+    6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39,
+    34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,
+    2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+    15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,
+    1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,
+    6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45,
+    39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,
+    3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19,
+    17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,
+    1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+    7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1};
+
+constexpr uint8_t kInterIntraMaskSmooth4x4[] = {60, 60, 60, 60, 60, 19, 19, 19,
+                                                60, 19, 6,  6,  60, 19, 6,  2};
+constexpr uint8_t kInterIntraMaskSmooth4x8[] = {
+    60, 60, 60, 60, 60, 34, 34, 34, 60, 34, 19, 19, 60, 34, 19, 11,
+    60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11};
+constexpr uint8_t kInterIntraMaskSmooth8x4[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34,
+    60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11};
+constexpr uint8_t kInterIntraMaskSmooth8x8[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34,
+    60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11,
+    60, 34, 19, 11, 6,  6,  6,  6,  60, 34, 19, 11, 6,  4,  4,  4,
+    60, 34, 19, 11, 6,  4,  2,  2,  60, 34, 19, 11, 6,  4,  2,  1};
+constexpr uint8_t kInterIntraMaskSmooth8x16[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34,
+    34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19,
+    19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 60, 45, 34, 26, 19, 15, 11, 11, 60,
+    45, 34, 26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11, 8,  60, 45, 34, 26,
+    19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11,
+    8,  60, 45, 34, 26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11, 8,  60, 45,
+    34, 26, 19, 15, 11, 8,  60, 45, 34, 26, 19, 15, 11, 8};
+constexpr uint8_t kInterIntraMaskSmooth16x8[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45,
+    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26,
+    26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19,
+    19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45,
+    34, 26, 19, 15, 11, 8,  8,  8,  8,  8,  8,  8,  8,  8};
+constexpr uint8_t kInterIntraMaskSmooth16x16[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45,
+    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26,
+    26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19,
+    19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45,
+    34, 26, 19, 15, 11, 8,  8,  8,  8,  8,  8,  8,  8,  8,  60, 45, 34, 26, 19,
+    15, 11, 8,  6,  6,  6,  6,  6,  6,  6,  6,  60, 45, 34, 26, 19, 15, 11, 8,
+    6,  5,  5,  5,  5,  5,  5,  5,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,
+    4,  4,  4,  4,  4,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  3,  3,
+    3,  3,  60, 45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  2,  2,  60,
+    45, 34, 26, 19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  2,  2,  60, 45, 34, 26,
+    19, 15, 11, 8,  6,  5,  4,  3,  2,  2,  1,  1,  60, 45, 34, 26, 19, 15, 11,
+    8,  6,  5,  4,  3,  2,  2,  1,  1};
+constexpr uint8_t kInterIntraMaskSmooth16x32[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52,
+    52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45,
+    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 52, 45, 39, 39, 39, 39, 39, 39,
+    39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+    30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52,
+    45, 39, 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34,
+    30, 26, 22, 19, 19, 19, 19, 19, 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22,
+    19, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+    15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 13, 13,
+    13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11, 11, 60,
+    52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 60, 52, 45, 39,
+    34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  8,  60, 52, 45, 39, 34, 30, 26,
+    22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+    15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+    10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,
+    60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45,
+    39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30,
+    26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19,
+    17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+    11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+    7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52,
+    45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34,
+    30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22,
+    19, 17, 15, 13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+    13, 11, 10, 8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+    8,  7,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7};
+constexpr uint8_t kInterIntraMaskSmooth32x16[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52,
+    52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+    52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+    45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+    39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30,
+    30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+    30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39,
+    34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+    22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19,
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+    19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17,
+    17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45,
+    39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+    17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52,
+    45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22,
+    19, 17, 15, 13, 11, 10, 8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+    8,  8,  8,  8,  8,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+    8,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7};
+constexpr uint8_t kInterIntraMaskSmooth32x32[] = {
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+    60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52,
+    52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+    52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+    45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+    39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30,
+    30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+    30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39,
+    34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+    22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19,
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+    19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17,
+    17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45,
+    39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+    17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52,
+    45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22,
+    19, 17, 15, 13, 11, 10, 8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+    8,  8,  8,  8,  8,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+    8,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  60,
+    52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  6,  6,
+    6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  60, 52, 45, 39, 34, 30, 26,
+    22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+    6,  6,  6,  6,  6,  6,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+    10, 8,  7,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+    60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,
+    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  60, 52, 45, 39, 34, 30,
+    26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  4,  4,  4,  4,
+    4,  4,  4,  4,  4,  4,  4,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+    11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,
+    5,  4,  4,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  60, 52, 45, 39, 34,
+    30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+    13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,
+    6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  60, 52, 45, 39,
+    34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+    15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,
+    1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,
+    6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45,
+    39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,
+    3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19,
+    17, 15, 13, 11, 10, 8,  7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,
+    1,  1,  1,  1,  60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+    7,  6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1};
+
+// For each 2D array within this array, the indices are mapped as follows:
+// 0, 1, 2 and 3 in each dimension map to prediction dimensions 4, 8, 16 and
+// 32, respectively. For example, the entry at [1][2] corresponds to a
+// prediction size of 8x16 (width == 8 and height == 16). An illustrative
+// lookup sketch follows the array definition below.
+const uint8_t* kInterIntraMasks[kNumInterIntraModes][4][4] = {
+    // kInterIntraModeDc. This is a special case where all the non-nullptr
+    // entries point to kInterIntraMaskDc (all entries of the array are 32). The
+    // width can be set according to the prediction size to achieve the desired
+    // result.
+    {{kInterIntraMaskDc, kInterIntraMaskDc, nullptr, nullptr},
+     {kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc, nullptr},
+     {nullptr, kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc},
+     {nullptr, nullptr, kInterIntraMaskDc, kInterIntraMaskDc}},
+    // kInterIntraModeVertical
+    {{kInterIntraMaskVertical4x4, kInterIntraMaskVertical4x8, nullptr, nullptr},
+     {kInterIntraMaskVertical8x4, kInterIntraMaskVertical8x8,
+      kInterIntraMaskVertical8x16, nullptr},
+     {nullptr, kInterIntraMaskVertical16x8, kInterIntraMaskVertical16x16,
+      kInterIntraMaskVertical16x32},
+     {nullptr, nullptr, kInterIntraMaskVertical32x16,
+      kInterIntraMaskVertical32x32}},
+    // kInterIntraModeHorizontal
+    {{kInterIntraMaskHorizontal4x4, kInterIntraMaskHorizontal4x8, nullptr,
+      nullptr},
+     {kInterIntraMaskHorizontal8x4, kInterIntraMaskHorizontal8x8,
+      kInterIntraMaskHorizontal8x16, nullptr},
+     {nullptr, kInterIntraMaskHorizontal16x8, kInterIntraMaskHorizontal16x16,
+      kInterIntraMaskHorizontal16x32},
+     {nullptr, nullptr, kInterIntraMaskHorizontal32x16,
+      kInterIntraMaskHorizontal32x32}},
+    // kInterIntraModeSmooth
+    {{kInterIntraMaskSmooth4x4, kInterIntraMaskSmooth4x8, nullptr, nullptr},
+     {kInterIntraMaskSmooth8x4, kInterIntraMaskSmooth8x8,
+      kInterIntraMaskSmooth8x16, nullptr},
+     {nullptr, kInterIntraMaskSmooth16x8, kInterIntraMaskSmooth16x16,
+      kInterIntraMaskSmooth16x32},
+     {nullptr, nullptr, kInterIntraMaskSmooth32x16,
+      kInterIntraMaskSmooth32x32}}};
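+
+// Illustrative lookup sketch (an editorial addition, not part of the
+// original file): each table index is log2(dimension) - 2, so a hypothetical
+// helper could fetch a mask as follows. FloorLog2 is assumed to be the
+// helper from src/utils/common.h, and callers must only pass size pairs for
+// which the table holds a non-null entry.
+//
+//   inline const uint8_t* GetInterIntraMaskSketch(InterIntraMode mode,
+//                                                 int width, int height) {
+//     return kInterIntraMasks[mode][FloorLog2(width) - 2]
+//                            [FloorLog2(height) - 2];
+//   }
+//
+// For kInterIntraModeDc this yields kInterIntraMaskDc (a constant 32, an
+// equal-weight blend) for every valid size. The values are consistent with
+// AV1's 6-bit (0..64) mask scale, with larger weights favoring the intra
+// prediction near the intra boundary.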
diff --git a/src/internal_frame_buffer_list.cc b/src/internal_frame_buffer_list.cc
new file mode 100644 (file)
index 0000000..e2d2273
--- /dev/null
@@ -0,0 +1,122 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/internal_frame_buffer_list.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+extern "C" {
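+
+// Each function below is a C-linkage trampoline: it recovers the
+// InternalFrameBufferList from |callback_private_data| and forwards the call
+// to the corresponding member function.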
+
+Libgav1StatusCode OnInternalFrameBufferSizeChanged(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment) {
+  auto* buffer_list =
+      static_cast<InternalFrameBufferList*>(callback_private_data);
+  return buffer_list->OnFrameBufferSizeChanged(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment);
+}
+
+Libgav1StatusCode GetInternalFrameBuffer(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+  auto* buffer_list =
+      static_cast<InternalFrameBufferList*>(callback_private_data);
+  return buffer_list->GetFrameBuffer(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void ReleaseInternalFrameBuffer(void* callback_private_data,
+                                void* buffer_private_data) {
+  auto* buffer_list =
+      static_cast<InternalFrameBufferList*>(callback_private_data);
+  buffer_list->ReleaseFrameBuffer(buffer_private_data);
+}
+
+}  // extern "C"
+
+StatusCode InternalFrameBufferList::OnFrameBufferSizeChanged(
+    int /*bitdepth*/, Libgav1ImageFormat /*image_format*/, int /*width*/,
+    int /*height*/, int /*left_border*/, int /*right_border*/,
+    int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/) {
+  return kStatusOk;
+}
+
+StatusCode InternalFrameBufferList::GetFrameBuffer(
+    int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+    int left_border, int right_border, int top_border, int bottom_border,
+    int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+  FrameBufferInfo info;
+  StatusCode status = ComputeFrameBufferInfo(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment, &info);
+  if (status != kStatusOk) return status;
+
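+  // Reject sizes whose total (y_buffer_size + 2 * uv_buffer_size) would
+  // overflow size_t before computing the required allocation size.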
+  if (info.uv_buffer_size > SIZE_MAX / 2 ||
+      info.y_buffer_size > SIZE_MAX - 2 * info.uv_buffer_size) {
+    return kStatusInvalidArgument;
+  }
+  const size_t min_size = info.y_buffer_size + 2 * info.uv_buffer_size;
+
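+  // Reuse the first buffer that is not currently in use; append a new slot
+  // to the list only if every existing buffer is busy.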
+  Buffer* buffer = nullptr;
+  for (auto& buffer_ptr : buffers_) {
+    if (!buffer_ptr->in_use) {
+      buffer = buffer_ptr.get();
+      break;
+    }
+  }
+  if (buffer == nullptr) {
+    std::unique_ptr<Buffer> new_buffer(new (std::nothrow) Buffer);
+    if (new_buffer == nullptr || !buffers_.push_back(std::move(new_buffer))) {
+      return kStatusOutOfMemory;
+    }
+    buffer = buffers_.back().get();
+  }
+
+  if (buffer->size < min_size) {
+    std::unique_ptr<uint8_t[], MallocDeleter> new_data(
+        static_cast<uint8_t*>(malloc(min_size)));
+    if (new_data == nullptr) return kStatusOutOfMemory;
+    buffer->data = std::move(new_data);
+    buffer->size = min_size;
+  }
+
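+  // Carve the three planes out of the single allocation: Y first, then U and
+  // V. Monochrome streams (uv_buffer_size == 0) get null chroma pointers.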
+  uint8_t* const y_buffer = buffer->data.get();
+  uint8_t* const u_buffer =
+      (info.uv_buffer_size == 0) ? nullptr : y_buffer + info.y_buffer_size;
+  uint8_t* const v_buffer =
+      (info.uv_buffer_size == 0) ? nullptr : u_buffer + info.uv_buffer_size;
+  status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer, buffer,
+                                 frame_buffer);
+  if (status != kStatusOk) return status;
+  buffer->in_use = true;
+  return kStatusOk;
+}
+
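+// Marks the buffer as free for reuse. The underlying allocation is kept, so
+// a later GetFrameBuffer request of equal or smaller size avoids a new
+// malloc.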
+void InternalFrameBufferList::ReleaseFrameBuffer(void* buffer_private_data) {
+  auto* const buffer = static_cast<Buffer*>(buffer_private_data);
+  buffer->in_use = false;
+}
+
+}  // namespace libgav1
diff --git a/src/internal_frame_buffer_list.h b/src/internal_frame_buffer_list.h
new file mode 100644 (file)
index 0000000..1c50b48
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
+#define LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "src/gav1/frame_buffer.h"
+#include "src/utils/memory.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+extern "C" Libgav1StatusCode OnInternalFrameBufferSizeChanged(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment);
+
+extern "C" Libgav1StatusCode GetInternalFrameBuffer(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer);
+
+extern "C" void ReleaseInternalFrameBuffer(void* callback_private_data,
+                                           void* buffer_private_data);
+
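+// Usage sketch (an editorial addition, not part of the original header):
+// these callbacks are designed to be installed on the decoder settings with
+// the list itself as the private data, e.g. (field names assumed to follow
+// src/gav1/decoder_settings.h):
+//
+//   InternalFrameBufferList buffer_list;
+//   DecoderSettings settings;
+//   settings.on_frame_buffer_size_changed = OnInternalFrameBufferSizeChanged;
+//   settings.get_frame_buffer = GetInternalFrameBuffer;
+//   settings.release_frame_buffer = ReleaseInternalFrameBuffer;
+//   settings.callback_private_data = &buffer_list;
+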
+class InternalFrameBufferList : public Allocable {
+ public:
+  InternalFrameBufferList() = default;
+
+  // Not copyable or movable.
+  InternalFrameBufferList(const InternalFrameBufferList&) = delete;
+  InternalFrameBufferList& operator=(const InternalFrameBufferList&) = delete;
+
+  ~InternalFrameBufferList() = default;
+
+  Libgav1StatusCode OnFrameBufferSizeChanged(int bitdepth,
+                                             Libgav1ImageFormat image_format,
+                                             int width, int height,
+                                             int left_border, int right_border,
+                                             int top_border, int bottom_border,
+                                             int stride_alignment);
+
+  Libgav1StatusCode GetFrameBuffer(int bitdepth,
+                                   Libgav1ImageFormat image_format, int width,
+                                   int height, int left_border,
+                                   int right_border, int top_border,
+                                   int bottom_border, int stride_alignment,
+                                   Libgav1FrameBuffer* frame_buffer);
+
+  void ReleaseFrameBuffer(void* buffer_private_data);
+
+ private:
+  struct Buffer : public Allocable {
+    std::unique_ptr<uint8_t[], MallocDeleter> data;
+    size_t size = 0;
+    bool in_use = false;
+  };
+
+  Vector<std::unique_ptr<Buffer>> buffers_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
diff --git a/src/internal_frame_buffer_list_test.cc b/src/internal_frame_buffer_list_test.cc
new file mode 100644 (file)
index 0000000..21f1162
--- /dev/null
@@ -0,0 +1,158 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/internal_frame_buffer_list.h"
+
+#include <cstdint>
+
+#include "gtest/gtest.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+class InternalFrameBufferListTest : public testing::Test {
+ protected:
+  static constexpr int kBufferListSize = 10;
+
+  InternalFrameBufferListTest() {
+    on_frame_buffer_size_changed_ = OnInternalFrameBufferSizeChanged;
+    get_frame_buffer_ = GetInternalFrameBuffer;
+    release_frame_buffer_ = ReleaseInternalFrameBuffer;
+    callback_private_data_ = &buffer_list_;
+  }
+
+  // Frame buffer callbacks.
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed_;
+  GetFrameBufferCallback get_frame_buffer_;
+  ReleaseFrameBufferCallback release_frame_buffer_;
+  // Private data associated with the frame buffer callbacks.
+  void* callback_private_data_;
+
+ private:
+  InternalFrameBufferList buffer_list_;
+};
+
+TEST_F(InternalFrameBufferListTest, ReleaseInRandomOrder) {
+  const int bitdepth = 8;
+  const Libgav1ImageFormat image_format = kLibgav1ImageFormatYuv420;
+  const int width = 100;
+  const int height = 50;
+  const int left_border = 0;
+  const int right_border = 0;
+  const int top_border = 0;
+  const int bottom_border = 0;
+  const int stride_alignment = 16;
+
+  EXPECT_EQ(on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+                                          image_format, width, height,
+                                          left_border, right_border, top_border,
+                                          bottom_border, stride_alignment),
+            0);
+
+  FrameBuffer frame_buffers[kBufferListSize];
+  for (auto& frame_buffer : frame_buffers) {
+    EXPECT_EQ(
+        get_frame_buffer_(callback_private_data_, bitdepth, image_format, width,
+                          height, left_border, right_border, top_border,
+                          bottom_border, stride_alignment, &frame_buffer),
+        0);
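+    // With zero borders, the 4:2:0 luma stride must be at least the width
+    // (100) rounded up to the stride_alignment of 16, i.e. 112; each chroma
+    // stride must be at least half the width (50) rounded up, i.e. 64.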
+    EXPECT_NE(frame_buffer.plane[0], nullptr);
+    EXPECT_GE(frame_buffer.stride[0], 112);
+    EXPECT_NE(frame_buffer.plane[1], nullptr);
+    EXPECT_GE(frame_buffer.stride[1], 64);
+    EXPECT_NE(frame_buffer.plane[2], nullptr);
+    EXPECT_GE(frame_buffer.stride[2], 64);
+  }
+
+  // Release and get a few buffers at indexes <= 5 in random order.
+  static_assert(5 < kBufferListSize, "");
+  static constexpr int indexes[] = {1, 4, 5, 5, 4, 3, 2, 3, 5, 0};
+  for (int index : indexes) {
+    release_frame_buffer_(callback_private_data_,
+                          frame_buffers[index].private_data);
+
+    EXPECT_EQ(get_frame_buffer_(callback_private_data_, bitdepth, image_format,
+                                width, height, left_border, right_border,
+                                top_border, bottom_border, stride_alignment,
+                                &frame_buffers[index]),
+              0);
+    EXPECT_NE(frame_buffers[index].plane[0], nullptr);
+    EXPECT_GE(frame_buffers[index].stride[0], 112);
+    EXPECT_NE(frame_buffers[index].plane[1], nullptr);
+    EXPECT_GE(frame_buffers[index].stride[1], 64);
+    EXPECT_NE(frame_buffers[index].plane[2], nullptr);
+    EXPECT_GE(frame_buffers[index].stride[2], 64);
+  }
+
+  for (auto& frame_buffer : frame_buffers) {
+    release_frame_buffer_(callback_private_data_, frame_buffer.private_data);
+  }
+}
+
+TEST_F(InternalFrameBufferListTest, VaryingBufferSizes) {
+  const int bitdepth = 8;
+  const Libgav1ImageFormat image_format = kLibgav1ImageFormatYuv420;
+  const int width = 64;
+  const int height = 48;
+  const int left_border = 16;
+  const int right_border = 16;
+  const int top_border = 16;
+  const int bottom_border = 16;
+  const int stride_alignment = 16;
+
+  EXPECT_EQ(on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+                                          image_format, 16 * width, 16 * height,
+                                          left_border, right_border, top_border,
+                                          bottom_border, stride_alignment),
+            0);
+
+  FrameBuffer frame_buffer;
+  for (int i = 1; i <= 16; ++i) {
+    EXPECT_EQ(get_frame_buffer_(callback_private_data_, bitdepth, image_format,
+                                i * width, i * height, left_border,
+                                right_border, top_border, bottom_border,
+                                stride_alignment, &frame_buffer),
+              0);
+    EXPECT_NE(frame_buffer.plane[0], nullptr);
+    EXPECT_GE(frame_buffer.stride[0], i * width + left_border + right_border);
+    EXPECT_NE(frame_buffer.plane[1], nullptr);
+    EXPECT_GE(frame_buffer.stride[1],
+              (i * width + left_border + right_border) >> 1);
+    EXPECT_NE(frame_buffer.plane[2], nullptr);
+    EXPECT_GE(frame_buffer.stride[2],
+              (i * width + left_border + right_border) >> 1);
+    release_frame_buffer_(callback_private_data_, frame_buffer.private_data);
+  }
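+  // Walk the sizes back down. These requests should be served by the buffer
+  // grown for the largest size, since GetFrameBuffer only reallocates when
+  // the existing buffer is too small.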
+  for (int i = 16; i >= 1; --i) {
+    EXPECT_EQ(get_frame_buffer_(callback_private_data_, bitdepth, image_format,
+                                i * width, i * height, left_border,
+                                right_border, top_border, bottom_border,
+                                stride_alignment, &frame_buffer),
+              0);
+    EXPECT_NE(frame_buffer.plane[0], nullptr);
+    EXPECT_GE(frame_buffer.stride[0], i * width + left_border + right_border);
+    EXPECT_NE(frame_buffer.plane[1], nullptr);
+    EXPECT_GE(frame_buffer.stride[1],
+              (i * width + left_border + right_border) >> 1);
+    EXPECT_NE(frame_buffer.plane[2], nullptr);
+    EXPECT_GE(frame_buffer.stride[2],
+              (i * width + left_border + right_border) >> 1);
+    release_frame_buffer_(callback_private_data_, frame_buffer.private_data);
+  }
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/libgav1_decoder.cmake b/src/libgav1_decoder.cmake
new file mode 100644 (file)
index 0000000..1314d0b
--- /dev/null
@@ -0,0 +1,157 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_)
+  return()
+endif() # LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_
+set(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_ 1)
+
+list(APPEND libgav1_decoder_sources
+            "${libgav1_source}/buffer_pool.cc"
+            "${libgav1_source}/buffer_pool.h"
+            "${libgav1_source}/decoder_impl.cc"
+            "${libgav1_source}/decoder_impl.h"
+            "${libgav1_source}/decoder_state.h"
+            "${libgav1_source}/tile_scratch_buffer.cc"
+            "${libgav1_source}/tile_scratch_buffer.h"
+            "${libgav1_source}/film_grain.cc"
+            "${libgav1_source}/film_grain.h"
+            "${libgav1_source}/frame_buffer.cc"
+            "${libgav1_source}/frame_buffer_utils.h"
+            "${libgav1_source}/frame_scratch_buffer.h"
+            "${libgav1_source}/inter_intra_masks.inc"
+            "${libgav1_source}/internal_frame_buffer_list.cc"
+            "${libgav1_source}/internal_frame_buffer_list.h"
+            "${libgav1_source}/loop_restoration_info.cc"
+            "${libgav1_source}/loop_restoration_info.h"
+            "${libgav1_source}/motion_vector.cc"
+            "${libgav1_source}/motion_vector.h"
+            "${libgav1_source}/obu_parser.cc"
+            "${libgav1_source}/obu_parser.h"
+            "${libgav1_source}/post_filter/cdef.cc"
+            "${libgav1_source}/post_filter/deblock.cc"
+            "${libgav1_source}/post_filter/deblock_thresholds.inc"
+            "${libgav1_source}/post_filter/loop_restoration.cc"
+            "${libgav1_source}/post_filter/post_filter.cc"
+            "${libgav1_source}/post_filter/super_res.cc"
+            "${libgav1_source}/post_filter.h"
+            "${libgav1_source}/prediction_mask.cc"
+            "${libgav1_source}/prediction_mask.h"
+            "${libgav1_source}/quantizer.cc"
+            "${libgav1_source}/quantizer.h"
+            "${libgav1_source}/quantizer_tables.inc"
+            "${libgav1_source}/reconstruction.cc"
+            "${libgav1_source}/reconstruction.h"
+            "${libgav1_source}/residual_buffer_pool.cc"
+            "${libgav1_source}/residual_buffer_pool.h"
+            "${libgav1_source}/scan_tables.inc"
+            "${libgav1_source}/symbol_decoder_context.cc"
+            "${libgav1_source}/symbol_decoder_context.h"
+            "${libgav1_source}/symbol_decoder_context_cdfs.inc"
+            "${libgav1_source}/threading_strategy.cc"
+            "${libgav1_source}/threading_strategy.h"
+            "${libgav1_source}/tile.h"
+            "${libgav1_source}/tile/bitstream/mode_info.cc"
+            "${libgav1_source}/tile/bitstream/palette.cc"
+            "${libgav1_source}/tile/bitstream/partition.cc"
+            "${libgav1_source}/tile/bitstream/transform_size.cc"
+            "${libgav1_source}/tile/prediction.cc"
+            "${libgav1_source}/tile/tile.cc"
+            "${libgav1_source}/warp_prediction.cc"
+            "${libgav1_source}/warp_prediction.h"
+            "${libgav1_source}/yuv_buffer.cc"
+            "${libgav1_source}/yuv_buffer.h")
+
+list(APPEND libgav1_api_includes "${libgav1_source}/gav1/decoder.h"
+            "${libgav1_source}/gav1/decoder_buffer.h"
+            "${libgav1_source}/gav1/decoder_settings.h"
+            "${libgav1_source}/gav1/frame_buffer.h"
+            "${libgav1_source}/gav1/status_code.h"
+            "${libgav1_source}/gav1/symbol_visibility.h"
+            "${libgav1_source}/gav1/version.h")
+
+list(APPEND libgav1_api_sources "${libgav1_source}/decoder.cc"
+            "${libgav1_source}/decoder_settings.cc"
+            "${libgav1_source}/status_code.cc"
+            "${libgav1_source}/version.cc"
+            ${libgav1_api_includes})
+
+macro(libgav1_add_decoder_targets)
+  if(BUILD_SHARED_LIBS)
+    if(MSVC OR WIN32)
+      # In order to produce a DLL and an import library, the Windows tools
+      # require that the exported symbols are part of the DLL target. The
+      # unfortunate side effect is that a single configuration cannot output
+      # both the static library and the DLL: Windows users of the libgav1
+      # build can have a DLL and an import library, or they can have a static
+      # library, but not both from a single configuration of the build.
+      list(APPEND libgav1_shared_lib_sources ${libgav1_api_sources})
+      list(APPEND libgav1_static_lib_sources ${libgav1_api_includes})
+    else()
+      list(APPEND libgav1_shared_lib_sources ${libgav1_api_includes})
+      list(APPEND libgav1_static_lib_sources ${libgav1_api_sources})
+    endif()
+  else()
+    list(APPEND libgav1_static_lib_sources ${libgav1_api_sources})
+  endif()
+
+  if(use_absl_threading)
+    list(APPEND libgav1_absl_deps absl::base absl::synchronization)
+  endif()
+
+  libgav1_add_library(NAME libgav1_decoder TYPE OBJECT SOURCES
+                      ${libgav1_decoder_sources} DEFINES ${libgav1_defines}
+                      INCLUDES ${libgav1_include_paths})
+
+  libgav1_add_library(NAME
+                      libgav1_static
+                      OUTPUT_NAME
+                      libgav1
+                      TYPE
+                      STATIC
+                      SOURCES
+                      ${libgav1_static_lib_sources}
+                      DEFINES
+                      ${libgav1_defines}
+                      INCLUDES
+                      ${libgav1_include_paths}
+                      LIB_DEPS
+                      ${libgav1_absl_deps}
+                      OBJLIB_DEPS
+                      libgav1_dsp
+                      libgav1_decoder
+                      libgav1_utils
+                      PUBLIC_INCLUDES
+                      ${libgav1_source})
+
+  if(BUILD_SHARED_LIBS)
+    libgav1_add_library(NAME
+                        libgav1_shared
+                        OUTPUT_NAME
+                        libgav1
+                        TYPE
+                        SHARED
+                        SOURCES
+                        ${libgav1_shared_lib_sources}
+                        DEFINES
+                        ${libgav1_defines}
+                        INCLUDES
+                        ${libgav1_include_paths}
+                        LIB_DEPS
+                        libgav1_static
+                        PUBLIC_INCLUDES
+                        ${libgav1_source})
+  endif()
+endmacro()
diff --git a/src/loop_restoration_info.cc b/src/loop_restoration_info.cc
new file mode 100644 (file)
index 0000000..8c17711
--- /dev/null
@@ -0,0 +1,240 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/loop_restoration_info.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// Controls how self-guided filter deltas are read.
+constexpr int kSgrProjReadControl = 4;
+// Maps the restoration type encoded in the compressed headers of the bitstream
+// (the restoration_type element in the spec) to LoopRestorationType. This is
+// used only when the restoration type in the frame header is
+// LoopRestorationTypeSwitchable.
+constexpr LoopRestorationType kBitstreamRestorationTypeMap[] = {
+    kLoopRestorationTypeNone, kLoopRestorationTypeWiener,
+    kLoopRestorationTypeSgrProj};
+
+inline int CountLeadingZeroCoefficients(const int16_t* const filter) {
+  int number_zero_coefficients = 0;
+  if (filter[0] == 0) {
+    number_zero_coefficients++;
+    if (filter[1] == 0) {
+      number_zero_coefficients++;
+      if (filter[2] == 0) {
+        number_zero_coefficients++;
+      }
+    }
+  }
+  return number_zero_coefficients;
+}
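+
+// A minimal sketch (illustration only, not part of the decoder flow) of the
+// helper above: the count of leading zero taps is stored per unit (see
+// ReadWienerInfo() below) so filter implementations can skip them.
+inline void CountLeadingZeroCoefficientsExample() {
+  const int16_t sparse_half_filter[3] = {0, 0, 5};
+  assert(CountLeadingZeroCoefficients(sparse_half_filter) == 2);
+  const int16_t dense_half_filter[3] = {3, 0, 0};
+  assert(CountLeadingZeroCoefficients(dense_half_filter) == 0);
+}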
+
+}  // namespace
+
+bool LoopRestorationInfo::Reset(const LoopRestoration* const loop_restoration,
+                                uint32_t width, uint32_t height,
+                                int8_t subsampling_x, int8_t subsampling_y,
+                                bool is_monochrome) {
+  loop_restoration_ = loop_restoration;
+  subsampling_x_ = subsampling_x;
+  subsampling_y_ = subsampling_y;
+
+  const int num_planes = is_monochrome ? kMaxPlanesMonochrome : kMaxPlanes;
+  int total_num_units = 0;
+  for (int plane = kPlaneY; plane < num_planes; ++plane) {
+    if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) {
+      plane_needs_filtering_[plane] = false;
+      continue;
+    }
+    plane_needs_filtering_[plane] = true;
+    const int plane_width =
+        (plane == kPlaneY) ? width : SubsampledValue(width, subsampling_x_);
+    const int plane_height =
+        (plane == kPlaneY) ? height : SubsampledValue(height, subsampling_y_);
+    num_horizontal_units_[plane] =
+        std::max(1, RightShiftWithRounding(
+                        plane_width, loop_restoration_->unit_size_log2[plane]));
+    num_vertical_units_[plane] = std::max(
+        1, RightShiftWithRounding(plane_height,
+                                  loop_restoration_->unit_size_log2[plane]));
+    num_units_[plane] =
+        num_horizontal_units_[plane] * num_vertical_units_[plane];
+    total_num_units += num_units_[plane];
+  }
+  // Allocate the RestorationUnitInfo arrays for all planes in a single heap
+  // allocation and divide up the buffer into arrays of the right sizes.
+  if (!loop_restoration_info_buffer_.Resize(total_num_units)) {
+    return false;
+  }
+  RestorationUnitInfo* loop_restoration_info =
+      loop_restoration_info_buffer_.get();
+  for (int plane = kPlaneY; plane < num_planes; ++plane) {
+    if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) {
+      continue;
+    }
+    loop_restoration_info_[plane] = loop_restoration_info;
+    loop_restoration_info += num_units_[plane];
+  }
+  return true;
+}
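+
+// A minimal sketch (hypothetical plane sizes, illustration only) of the
+// single-allocation pattern used in Reset() above: one contiguous buffer
+// serves all planes, and each plane receives a num_units-sized slice.
+inline bool SingleAllocationSketch() {
+  const int num_units[kMaxPlanes] = {12, 4, 4};
+  DynamicBuffer<RestorationUnitInfo> buffer;
+  if (!buffer.Resize(num_units[0] + num_units[1] + num_units[2])) return false;
+  RestorationUnitInfo* slices[kMaxPlanes];
+  RestorationUnitInfo* ptr = buffer.get();
+  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+    slices[plane] = ptr;
+    ptr += num_units[plane];
+  }
+  return slices[kMaxPlanes - 1] != nullptr;
+}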
+
+bool LoopRestorationInfo::PopulateUnitInfoForSuperBlock(
+    Plane plane, BlockSize block_size, bool is_superres_scaled,
+    uint8_t superres_scale_denominator, int row4x4, int column4x4,
+    LoopRestorationUnitInfo* const unit_info) const {
+  assert(unit_info != nullptr);
+  if (!plane_needs_filtering_[plane]) return false;
+  const int numerator_column =
+      is_superres_scaled ? superres_scale_denominator : 1;
+  const int pixel_column_start =
+      RowOrColumn4x4ToPixel(column4x4, plane, subsampling_x_);
+  const int pixel_column_end = RowOrColumn4x4ToPixel(
+      column4x4 + kNum4x4BlocksWide[block_size], plane, subsampling_x_);
+  const int unit_row_log2 = loop_restoration_->unit_size_log2[plane];
+  const int denominator_column_log2 =
+      unit_row_log2 + (is_superres_scaled ? 3 : 0);
+  const int pixel_row_start =
+      RowOrColumn4x4ToPixel(row4x4, plane, subsampling_y_);
+  const int pixel_row_end = RowOrColumn4x4ToPixel(
+      row4x4 + kNum4x4BlocksHigh[block_size], plane, subsampling_y_);
+  unit_info->column_start = RightShiftWithCeiling(
+      pixel_column_start * numerator_column, denominator_column_log2);
+  unit_info->column_end = RightShiftWithCeiling(
+      pixel_column_end * numerator_column, denominator_column_log2);
+  unit_info->row_start = RightShiftWithCeiling(pixel_row_start, unit_row_log2);
+  unit_info->row_end = RightShiftWithCeiling(pixel_row_end, unit_row_log2);
+  unit_info->column_end =
+      std::min(unit_info->column_end, num_horizontal_units_[plane]);
+  unit_info->row_end = std::min(unit_info->row_end, num_vertical_units_[plane]);
+  return true;
+}
+
+void LoopRestorationInfo::ReadUnitCoefficients(
+    EntropyDecoder* const reader,
+    SymbolDecoderContext* const symbol_decoder_context, Plane plane,
+    int unit_id,
+    std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+  LoopRestorationType unit_restoration_type = kLoopRestorationTypeNone;
+  if (loop_restoration_->type[plane] == kLoopRestorationTypeSwitchable) {
+    unit_restoration_type = kBitstreamRestorationTypeMap
+        [reader->ReadSymbol<kRestorationTypeSymbolCount>(
+            symbol_decoder_context->restoration_type_cdf)];
+  } else if (loop_restoration_->type[plane] == kLoopRestorationTypeWiener) {
+    const bool use_wiener =
+        reader->ReadSymbol(symbol_decoder_context->use_wiener_cdf);
+    if (use_wiener) unit_restoration_type = kLoopRestorationTypeWiener;
+  } else if (loop_restoration_->type[plane] == kLoopRestorationTypeSgrProj) {
+    const bool use_sgrproj =
+        reader->ReadSymbol(symbol_decoder_context->use_sgrproj_cdf);
+    if (use_sgrproj) unit_restoration_type = kLoopRestorationTypeSgrProj;
+  }
+  loop_restoration_info_[plane][unit_id].type = unit_restoration_type;
+
+  if (unit_restoration_type == kLoopRestorationTypeWiener) {
+    ReadWienerInfo(reader, plane, unit_id, reference_unit_info);
+  } else if (unit_restoration_type == kLoopRestorationTypeSgrProj) {
+    ReadSgrProjInfo(reader, plane, unit_id, reference_unit_info);
+  }
+}
+
+void LoopRestorationInfo::ReadWienerInfo(
+    EntropyDecoder* const reader, Plane plane, int unit_id,
+    std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+  for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+    if (plane != kPlaneY) {
+      loop_restoration_info_[plane][unit_id].wiener_info.filter[i][0] = 0;
+    }
+    int sum = 0;
+    for (int j = static_cast<int>(plane != kPlaneY); j < kNumWienerCoefficients;
+         ++j) {
+      const int8_t wiener_min = kWienerTapsMin[j];
+      const int8_t wiener_max = kWienerTapsMax[j];
+      const int control = j + 1;
+      int value;
+      if (!reader->DecodeSignedSubexpWithReference(
+              wiener_min, wiener_max + 1,
+              (*reference_unit_info)[plane].wiener_info.filter[i][j], control,
+              &value)) {
+        LIBGAV1_DLOG(
+            ERROR,
+            "Error decoding Wiener filter coefficients: plane %d, unit_id %d",
+            static_cast<int>(plane), unit_id);
+        return;
+      }
+      loop_restoration_info_[plane][unit_id].wiener_info.filter[i][j] = value;
+      (*reference_unit_info)[plane].wiener_info.filter[i][j] = value;
+      sum += value;
+    }
+    loop_restoration_info_[plane][unit_id].wiener_info.filter[i][3] =
+        128 - 2 * sum;
+    loop_restoration_info_[plane][unit_id]
+        .wiener_info.number_leading_zero_coefficients[i] =
+        CountLeadingZeroCoefficients(
+            loop_restoration_info_[plane][unit_id].wiener_info.filter[i]);
+  }
+}
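+
+// A small self-check (hypothetical taps, illustration only) of the invariant
+// established above: the 7-tap Wiener filter is symmetric, taps 0..2 are
+// decoded, and tap 3 is derived as 128 - 2 * (f0 + f1 + f2), so the full
+// filter {f0, f1, f2, f3, f2, f1, f0} always sums to 128 (unit DC gain at
+// 7-bit precision).
+inline void WienerNormalizationExample() {
+  const int16_t half[3] = {1, -3, 10};
+  const int center = 128 - 2 * (half[0] + half[1] + half[2]);
+  assert(2 * (half[0] + half[1] + half[2]) + center == 128);
+  static_cast<void>(center);
+}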
+
+void LoopRestorationInfo::ReadSgrProjInfo(
+    EntropyDecoder* const reader, Plane plane, int unit_id,
+    std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+  const int sgr_proj_index =
+      static_cast<int>(reader->ReadLiteral(kSgrProjParamsBits));
+  loop_restoration_info_[plane][unit_id].sgr_proj_info.index = sgr_proj_index;
+  for (int i = 0; i < 2; ++i) {
+    const uint8_t radius = kSgrProjParams[sgr_proj_index][i * 2];
+    const int8_t multiplier_min = kSgrProjMultiplierMin[i];
+    const int8_t multiplier_max = kSgrProjMultiplierMax[i];
+    int multiplier;
+    if (radius != 0) {
+      if (!reader->DecodeSignedSubexpWithReference(
+              multiplier_min, multiplier_max + 1,
+              (*reference_unit_info)[plane].sgr_proj_info.multiplier[i],
+              kSgrProjReadControl, &multiplier)) {
+        LIBGAV1_DLOG(ERROR,
+                     "Error decoding Self-guided filter coefficients: plane "
+                     "%d, unit_id %d",
+                     static_cast<int>(plane), unit_id);
+        return;
+      }
+    } else {
+      // The range of (*reference_unit_info)[plane].sgr_proj_info.multiplier[0]
+      // from DecodeSignedSubexpWithReference() is [-96, 31] (its default is
+      // -32), so 128 - multiplier[0] is at least 97 and
+      // Clip3(128 - multiplier[0], -32, 95) always evaluates to 95, making
+      // the clip unnecessary.
+      static constexpr int kMultiplier[2] = {0, 95};
+      multiplier = kMultiplier[i];
+      assert(
+          i == 0 ||
+          Clip3((1 << kSgrProjPrecisionBits) -
+                    (*reference_unit_info)[plane].sgr_proj_info.multiplier[0],
+                multiplier_min, multiplier_max) == kMultiplier[1]);
+    }
+    loop_restoration_info_[plane][unit_id].sgr_proj_info.multiplier[i] =
+        multiplier;
+    (*reference_unit_info)[plane].sgr_proj_info.multiplier[i] = multiplier;
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/loop_restoration_info.h b/src/loop_restoration_info.h
new file mode 100644 (file)
index 0000000..bff6746
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
+#define LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/dsp/common.h"
+#include "src/symbol_decoder_context.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+struct LoopRestorationUnitInfo {
+  int row_start;
+  int row_end;
+  int column_start;
+  int column_end;
+};
+
+class LoopRestorationInfo {
+ public:
+  LoopRestorationInfo() = default;
+
+  // Non copyable/movable.
+  LoopRestorationInfo(const LoopRestorationInfo&) = delete;
+  LoopRestorationInfo& operator=(const LoopRestorationInfo&) = delete;
+  LoopRestorationInfo(LoopRestorationInfo&&) = delete;
+  LoopRestorationInfo& operator=(LoopRestorationInfo&&) = delete;
+
+  bool Reset(const LoopRestoration* loop_restoration, uint32_t width,
+             uint32_t height, int8_t subsampling_x, int8_t subsampling_y,
+             bool is_monochrome);
+  // Populates the |unit_info| for the super block at |row4x4|, |column4x4|.
+  // Returns true on success, false otherwise.
+  bool PopulateUnitInfoForSuperBlock(Plane plane, BlockSize block_size,
+                                     bool is_superres_scaled,
+                                     uint8_t superres_scale_denominator,
+                                     int row4x4, int column4x4,
+                                     LoopRestorationUnitInfo* unit_info) const;
+  void ReadUnitCoefficients(EntropyDecoder* reader,
+                            SymbolDecoderContext* symbol_decoder_context,
+                            Plane plane, int unit_id,
+                            std::array<RestorationUnitInfo, kMaxPlanes>*
+                                reference_unit_info);  // 5.11.58.
+  void ReadWienerInfo(
+      EntropyDecoder* reader, Plane plane, int unit_id,
+      std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info);
+  void ReadSgrProjInfo(
+      EntropyDecoder* reader, Plane plane, int unit_id,
+      std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info);
+
+  // Getters.
+  const RestorationUnitInfo* loop_restoration_info(Plane plane,
+                                                   int unit_id) const {
+    return &loop_restoration_info_[plane][unit_id];
+  }
+
+  int num_horizontal_units(Plane plane) const {
+    return num_horizontal_units_[plane];
+  }
+  int num_vertical_units(Plane plane) const {
+    return num_vertical_units_[plane];
+  }
+  int num_units(Plane plane) const { return num_units_[plane]; }
+
+ private:
+  // If plane_needs_filtering_[plane] is true, loop_restoration_info_[plane]
+  // points to an array of num_units_[plane] elements.
+  RestorationUnitInfo* loop_restoration_info_[kMaxPlanes];
+  // Owns the memory that loop_restoration_info_[plane] points to.
+  DynamicBuffer<RestorationUnitInfo> loop_restoration_info_buffer_;
+  bool plane_needs_filtering_[kMaxPlanes];
+  const LoopRestoration* loop_restoration_;
+  int8_t subsampling_x_;
+  int8_t subsampling_y_;
+  int num_horizontal_units_[kMaxPlanes];
+  int num_vertical_units_[kMaxPlanes];
+  int num_units_[kMaxPlanes];
+};
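+
+// Typical usage (a hedged sketch; the exact call sites live in the decoder
+// and the variable names below are illustrative):
+//   LoopRestorationInfo info;
+//   if (!info.Reset(&frame_header.loop_restoration, upscaled_width, height,
+//                   subsampling_x, subsampling_y, is_monochrome)) {
+//     // Allocation failure.
+//   }
+//   // While parsing a tile, once per restoration unit:
+//   info.ReadUnitCoefficients(&reader, &symbol_decoder_context, plane,
+//                             unit_id, &reference_unit_info);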
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
diff --git a/src/motion_vector.cc b/src/motion_vector.cc
new file mode 100644 (file)
index 0000000..36018ab
--- /dev/null
@@ -0,0 +1,1000 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/motion_vector.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// Entry at index i is computed as:
+// Clip3(std::max(kBlockWidthPixels[i], kBlockHeightPixels[i]), 16, 112).
+constexpr int kWarpValidThreshold[kMaxBlockSizes] = {
+    16, 16, 16, 16, 16, 16, 32, 16, 16,  16,  32,
+    64, 32, 32, 32, 64, 64, 64, 64, 112, 112, 112};
+
+// 7.10.2.10.
+void LowerMvPrecision(const ObuFrameHeader& frame_header,
+                      MotionVector* const mvs) {
+  if (frame_header.allow_high_precision_mv) return;
+  if (frame_header.force_integer_mv != 0) {
+    for (auto& mv : mvs->mv) {
+      // The next line is equivalent to:
+      // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+      // const int sign = mv >> 15;
+      // mv = ApplySign(value, sign);
+      mv = (mv + 3 - (mv >> 15)) & ~7;
+    }
+  } else {
+    for (auto& mv : mvs->mv) {
+      // The next line is equivalent to:
+      // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
+      mv = (mv - (mv >> 15)) & ~1;
+    }
+  }
+}
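+
+// A minimal self-check (illustration only) that the branchless expression in
+// the force_integer_mv branch above matches the sign-magnitude form given in
+// its comment, for a few hypothetical 1/8-pel values.
+inline void LowerMvPrecisionExample() {
+  const int kValues[] = {-13, -12, 0, 13};
+  for (const int mv : kValues) {
+    const int reference = ApplySign((std::abs(mv) + 3) & ~7, mv >> 15);
+    const int branchless = (mv + 3 - (mv >> 15)) & ~7;
+    assert(reference == branchless);
+    static_cast<void>(reference);
+    static_cast<void>(branchless);
+  }
+}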
+
+// 7.10.2.1.
+void SetupGlobalMv(const Tile::Block& block, int index,
+                   MotionVector* const mv) {
+  const BlockParameters& bp = *block.bp;
+  const ObuFrameHeader& frame_header = block.tile.frame_header();
+  ReferenceFrameType reference_type = bp.reference_frame[index];
+  const auto& gm = frame_header.global_motion[reference_type];
+  if (reference_type == kReferenceFrameIntra ||
+      gm.type == kGlobalMotionTransformationTypeIdentity) {
+    mv->mv32 = 0;
+    return;
+  }
+  if (gm.type == kGlobalMotionTransformationTypeTranslation) {
+    for (int i = 0; i < 2; ++i) {
+      mv->mv[i] = gm.params[i] >> (kWarpedModelPrecisionBits - 3);
+    }
+    LowerMvPrecision(frame_header, mv);
+    return;
+  }
+  const int x = MultiplyBy4(block.column4x4) + DivideBy2(block.width) - 1;
+  const int y = MultiplyBy4(block.row4x4) + DivideBy2(block.height) - 1;
+  const int xc = (gm.params[2] - (1 << kWarpedModelPrecisionBits)) * x +
+                 gm.params[3] * y + gm.params[0];
+  const int yc = gm.params[4] * x +
+                 (gm.params[5] - (1 << kWarpedModelPrecisionBits)) * y +
+                 gm.params[1];
+  if (frame_header.allow_high_precision_mv) {
+    mv->mv[0] = RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 3);
+    mv->mv[1] = RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 3);
+  } else {
+    mv->mv[0] = MultiplyBy2(
+        RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 2));
+    mv->mv[1] = MultiplyBy2(
+        RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 2));
+    LowerMvPrecision(frame_header, mv);
+  }
+}
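+
+// For orientation (a hedged sketch with hypothetical parameters): global
+// motion parameters are stored in 1/(1 << kWarpedModelPrecisionBits) pixel
+// units, so a pure translation of one luma pixel becomes 8 in 1/8-pel units
+// after the shift in the translation branch above.
+inline void GlobalTranslationExample() {
+  const int param = 1 << kWarpedModelPrecisionBits;  // 1.0 pixel.
+  const int mv_eighth_pel = param >> (kWarpedModelPrecisionBits - 3);
+  assert(mv_eighth_pel == 8);
+  static_cast<void>(mv_eighth_pel);
+}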
+
+constexpr BitMaskSet kPredictionModeNewMvMask(kPredictionModeNewMv,
+                                              kPredictionModeNewNewMv,
+                                              kPredictionModeNearNewMv,
+                                              kPredictionModeNewNearMv,
+                                              kPredictionModeNearestNewMv,
+                                              kPredictionModeNewNearestMv);
+
+// 7.10.2.8.
+void SearchStack(const Tile::Block& block, const BlockParameters& mv_bp,
+                 int index, int weight, bool* const found_new_mv,
+                 bool* const found_match, int* const num_mv_found) {
+  const BlockParameters& bp = *block.bp;
+  const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion =
+      block.tile.frame_header().global_motion;
+  PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  MotionVector candidate_mv;
+  // LowerMvPrecision() is not necessary, since the values in
+  // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it.
+  const auto global_motion_type = global_motion[bp.reference_frame[0]].type;
+  if (IsGlobalMvBlock(mv_bp, global_motion_type)) {
+    candidate_mv = prediction_parameters.global_mv[0];
+  } else {
+    candidate_mv = mv_bp.mv.mv[index];
+  }
+  *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
+  *found_match = true;
+  MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+  const int num_found = *num_mv_found;
+  const auto result = std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+                                   [&candidate_mv](const MotionVector& ref_mv) {
+                                     return ref_mv.mv32 == candidate_mv.mv32;
+                                   });
+  if (result != ref_mv_stack + num_found) {
+    prediction_parameters.IncreaseWeight(std::distance(ref_mv_stack, result),
+                                         weight);
+    return;
+  }
+  if (num_found >= kMaxRefMvStackSize) return;
+  ref_mv_stack[num_found] = candidate_mv;
+  prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
+  ++*num_mv_found;
+}
+
+// 7.10.2.9.
+void CompoundSearchStack(const Tile::Block& block, const BlockParameters& mv_bp,
+                         int weight, bool* const found_new_mv,
+                         bool* const found_match, int* const num_mv_found) {
+  const BlockParameters& bp = *block.bp;
+  const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion =
+      block.tile.frame_header().global_motion;
+  PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  // LowerMvPrecision() is not necessary, since the values in
+  // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it.
+  CompoundMotionVector candidate_mv = mv_bp.mv;
+  for (int i = 0; i < 2; ++i) {
+    const auto global_motion_type = global_motion[bp.reference_frame[i]].type;
+    if (IsGlobalMvBlock(mv_bp, global_motion_type)) {
+      candidate_mv.mv[i] = prediction_parameters.global_mv[i];
+    }
+  }
+  *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
+  *found_match = true;
+  CompoundMotionVector* const compound_ref_mv_stack =
+      prediction_parameters.compound_ref_mv_stack;
+  const int num_found = *num_mv_found;
+  const auto result =
+      std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+                   [&candidate_mv](const CompoundMotionVector& ref_mv) {
+                     return ref_mv.mv64 == candidate_mv.mv64;
+                   });
+  if (result != compound_ref_mv_stack + num_found) {
+    prediction_parameters.IncreaseWeight(
+        std::distance(compound_ref_mv_stack, result), weight);
+    return;
+  }
+  if (num_found >= kMaxRefMvStackSize) return;
+  compound_ref_mv_stack[num_found].mv64 = candidate_mv.mv64;
+  prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
+  ++*num_mv_found;
+}
+
+// 7.10.2.7.
+void AddReferenceMvCandidate(const Tile::Block& block,
+                             const BlockParameters& mv_bp, bool is_compound,
+                             int weight, bool* const found_new_mv,
+                             bool* const found_match, int* const num_mv_found) {
+  if (!mv_bp.is_inter) return;
+  const BlockParameters& bp = *block.bp;
+  if (is_compound) {
+    if (mv_bp.reference_frame[0] == bp.reference_frame[0] &&
+        mv_bp.reference_frame[1] == bp.reference_frame[1]) {
+      CompoundSearchStack(block, mv_bp, weight, found_new_mv, found_match,
+                          num_mv_found);
+    }
+    return;
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (mv_bp.reference_frame[i] == bp.reference_frame[0]) {
+      SearchStack(block, mv_bp, i, weight, found_new_mv, found_match,
+                  num_mv_found);
+    }
+  }
+}
+
+int GetMinimumStep(int block_width_or_height4x4, int delta_row_or_column) {
+  assert(delta_row_or_column < 0);
+  if (block_width_or_height4x4 >= 16) return 4;
+  if (delta_row_or_column < -1) return 2;
+  return 0;
+}
+
+// 7.10.2.2.
+void ScanRow(const Tile::Block& block, int mv_column, int delta_row,
+             bool is_compound, bool* const found_new_mv,
+             bool* const found_match, int* const num_mv_found) {
+  const int mv_row = block.row4x4 + delta_row;
+  const Tile& tile = block.tile;
+  if (!tile.IsTopInside(mv_row + 1)) return;
+  const int width4x4 = block.width4x4;
+  const int min_step = GetMinimumStep(width4x4, delta_row);
+  BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column);
+  BlockParameters** const end_bps =
+      bps + std::min({static_cast<int>(width4x4),
+                      tile.frame_header().columns4x4 - block.column4x4, 16});
+  do {
+    const BlockParameters& mv_bp = **bps;
+    const int step = std::max(
+        std::min(width4x4, static_cast<int>(kNum4x4BlocksWide[mv_bp.size])),
+        min_step);
+    AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step),
+                            found_new_mv, found_match, num_mv_found);
+    bps += step;
+  } while (bps < end_bps);
+}
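+
+// For example (illustration only): for a 32x32 block (|width4x4| == 8)
+// scanning the row above, a neighboring kBlock8x8 (kNum4x4BlocksWide == 2)
+// contributes weight MultiplyBy2(2) == 4 and advances |bps| by 2 entries,
+// while a neighbor wider than the current block clamps the step to 8.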
+
+// 7.10.2.3.
+void ScanColumn(const Tile::Block& block, int mv_row, int delta_column,
+                bool is_compound, bool* const found_new_mv,
+                bool* const found_match, int* const num_mv_found) {
+  const int mv_column = block.column4x4 + delta_column;
+  const Tile& tile = block.tile;
+  if (!tile.IsLeftInside(mv_column + 1)) return;
+  const int height4x4 = block.height4x4;
+  const int min_step = GetMinimumStep(height4x4, delta_column);
+  const ptrdiff_t stride = tile.BlockParametersStride();
+  BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column);
+  BlockParameters** const end_bps =
+      bps + stride * std::min({static_cast<int>(height4x4),
+                               tile.frame_header().rows4x4 - block.row4x4, 16});
+  do {
+    const BlockParameters& mv_bp = **bps;
+    const int step = std::max(
+        std::min(height4x4, static_cast<int>(kNum4x4BlocksHigh[mv_bp.size])),
+        min_step);
+    AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step),
+                            found_new_mv, found_match, num_mv_found);
+    bps += step * stride;
+  } while (bps < end_bps);
+}
+
+// 7.10.2.4.
+void ScanPoint(const Tile::Block& block, int delta_row, int delta_column,
+               bool is_compound, bool* const found_new_mv,
+               bool* const found_match, int* const num_mv_found) {
+  const int mv_row = block.row4x4 + delta_row;
+  const int mv_column = block.column4x4 + delta_column;
+  const Tile& tile = block.tile;
+  if (!tile.IsInside(mv_row, mv_column) ||
+      !tile.HasParameters(mv_row, mv_column)) {
+    return;
+  }
+  const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column);
+  if (mv_bp.reference_frame[0] == kReferenceFrameNone) return;
+  AddReferenceMvCandidate(block, mv_bp, is_compound, 4, found_new_mv,
+                          found_match, num_mv_found);
+}
+
+// 7.10.2.6.
+void AddTemporalReferenceMvCandidate(
+    const ObuFrameHeader& frame_header, const int reference_offsets[2],
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets, int count, bool is_compound,
+    int* const zero_mv_context, int* const num_mv_found,
+    PredictionParameters* const prediction_parameters) {
+  const int mv_projection_function_index =
+      frame_header.allow_high_precision_mv ? 2 : frame_header.force_integer_mv;
+  const MotionVector* const global_mv = prediction_parameters->global_mv;
+  if (is_compound) {
+    alignas(kMaxAlignment)
+        CompoundMotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding];
+    const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+    dsp.mv_projection_compound[mv_projection_function_index](
+        temporal_mvs, temporal_reference_offsets, reference_offsets, count,
+        candidate_mvs);
+    if (*zero_mv_context == -1) {
+      int max_difference =
+          std::max(std::abs(candidate_mvs[0].mv[0].mv[0] - global_mv[0].mv[0]),
+                   std::abs(candidate_mvs[0].mv[0].mv[1] - global_mv[0].mv[1]));
+      max_difference =
+          std::max(max_difference,
+                   std::abs(candidate_mvs[0].mv[1].mv[0] - global_mv[1].mv[0]));
+      max_difference =
+          std::max(max_difference,
+                   std::abs(candidate_mvs[0].mv[1].mv[1] - global_mv[1].mv[1]));
+      *zero_mv_context = static_cast<int>(max_difference >= 16);
+    }
+    CompoundMotionVector* const compound_ref_mv_stack =
+        prediction_parameters->compound_ref_mv_stack;
+    int num_found = *num_mv_found;
+    int index = 0;
+    do {
+      const CompoundMotionVector& candidate_mv = candidate_mvs[index];
+      const auto result =
+          std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+                       [&candidate_mv](const CompoundMotionVector& ref_mv) {
+                         return ref_mv.mv64 == candidate_mv.mv64;
+                       });
+      if (result != compound_ref_mv_stack + num_found) {
+        prediction_parameters->IncreaseWeight(
+            std::distance(compound_ref_mv_stack, result), 2);
+        continue;
+      }
+      if (num_found >= kMaxRefMvStackSize) continue;
+      compound_ref_mv_stack[num_found].mv64 = candidate_mv.mv64;
+      prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+      ++num_found;
+    } while (++index < count);
+    *num_mv_found = num_found;
+    return;
+  }
+  MotionVector* const ref_mv_stack = prediction_parameters->ref_mv_stack;
+  if (reference_offsets[0] == 0) {
+    if (*zero_mv_context == -1) {
+      const int max_difference =
+          std::max(std::abs(global_mv[0].mv[0]), std::abs(global_mv[0].mv[1]));
+      *zero_mv_context = static_cast<int>(max_difference >= 16);
+    }
+    const MotionVector candidate_mv = {};
+    const int num_found = *num_mv_found;
+    const auto result =
+        std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+                     [&candidate_mv](const MotionVector& ref_mv) {
+                       return ref_mv.mv32 == candidate_mv.mv32;
+                     });
+    if (result != ref_mv_stack + num_found) {
+      prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
+                                            2 * count);
+      return;
+    }
+    if (num_found >= kMaxRefMvStackSize) return;
+    ref_mv_stack[num_found] = candidate_mv;
+    prediction_parameters->SetWeightIndexStackEntry(num_found, 2 * count);
+    ++*num_mv_found;
+    return;
+  }
+  alignas(kMaxAlignment)
+      MotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding];
+  const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+  dsp.mv_projection_single[mv_projection_function_index](
+      temporal_mvs, temporal_reference_offsets, reference_offsets[0], count,
+      candidate_mvs);
+  if (*zero_mv_context == -1) {
+    const int max_difference =
+        std::max(std::abs(candidate_mvs[0].mv[0] - global_mv[0].mv[0]),
+                 std::abs(candidate_mvs[0].mv[1] - global_mv[0].mv[1]));
+    *zero_mv_context = static_cast<int>(max_difference >= 16);
+  }
+  int num_found = *num_mv_found;
+  int index = 0;
+  do {
+    const MotionVector& candidate_mv = candidate_mvs[index];
+    const auto result =
+        std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+                     [&candidate_mv](const MotionVector& ref_mv) {
+                       return ref_mv.mv32 == candidate_mv.mv32;
+                     });
+    if (result != ref_mv_stack + num_found) {
+      prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
+                                            2);
+      continue;
+    }
+    if (num_found >= kMaxRefMvStackSize) continue;
+    ref_mv_stack[num_found] = candidate_mv;
+    prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+    ++num_found;
+  } while (++index < count);
+  *num_mv_found = num_found;
+}
+
+// Part of 7.10.2.5.
+bool IsWithinTheSame64x64Block(const Tile::Block& block, int delta_row,
+                               int delta_column) {
+  const int row = (block.row4x4 & 15) + delta_row;
+  const int column = (block.column4x4 & 15) + delta_column;
+  // |block.height4x4| is at least 2 for all elements in |kTemporalScanMask|.
+  // So |row| is always non-negative.
+  assert(row >= 0);
+  return row < 16 && column >= 0 && column < 16;
+}
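+
+// For example (illustration only): a block at row4x4 = 18, column4x4 = 30
+// sits at (2, 14) within its 64x64 superblock (16 4x4 units per side), so a
+// candidate at delta (2, 2) lands at (4, 16) and is rejected because column
+// 16 falls outside the same 64x64 block.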
+
+constexpr BitMaskSet kTemporalScanMask(kBlock8x8, kBlock8x16, kBlock8x32,
+                                       kBlock16x8, kBlock16x16, kBlock16x32,
+                                       kBlock32x8, kBlock32x16, kBlock32x32);
+
+// 7.10.2.5.
+void TemporalScan(const Tile::Block& block, bool is_compound,
+                  int* const zero_mv_context, int* const num_mv_found) {
+  const int step_w = (block.width4x4 >= 16) ? 4 : 2;
+  const int step_h = (block.height4x4 >= 16) ? 4 : 2;
+  const int row_start = block.row4x4 | 1;
+  const int column_start = block.column4x4 | 1;
+  const int row_end =
+      row_start + std::min(static_cast<int>(block.height4x4), 16);
+  const int column_end =
+      column_start + std::min(static_cast<int>(block.width4x4), 16);
+  const Tile& tile = block.tile;
+  const TemporalMotionField& motion_field = tile.motion_field();
+  const int stride = motion_field.mv.columns();
+  const MotionVector* motion_field_mv = motion_field.mv[0];
+  const int8_t* motion_field_reference_offset =
+      motion_field.reference_offset[0];
+  alignas(kMaxAlignment)
+      MotionVector temporal_mvs[kMaxTemporalMvCandidatesWithPadding];
+  int8_t temporal_reference_offsets[kMaxTemporalMvCandidatesWithPadding];
+  int count = 0;
+  int offset = stride * (row_start >> 1);
+  int mv_row = row_start;
+  do {
+    int mv_column = column_start;
+    do {
+      // Both horizontal and vertical offsets are positive. Only bottom and
+      // right boundaries need to be checked.
+      if (tile.IsBottomRightInside(mv_row, mv_column)) {
+        const int x8 = mv_column >> 1;
+        const MotionVector temporal_mv = motion_field_mv[offset + x8];
+        if (temporal_mv.mv[0] == kInvalidMvValue) {
+          if (mv_row == row_start && mv_column == column_start) {
+            *zero_mv_context = 1;
+          }
+        } else {
+          temporal_mvs[count] = temporal_mv;
+          temporal_reference_offsets[count++] =
+              motion_field_reference_offset[offset + x8];
+        }
+      }
+      mv_column += step_w;
+    } while (mv_column < column_end);
+    offset += stride * step_h >> 1;
+    mv_row += step_h;
+  } while (mv_row < row_end);
+  if (kTemporalScanMask.Contains(block.size)) {
+    const int temporal_sample_positions[3][2] = {
+        {block.height4x4, -2},
+        {block.height4x4, block.width4x4},
+        {block.height4x4 - 2, block.width4x4}};
+    // Getting the address of an element in Array2D is slow. Precalculate the
+    // offsets.
+    int temporal_sample_offsets[3];
+    temporal_sample_offsets[0] = stride * ((row_start + block.height4x4) >> 1) +
+                                 ((column_start - 2) >> 1);
+    temporal_sample_offsets[1] =
+        temporal_sample_offsets[0] + ((block.width4x4 + 2) >> 1);
+    temporal_sample_offsets[2] = temporal_sample_offsets[1] - stride;
+    for (int i = 0; i < 3; i++) {
+      const int row = temporal_sample_positions[i][0];
+      const int column = temporal_sample_positions[i][1];
+      if (!IsWithinTheSame64x64Block(block, row, column)) continue;
+      const int mv_row = row_start + row;
+      const int mv_column = column_start + column;
+      // IsWithinTheSame64x64Block() guarantees the reference block is inside
+      // the top and left boundary.
+      if (!tile.IsBottomRightInside(mv_row, mv_column)) continue;
+      const MotionVector temporal_mv =
+          motion_field_mv[temporal_sample_offsets[i]];
+      if (temporal_mv.mv[0] != kInvalidMvValue) {
+        temporal_mvs[count] = temporal_mv;
+        temporal_reference_offsets[count++] =
+            motion_field_reference_offset[temporal_sample_offsets[i]];
+      }
+    }
+  }
+  if (count != 0) {
+    BlockParameters* const bp = block.bp;
+    int reference_offsets[2];
+    const int offset_0 = tile.current_frame()
+                             .reference_info()
+                             ->relative_distance_to[bp->reference_frame[0]];
+    reference_offsets[0] =
+        Clip3(offset_0, -kMaxFrameDistance, kMaxFrameDistance);
+    if (is_compound) {
+      const int offset_1 = tile.current_frame()
+                               .reference_info()
+                               ->relative_distance_to[bp->reference_frame[1]];
+      reference_offsets[1] =
+          Clip3(offset_1, -kMaxFrameDistance, kMaxFrameDistance);
+      // Pad so that SIMD implementations won't read uninitialized memory.
+      if ((count & 1) != 0) {
+        temporal_mvs[count].mv32 = 0;
+        temporal_reference_offsets[count] = 0;
+      }
+    } else {
+      // Pad so that SIMD implementations won't read uninitialized memory.
+      for (int i = count; i < ((count + 3) & ~3); ++i) {
+        temporal_mvs[i].mv32 = 0;
+        temporal_reference_offsets[i] = 0;
+      }
+    }
+    AddTemporalReferenceMvCandidate(
+        tile.frame_header(), reference_offsets, temporal_mvs,
+        temporal_reference_offsets, count, is_compound, zero_mv_context,
+        num_mv_found, &(*bp->prediction_parameters));
+  }
+}
+
+// Part of 7.10.2.13.
+void AddExtraCompoundMvCandidate(const Tile::Block& block, int mv_row,
+                                 int mv_column, int* const ref_id_count,
+                                 MotionVector ref_id[2][2],
+                                 int* const ref_diff_count,
+                                 MotionVector ref_diff[2][2]) {
+  const auto& bp = block.tile.Parameters(mv_row, mv_column);
+  const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias =
+      block.tile.reference_frame_sign_bias();
+  for (int i = 0; i < 2; ++i) {
+    const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
+    if (candidate_reference_frame <= kReferenceFrameIntra) continue;
+    for (int j = 0; j < 2; ++j) {
+      MotionVector candidate_mv = bp.mv.mv[i];
+      const ReferenceFrameType block_reference_frame =
+          block.bp->reference_frame[j];
+      if (candidate_reference_frame == block_reference_frame &&
+          ref_id_count[j] < 2) {
+        ref_id[j][ref_id_count[j]] = candidate_mv;
+        ++ref_id_count[j];
+      } else if (ref_diff_count[j] < 2) {
+        if (reference_frame_sign_bias[candidate_reference_frame] !=
+            reference_frame_sign_bias[block_reference_frame]) {
+          candidate_mv.mv[0] *= -1;
+          candidate_mv.mv[1] *= -1;
+        }
+        ref_diff[j][ref_diff_count[j]] = candidate_mv;
+        ++ref_diff_count[j];
+      }
+    }
+  }
+}
+
+// Part of 7.10.2.13.
+void AddExtraSingleMvCandidate(const Tile::Block& block, int mv_row,
+                               int mv_column, int* const num_mv_found) {
+  const auto& bp = block.tile.Parameters(mv_row, mv_column);
+  const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias =
+      block.tile.reference_frame_sign_bias();
+  const ReferenceFrameType block_reference_frame = block.bp->reference_frame[0];
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+  int num_found = *num_mv_found;
+  for (int i = 0; i < 2; ++i) {
+    const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
+    if (candidate_reference_frame <= kReferenceFrameIntra) continue;
+    MotionVector candidate_mv = bp.mv.mv[i];
+    if (reference_frame_sign_bias[candidate_reference_frame] !=
+        reference_frame_sign_bias[block_reference_frame]) {
+      candidate_mv.mv[0] *= -1;
+      candidate_mv.mv[1] *= -1;
+    }
+    assert(num_found <= 2);
+    if ((num_found != 0 && ref_mv_stack[0].mv32 == candidate_mv.mv32) ||
+        (num_found == 2 && ref_mv_stack[1].mv32 == candidate_mv.mv32)) {
+      continue;
+    }
+    ref_mv_stack[num_found] = candidate_mv;
+    prediction_parameters.SetWeightIndexStackEntry(num_found, 0);
+    ++num_found;
+  }
+  *num_mv_found = num_found;
+}
+
+// 7.10.2.12.
+void ExtraSearch(const Tile::Block& block, bool is_compound,
+                 int* const num_mv_found) {
+  const Tile& tile = block.tile;
+  const int num4x4 = std::min({static_cast<int>(block.width4x4),
+                               tile.frame_header().columns4x4 - block.column4x4,
+                               static_cast<int>(block.height4x4),
+                               tile.frame_header().rows4x4 - block.row4x4, 16});
+  int ref_id_count[2] = {};
+  MotionVector ref_id[2][2] = {};
+  int ref_diff_count[2] = {};
+  MotionVector ref_diff[2][2] = {};
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  for (int pass = 0; pass < 2 && *num_mv_found < 2; ++pass) {
+    for (int i = 0; i < num4x4;) {
+      const int mv_row = block.row4x4 + ((pass == 0) ? -1 : i);
+      const int mv_column = block.column4x4 + ((pass == 0) ? i : -1);
+      if (!tile.IsTopLeftInside(mv_row + 1, mv_column + 1)) break;
+      if (is_compound) {
+        AddExtraCompoundMvCandidate(block, mv_row, mv_column, ref_id_count,
+                                    ref_id, ref_diff_count, ref_diff);
+      } else {
+        AddExtraSingleMvCandidate(block, mv_row, mv_column, num_mv_found);
+        if (*num_mv_found >= 2) break;
+      }
+      const auto& bp = tile.Parameters(mv_row, mv_column);
+      i +=
+          (pass == 0) ? kNum4x4BlocksWide[bp.size] : kNum4x4BlocksHigh[bp.size];
+    }
+  }
+  if (is_compound) {
+    // Merge compound mode extra search into mv stack.
+    CompoundMotionVector* const compound_ref_mv_stack =
+        prediction_parameters.compound_ref_mv_stack;
+    CompoundMotionVector combined_mvs[2] = {};
+    for (int i = 0; i < 2; ++i) {
+      int count = 0;
+      assert(ref_id_count[i] <= 2);
+      for (int j = 0; j < ref_id_count[i]; ++j, ++count) {
+        combined_mvs[count].mv[i] = ref_id[i][j];
+      }
+      for (int j = 0; j < ref_diff_count[i] && count < 2; ++j, ++count) {
+        combined_mvs[count].mv[i] = ref_diff[i][j];
+      }
+      for (; count < 2; ++count) {
+        combined_mvs[count].mv[i] = prediction_parameters.global_mv[i];
+      }
+    }
+    if (*num_mv_found == 1) {
+      if (combined_mvs[0].mv64 == compound_ref_mv_stack[0].mv64) {
+        compound_ref_mv_stack[1].mv64 = combined_mvs[1].mv64;
+      } else {
+        compound_ref_mv_stack[1].mv64 = combined_mvs[0].mv64;
+      }
+      prediction_parameters.SetWeightIndexStackEntry(1, 0);
+    } else {
+      assert(*num_mv_found == 0);
+      for (int i = 0; i < 2; ++i) {
+        compound_ref_mv_stack[i].mv64 = combined_mvs[i].mv64;
+        prediction_parameters.SetWeightIndexStackEntry(i, 0);
+      }
+    }
+    *num_mv_found = 2;
+  } else {
+    // Single prediction mode.
+    MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+    for (int i = *num_mv_found; i < 2; ++i) {
+      ref_mv_stack[i] = prediction_parameters.global_mv[0];
+      prediction_parameters.SetWeightIndexStackEntry(i, 0);
+    }
+  }
+}
+
+void DescendingOrderTwo(int* const a, int* const b) {
+  if (*a < *b) {
+    std::swap(*a, *b);
+  }
+}
+
+// Comparator used for sorting the |weight_index_stack| entries (and thus the
+// candidate motion vectors they index) in descending order of their weights
+// (as specified in 7.10.2.11).
+bool CompareCandidateMotionVectors(const int16_t& lhs, const int16_t& rhs) {
+  return lhs > rhs;
+}
+
+void SortWeightIndexStack(const int size, const int sort_to_n,
+                          int16_t* const weight_index_stack) {
+  if (size <= 1) return;
+  if (size <= 3) {
+    // Specialize small sort sizes to speed up.
+    int weight_index_0 = weight_index_stack[0];
+    int weight_index_1 = weight_index_stack[1];
+    DescendingOrderTwo(&weight_index_0, &weight_index_1);
+    if (size == 3) {
+      int weight_index_2 = weight_index_stack[2];
+      DescendingOrderTwo(&weight_index_1, &weight_index_2);
+      DescendingOrderTwo(&weight_index_0, &weight_index_1);
+      weight_index_stack[2] = weight_index_2;
+    }
+    weight_index_stack[0] = weight_index_0;
+    weight_index_stack[1] = weight_index_1;
+    return;
+  }
+  if (sort_to_n == 1) {
+    // std::max_element() is not efficient. Find the max element in a loop.
+    int16_t max_element = weight_index_stack[0];
+    int i = 1;
+    do {
+      max_element = std::max(max_element, weight_index_stack[i]);
+    } while (++i < size);
+    weight_index_stack[0] = max_element;
+    return;
+  }
+  std::partial_sort(&weight_index_stack[0], &weight_index_stack[sort_to_n],
+                    &weight_index_stack[size], CompareCandidateMotionVectors);
+}
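+
+// A minimal sketch (illustration only, hypothetical weights) of the partial
+// sort above: only the first |sort_to_n| entries are guaranteed to end up in
+// descending order.
+inline void SortWeightIndexStackExample() {
+  int16_t weights[4] = {10, 40, 20, 30};
+  SortWeightIndexStack(/*size=*/4, /*sort_to_n=*/2, weights);
+  assert(weights[0] == 40 && weights[1] == 30);
+}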
+
+// 7.10.2.14 (part 2).
+void ComputeContexts(bool found_new_mv, int nearest_matches, int total_matches,
+                     int* new_mv_context, int* reference_mv_context) {
+  switch (nearest_matches) {
+    case 0:
+      *new_mv_context = std::min(total_matches, 1);
+      *reference_mv_context = total_matches;
+      break;
+    case 1:
+      *new_mv_context = 3 - static_cast<int>(found_new_mv);
+      *reference_mv_context = 2 + total_matches;
+      break;
+    default:
+      *new_mv_context = 5 - static_cast<int>(found_new_mv);
+      *reference_mv_context = 5;
+      break;
+  }
+}
+
+// 7.10.4.2.
+void AddSample(const Tile::Block& block, int delta_row, int delta_column,
+               int* const num_warp_samples, int* const num_samples_scanned,
+               int candidates[kMaxLeastSquaresSamples][4]) {
+  if (*num_samples_scanned >= kMaxLeastSquaresSamples) return;
+  const int mv_row = block.row4x4 + delta_row;
+  const int mv_column = block.column4x4 + delta_column;
+  const Tile& tile = block.tile;
+  if (!tile.IsInside(mv_row, mv_column) ||
+      !tile.HasParameters(mv_row, mv_column)) {
+    return;
+  }
+  const BlockParameters& bp = *block.bp;
+  const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column);
+  if (mv_bp.reference_frame[0] != bp.reference_frame[0] ||
+      mv_bp.reference_frame[1] != kReferenceFrameNone) {
+    return;
+  }
+  ++*num_samples_scanned;
+  const int candidate_height4x4 = kNum4x4BlocksHigh[mv_bp.size];
+  const int candidate_row = mv_row & ~(candidate_height4x4 - 1);
+  const int candidate_width4x4 = kNum4x4BlocksWide[mv_bp.size];
+  const int candidate_column = mv_column & ~(candidate_width4x4 - 1);
+  const BlockParameters& candidate_bp =
+      tile.Parameters(candidate_row, candidate_column);
+  const int mv_diff_row =
+      std::abs(candidate_bp.mv.mv[0].mv[0] - bp.mv.mv[0].mv[0]);
+  const int mv_diff_column =
+      std::abs(candidate_bp.mv.mv[0].mv[1] - bp.mv.mv[0].mv[1]);
+  const bool is_valid =
+      mv_diff_row + mv_diff_column <= kWarpValidThreshold[block.size];
+  if (!is_valid && *num_samples_scanned > 1) {
+    return;
+  }
+  const int mid_y =
+      MultiplyBy4(candidate_row) + MultiplyBy2(candidate_height4x4) - 1;
+  const int mid_x =
+      MultiplyBy4(candidate_column) + MultiplyBy2(candidate_width4x4) - 1;
+  candidates[*num_warp_samples][0] = MultiplyBy8(mid_y);
+  candidates[*num_warp_samples][1] = MultiplyBy8(mid_x);
+  candidates[*num_warp_samples][2] =
+      MultiplyBy8(mid_y) + candidate_bp.mv.mv[0].mv[0];
+  candidates[*num_warp_samples][3] =
+      MultiplyBy8(mid_x) + candidate_bp.mv.mv[0].mv[1];
+  if (is_valid) ++*num_warp_samples;
+}
+
+// 7.9.2.
+// In the spec, |dst_sign| is either 1 or -1. Here we set |dst_sign| to either 0
+// or -1 so that it can be XORed and subtracted directly in ApplySign() and
+// corresponding SIMD implementations.
+bool MotionFieldProjection(
+    const ObuFrameHeader& frame_header,
+    const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+        reference_frames,
+    ReferenceFrameType source, int reference_to_current_with_sign, int dst_sign,
+    int y8_start, int y8_end, int x8_start, int x8_end,
+    TemporalMotionField* const motion_field) {
+  const int source_index =
+      frame_header.reference_frame_index[source - kReferenceFrameLast];
+  auto* const source_frame = reference_frames[source_index].get();
+  assert(source_frame != nullptr);
+  assert(dst_sign == 0 || dst_sign == -1);
+  if (source_frame->rows4x4() != frame_header.rows4x4 ||
+      source_frame->columns4x4() != frame_header.columns4x4 ||
+      IsIntraFrame(source_frame->frame_type())) {
+    return false;
+  }
+  assert(reference_to_current_with_sign >= -kMaxFrameDistance);
+  if (reference_to_current_with_sign > kMaxFrameDistance) return true;
+  const ReferenceInfo& reference_info = *source_frame->reference_info();
+  const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+  dsp.motion_field_projection_kernel(
+      reference_info, reference_to_current_with_sign, dst_sign, y8_start,
+      y8_end, x8_start, x8_end, motion_field);
+  return true;
+}
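+
+// A small self-check (illustration only) of the |dst_sign| convention noted
+// above: with sign values 0 / -1, the XOR-and-subtract form used by
+// ApplySign() yields value and -value respectively, which avoids a multiply
+// in the SIMD paths.
+inline void DstSignExample() {
+  assert(ApplySign(5, 0) == 5);
+  assert(ApplySign(5, -1) == -5);
+}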
+
+}  // namespace
+
+void FindMvStack(const Tile::Block& block, bool is_compound,
+                 MvContexts* const contexts) {
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  SetupGlobalMv(block, 0, &prediction_parameters.global_mv[0]);
+  if (is_compound) SetupGlobalMv(block, 1, &prediction_parameters.global_mv[1]);
+  bool found_new_mv = false;
+  bool found_row_match = false;
+  int num_mv_found = 0;
+  ScanRow(block, block.column4x4, -1, is_compound, &found_new_mv,
+          &found_row_match, &num_mv_found);
+  bool found_column_match = false;
+  ScanColumn(block, block.row4x4, -1, is_compound, &found_new_mv,
+             &found_column_match, &num_mv_found);
+  if (std::max(block.width4x4, block.height4x4) <= 16) {
+    ScanPoint(block, -1, block.width4x4, is_compound, &found_new_mv,
+              &found_row_match, &num_mv_found);
+  }
+  const int nearest_matches =
+      static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+  prediction_parameters.nearest_mv_count = num_mv_found;
+  if (block.tile.frame_header().use_ref_frame_mvs) {
+    // Initialize to an invalid value; it is set during the temporal scan.
+    contexts->zero_mv = -1;
+    TemporalScan(block, is_compound, &contexts->zero_mv, &num_mv_found);
+  } else {
+    contexts->zero_mv = 0;
+  }
+  bool dummy_bool = false;
+  ScanPoint(block, -1, -1, is_compound, &dummy_bool, &found_row_match,
+            &num_mv_found);
+  static constexpr int deltas[2] = {-3, -5};
+  for (int i = 0; i < 2; ++i) {
+    if (i == 0 || block.height4x4 > 1) {
+      ScanRow(block, block.column4x4 | 1, deltas[i] + (block.row4x4 & 1),
+              is_compound, &dummy_bool, &found_row_match, &num_mv_found);
+    }
+    if (i == 0 || block.width4x4 > 1) {
+      ScanColumn(block, block.row4x4 | 1, deltas[i] + (block.column4x4 & 1),
+                 is_compound, &dummy_bool, &found_column_match, &num_mv_found);
+    }
+  }
+  if (num_mv_found < 2) {
+    ExtraSearch(block, is_compound, &num_mv_found);
+  } else {
+    // The sort of |weight_index_stack| could be moved to Tile::AssignIntraMv()
+    // and Tile::AssignInterMv(), doing only a partial sort up to the max index
+    // needed there. However, the speed gain would be trivial.
+    // For the intra case, only the first 1 or 2 mvs in the stack will be used.
+    // For the inter case, |prediction_parameters.ref_mv_index| is at most 3.
+    // So we only need to do the partial sort up to the first 4 mvs.
+    SortWeightIndexStack(prediction_parameters.nearest_mv_count, 4,
+                         prediction_parameters.weight_index_stack);
+    // When there are 4 or more nearest mvs, the other mvs will not be used.
+    if (prediction_parameters.nearest_mv_count < 4) {
+      SortWeightIndexStack(
+          num_mv_found - prediction_parameters.nearest_mv_count,
+          4 - prediction_parameters.nearest_mv_count,
+          prediction_parameters.weight_index_stack +
+              prediction_parameters.nearest_mv_count);
+    }
+  }
+  prediction_parameters.ref_mv_count = num_mv_found;
+  const int total_matches =
+      static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+  ComputeContexts(found_new_mv, nearest_matches, total_matches,
+                  &contexts->new_mv, &contexts->reference_mv);
+  // The mv stack clamping process is in Tile::AssignIntraMv() and
+  // Tile::AssignInterMv(), and only up to two mvs are clamped.
+}
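+
+// Typical call pattern (a hedged sketch; the real call sites are in the tile
+// decoding code and the compound condition below is illustrative):
+//   MvContexts contexts;
+//   const bool is_compound =
+//       block.bp->reference_frame[1] > kReferenceFrameIntra;
+//   FindMvStack(block, is_compound, &contexts);
+//   // The best candidates are then read from
+//   // block.bp->prediction_parameters->ref_mv_stack (or
+//   // compound_ref_mv_stack) in Tile::AssignIntraMv()/AssignInterMv().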
+
+void FindWarpSamples(const Tile::Block& block, int* const num_warp_samples,
+                     int* const num_samples_scanned,
+                     int candidates[kMaxLeastSquaresSamples][4]) {
+  const Tile& tile = block.tile;
+  bool top_left = true;
+  bool top_right = true;
+  int step = 1;
+  if (block.top_available[kPlaneY]) {
+    BlockSize source_size =
+        tile.Parameters(block.row4x4 - 1, block.column4x4).size;
+    const int source_width4x4 = kNum4x4BlocksWide[source_size];
+    if (block.width4x4 <= source_width4x4) {
+      // The & here is equivalent to % since source_width4x4 is a power of two.
+      const int column_offset = -(block.column4x4 & (source_width4x4 - 1));
+      if (column_offset < 0) top_left = false;
+      if (column_offset + source_width4x4 > block.width4x4) top_right = false;
+      AddSample(block, -1, 0, num_warp_samples, num_samples_scanned,
+                candidates);
+    } else {
+      for (int i = 0;
+           i < std::min(static_cast<int>(block.width4x4),
+                        tile.frame_header().columns4x4 - block.column4x4);
+           i += step) {
+        source_size =
+            tile.Parameters(block.row4x4 - 1, block.column4x4 + i).size;
+        step = std::min(static_cast<int>(block.width4x4),
+                        static_cast<int>(kNum4x4BlocksWide[source_size]));
+        AddSample(block, -1, i, num_warp_samples, num_samples_scanned,
+                  candidates);
+      }
+    }
+  }
+  if (block.left_available[kPlaneY]) {
+    BlockSize source_size =
+        tile.Parameters(block.row4x4, block.column4x4 - 1).size;
+    const int source_height4x4 = kNum4x4BlocksHigh[source_size];
+    if (block.height4x4 <= source_height4x4) {
+      const int row_offset = -(block.row4x4 & (source_height4x4 - 1));
+      if (row_offset < 0) top_left = false;
+      AddSample(block, 0, -1, num_warp_samples, num_samples_scanned,
+                candidates);
+    } else {
+      for (int i = 0; i < std::min(static_cast<int>(block.height4x4),
+                                   tile.frame_header().rows4x4 - block.row4x4);
+           i += step) {
+        source_size =
+            tile.Parameters(block.row4x4 + i, block.column4x4 - 1).size;
+        step = std::min(static_cast<int>(block.height4x4),
+                        static_cast<int>(kNum4x4BlocksHigh[source_size]));
+        AddSample(block, i, -1, num_warp_samples, num_samples_scanned,
+                  candidates);
+      }
+    }
+  }
+  if (top_left) {
+    AddSample(block, -1, -1, num_warp_samples, num_samples_scanned, candidates);
+  }
+  if (top_right && block.size <= kBlock64x64) {
+    AddSample(block, -1, block.width4x4, num_warp_samples, num_samples_scanned,
+              candidates);
+  }
+  if (*num_warp_samples == 0 && *num_samples_scanned > 0) *num_warp_samples = 1;
+}
+
+void SetupMotionField(
+    const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+    const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+        reference_frames,
+    int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+    TemporalMotionField* const motion_field) {
+  assert(frame_header.use_ref_frame_mvs);
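+  // The temporal motion field is stored at 8x8 granularity, so convert the
+  // 4x4 block coordinates into 8x8 units.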
+  const int y8_start = DivideBy2(row4x4_start);
+  const int y8_end = DivideBy2(std::min(row4x4_end, frame_header.rows4x4));
+  const int x8_start = DivideBy2(column4x4_start);
+  const int x8_end =
+      DivideBy2(std::min(column4x4_end, frame_header.columns4x4));
+  const int last_index = frame_header.reference_frame_index[0];
+  const ReferenceInfo& reference_info = *current_frame.reference_info();
+  if (!IsIntraFrame(reference_frames[last_index]->frame_type())) {
+    const int last_alternate_order_hint =
+        reference_frames[last_index]
+            ->reference_info()
+            ->order_hint[kReferenceFrameAlternate];
+    const int current_gold_order_hint =
+        reference_info.order_hint[kReferenceFrameGolden];
+    if (last_alternate_order_hint != current_gold_order_hint) {
+      const int reference_offset_last =
+          -reference_info.relative_distance_from[kReferenceFrameLast];
+      if (std::abs(reference_offset_last) <= kMaxFrameDistance) {
+        MotionFieldProjection(frame_header, reference_frames,
+                              kReferenceFrameLast, reference_offset_last, -1,
+                              y8_start, y8_end, x8_start, x8_end, motion_field);
+      }
+    }
+  }
+  int ref_stamp = 1;
+  const int reference_offset_backward =
+      reference_info.relative_distance_from[kReferenceFrameBackward];
+  if (reference_offset_backward > 0 &&
+      MotionFieldProjection(frame_header, reference_frames,
+                            kReferenceFrameBackward, reference_offset_backward,
+                            0, y8_start, y8_end, x8_start, x8_end,
+                            motion_field)) {
+    --ref_stamp;
+  }
+  const int reference_offset_alternate2 =
+      reference_info.relative_distance_from[kReferenceFrameAlternate2];
+  if (reference_offset_alternate2 > 0 &&
+      MotionFieldProjection(frame_header, reference_frames,
+                            kReferenceFrameAlternate2,
+                            reference_offset_alternate2, 0, y8_start, y8_end,
+                            x8_start, x8_end, motion_field)) {
+    --ref_stamp;
+  }
+  if (ref_stamp >= 0) {
+    const int reference_offset_alternate =
+        reference_info.relative_distance_from[kReferenceFrameAlternate];
+    if (reference_offset_alternate > 0 &&
+        MotionFieldProjection(frame_header, reference_frames,
+                              kReferenceFrameAlternate,
+                              reference_offset_alternate, 0, y8_start, y8_end,
+                              x8_start, x8_end, motion_field)) {
+      --ref_stamp;
+    }
+  }
+  if (ref_stamp >= 0) {
+    const int reference_offset_last2 =
+        -reference_info.relative_distance_from[kReferenceFrameLast2];
+    if (std::abs(reference_offset_last2) <= kMaxFrameDistance) {
+      MotionFieldProjection(frame_header, reference_frames,
+                            kReferenceFrameLast2, reference_offset_last2, -1,
+                            y8_start, y8_end, x8_start, x8_end, motion_field);
+    }
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/motion_vector.h b/src/motion_vector.h
new file mode 100644 (file)
index 0000000..68d14fe
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_MOTION_VECTOR_H_
+#define LIBGAV1_SRC_MOTION_VECTOR_H_
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/obu_parser.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr bool IsGlobalMvBlock(const BlockParameters& bp,
+                               GlobalMotionTransformationType type) {
+  return (bp.y_mode == kPredictionModeGlobalMv ||
+          bp.y_mode == kPredictionModeGlobalGlobalMv) &&
+         !IsBlockDimension4(bp.size) &&
+         type > kGlobalMotionTransformationTypeTranslation;
+}
+
+// The |contexts| output parameter may be nullptr; pass nullptr if the
+// |contexts| output is not needed.
+void FindMvStack(const Tile::Block& block, bool is_compound,
+                 MvContexts* contexts);  // 7.10.2
+
+void FindWarpSamples(const Tile::Block& block, int* num_warp_samples,
+                     int* num_samples_scanned,
+                     int candidates[kMaxLeastSquaresSamples][4]);  // 7.10.4
+
+// Section 7.9.1 in the spec, but done per tile instead of for the whole
+// frame.
+void SetupMotionField(
+    const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+    const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+        reference_frames,
+    int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+    TemporalMotionField* motion_field);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_MOTION_VECTOR_H_
diff --git a/src/obu_parser.cc b/src/obu_parser.cc
new file mode 100644 (file)
index 0000000..d1815ed
--- /dev/null
@@ -0,0 +1,3021 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/obu_parser.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/motion_vector.h"
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// 5.9.16.
+// Find the smallest value of k such that block_size << k is greater than or
+// equal to target.
+//
+// NOTE: TileLog2(block_size, target) is equal to
+//   CeilLog2(ceil((double)target / block_size))
+// where the division is a floating-point number division. (This equality holds
+// even when |target| is equal to 0.) In the special case of block_size == 1,
+// TileLog2(1, target) is equal to CeilLog2(target).
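+//
+// For example, TileLog2(64, 257) is 3: 64 << 2 == 256 is less than 257, while
+// 64 << 3 == 512 is greater than or equal to 257.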
+int TileLog2(int block_size, int target) {
+  int k = 0;
+  for (; (block_size << k) < target; ++k) {
+  }
+  return k;
+}
+
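+// |level_bits| is the 5-bit seq_level_idx syntax element: the upper three
+// bits select the major level (offset by kMinimumMajorBitstreamLevel) and the
+// lower two bits select the minor level.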
+void ParseBitStreamLevel(BitStreamLevel* const level, uint8_t level_bits) {
+  level->major = kMinimumMajorBitstreamLevel + (level_bits >> 2);
+  level->minor = level_bits & 3;
+}
+
+// This function assumes loop_filter is zero-initialized, so it only needs to
+// set the nonzero default values.
+void SetDefaultRefDeltas(LoopFilter* const loop_filter) {
+  loop_filter->ref_deltas[kReferenceFrameIntra] = 1;
+  loop_filter->ref_deltas[kReferenceFrameGolden] = -1;
+  loop_filter->ref_deltas[kReferenceFrameAlternate] = -1;
+  loop_filter->ref_deltas[kReferenceFrameAlternate2] = -1;
+}
+
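+// In operating_point_idc, the low 8 bits indicate which temporal layers
+// belong to the operating point, and bits 8..11 indicate which spatial layers
+// belong to it.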
+bool InTemporalLayer(int operating_point_idc, int temporal_id) {
+  return ((operating_point_idc >> temporal_id) & 1) != 0;
+}
+
+bool InSpatialLayer(int operating_point_idc, int spatial_id) {
+  return ((operating_point_idc >> (spatial_id + 8)) & 1) != 0;
+}
+
+// Returns the index of the last nonzero byte in the |data| buffer of |size|
+// bytes. If there is no nonzero byte in the |data| buffer, returns -1.
+int GetLastNonzeroByteIndex(const uint8_t* data, size_t size) {
+  // Scan backward for a nonzero byte.
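+  // Buffers larger than INT_MAX bytes are not supported and are treated as
+  // all-zero.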
+  if (size > INT_MAX) return -1;
+  int i = static_cast<int>(size) - 1;
+  while (i >= 0 && data[i] == 0) {
+    --i;
+  }
+  return i;
+}
+
+// A cleanup helper class that releases the frame buffer reference held in
+// |frame| in the destructor.
+class RefCountedBufferPtrCleanup {
+ public:
+  explicit RefCountedBufferPtrCleanup(RefCountedBufferPtr* frame)
+      : frame_(*frame) {}
+
+  // Not copyable or movable.
+  RefCountedBufferPtrCleanup(const RefCountedBufferPtrCleanup&) = delete;
+  RefCountedBufferPtrCleanup& operator=(const RefCountedBufferPtrCleanup&) =
+      delete;
+
+  ~RefCountedBufferPtrCleanup() { frame_ = nullptr; }
+
+ private:
+  RefCountedBufferPtr& frame_;
+};
+
+}  // namespace
+
+bool ObuSequenceHeader::ParametersChanged(const ObuSequenceHeader& old) const {
+  // Note that the operating_parameters field is not compared per Section 7.5:
+  //   Within a particular coded video sequence, the contents of
+  //   sequence_header_obu must be bit-identical each time the sequence header
+  //   appears except for the contents of operating_parameters_info.
+  return memcmp(this, &old,
+                offsetof(ObuSequenceHeader, operating_parameters)) != 0;
+}
+
+// Macros to avoid repeated error checks in the parser code.
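+// Each OBU_READ_BIT_OR_FAIL and OBU_READ_LITERAL_OR_FAIL invocation reads
+// into a local int64_t |scratch| declared by the caller; all of the macros
+// return false from the enclosing function when the bit reader runs out of
+// bits.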
+#define OBU_LOG_AND_RETURN_FALSE                                            \
+  do {                                                                      \
+    LIBGAV1_DLOG(ERROR, "%s:%d (%s): Not enough bits.", __FILE__, __LINE__, \
+                 __func__);                                                 \
+    return false;                                                           \
+  } while (false)
+#define OBU_PARSER_FAIL         \
+  do {                          \
+    if (scratch == -1) {        \
+      OBU_LOG_AND_RETURN_FALSE; \
+    }                           \
+  } while (false)
+#define OBU_READ_BIT_OR_FAIL        \
+  scratch = bit_reader_->ReadBit(); \
+  OBU_PARSER_FAIL
+#define OBU_READ_LITERAL_OR_FAIL(n)      \
+  scratch = bit_reader_->ReadLiteral(n); \
+  OBU_PARSER_FAIL
+#define OBU_READ_UVLC_OR_FAIL(x)        \
+  do {                                  \
+    if (!bit_reader_->ReadUvlc(&(x))) { \
+      OBU_LOG_AND_RETURN_FALSE;         \
+    }                                   \
+  } while (false)
+
+bool ObuParser::ParseColorConfig(ObuSequenceHeader* sequence_header) {
+  int64_t scratch;
+  ColorConfig* const color_config = &sequence_header->color_config;
+  OBU_READ_BIT_OR_FAIL;
+  const bool high_bitdepth = scratch != 0;
+  if (sequence_header->profile == kProfile2 && high_bitdepth) {
+    OBU_READ_BIT_OR_FAIL;
+    const bool is_twelve_bit = scratch != 0;
+    color_config->bitdepth = is_twelve_bit ? 12 : 10;
+  } else {
+    color_config->bitdepth = high_bitdepth ? 10 : 8;
+  }
+  if (sequence_header->profile == kProfile1) {
+    color_config->is_monochrome = false;
+  } else {
+    OBU_READ_BIT_OR_FAIL;
+    color_config->is_monochrome = scratch != 0;
+  }
+  OBU_READ_BIT_OR_FAIL;
+  const bool color_description_present_flag = scratch != 0;
+  if (color_description_present_flag) {
+    OBU_READ_LITERAL_OR_FAIL(8);
+    color_config->color_primary = static_cast<ColorPrimary>(scratch);
+    OBU_READ_LITERAL_OR_FAIL(8);
+    color_config->transfer_characteristics =
+        static_cast<TransferCharacteristics>(scratch);
+    OBU_READ_LITERAL_OR_FAIL(8);
+    color_config->matrix_coefficients =
+        static_cast<MatrixCoefficients>(scratch);
+  } else {
+    color_config->color_primary = kColorPrimaryUnspecified;
+    color_config->transfer_characteristics =
+        kTransferCharacteristicsUnspecified;
+    color_config->matrix_coefficients = kMatrixCoefficientsUnspecified;
+  }
+  if (color_config->is_monochrome) {
+    OBU_READ_BIT_OR_FAIL;
+    color_config->color_range = static_cast<ColorRange>(scratch);
+    // Set subsampling_x and subsampling_y to 1 for monochrome. This makes it
+    // easy to support monochrome in profile 0, which requires subsampling_x
+    // and subsampling_y to be 1.
+    color_config->subsampling_x = 1;
+    color_config->subsampling_y = 1;
+    color_config->chroma_sample_position = kChromaSamplePositionUnknown;
+  } else {
+    if (color_config->color_primary == kColorPrimaryBt709 &&
+        color_config->transfer_characteristics ==
+            kTransferCharacteristicsSrgb &&
+        color_config->matrix_coefficients == kMatrixCoefficientsIdentity) {
+      color_config->color_range = kColorRangeFull;
+      color_config->subsampling_x = 0;
+      color_config->subsampling_y = 0;
+      // YUV 4:4:4 is only allowed in profile 1, or profile 2 with bit depth 12.
+      // See the table at the beginning of Section 6.4.1.
+      if (sequence_header->profile != kProfile1 &&
+          (sequence_header->profile != kProfile2 ||
+           color_config->bitdepth != 12)) {
+        LIBGAV1_DLOG(ERROR,
+                     "YUV 4:4:4 is not allowed in profile %d for bitdepth %d.",
+                     sequence_header->profile, color_config->bitdepth);
+        return false;
+      }
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      color_config->color_range = static_cast<ColorRange>(scratch);
+      if (sequence_header->profile == kProfile0) {
+        color_config->subsampling_x = 1;
+        color_config->subsampling_y = 1;
+      } else if (sequence_header->profile == kProfile1) {
+        color_config->subsampling_x = 0;
+        color_config->subsampling_y = 0;
+      } else {
+        if (color_config->bitdepth == 12) {
+          OBU_READ_BIT_OR_FAIL;
+          color_config->subsampling_x = scratch;
+          if (color_config->subsampling_x == 1) {
+            OBU_READ_BIT_OR_FAIL;
+            color_config->subsampling_y = scratch;
+          } else {
+            color_config->subsampling_y = 0;
+          }
+        } else {
+          color_config->subsampling_x = 1;
+          color_config->subsampling_y = 0;
+        }
+      }
+      if (color_config->subsampling_x == 1 &&
+          color_config->subsampling_y == 1) {
+        OBU_READ_LITERAL_OR_FAIL(2);
+        color_config->chroma_sample_position =
+            static_cast<ChromaSamplePosition>(scratch);
+      }
+    }
+    OBU_READ_BIT_OR_FAIL;
+    color_config->separate_uv_delta_q = scratch != 0;
+  }
+  if (color_config->matrix_coefficients == kMatrixCoefficientsIdentity &&
+      (color_config->subsampling_x != 0 || color_config->subsampling_y != 0)) {
+    LIBGAV1_DLOG(ERROR,
+                 "matrix_coefficients is MC_IDENTITY, but subsampling_x (%d) "
+                 "and subsampling_y (%d) are not both 0.",
+                 color_config->subsampling_x, color_config->subsampling_y);
+    return false;
+  }
+  return true;
+}
+
+bool ObuParser::ParseTimingInfo(ObuSequenceHeader* sequence_header) {
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header->timing_info_present_flag = scratch != 0;
+  if (!sequence_header->timing_info_present_flag) return true;
+  TimingInfo* const info = &sequence_header->timing_info;
+  OBU_READ_LITERAL_OR_FAIL(32);
+  info->num_units_in_tick = static_cast<uint32_t>(scratch);
+  if (info->num_units_in_tick == 0) {
+    LIBGAV1_DLOG(ERROR, "num_units_in_tick is 0.");
+    return false;
+  }
+  OBU_READ_LITERAL_OR_FAIL(32);
+  info->time_scale = static_cast<uint32_t>(scratch);
+  if (info->time_scale == 0) {
+    LIBGAV1_DLOG(ERROR, "time_scale is 0.");
+    return false;
+  }
+  OBU_READ_BIT_OR_FAIL;
+  info->equal_picture_interval = scratch != 0;
+  if (info->equal_picture_interval) {
+    OBU_READ_UVLC_OR_FAIL(info->num_ticks_per_picture);
+    ++info->num_ticks_per_picture;
+  }
+  return true;
+}
+
+bool ObuParser::ParseDecoderModelInfo(ObuSequenceHeader* sequence_header) {
+  if (!sequence_header->timing_info_present_flag) return true;
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header->decoder_model_info_present_flag = scratch != 0;
+  if (!sequence_header->decoder_model_info_present_flag) return true;
+  DecoderModelInfo* const info = &sequence_header->decoder_model_info;
+  OBU_READ_LITERAL_OR_FAIL(5);
+  info->encoder_decoder_buffer_delay_length = 1 + scratch;
+  OBU_READ_LITERAL_OR_FAIL(32);
+  info->num_units_in_decoding_tick = static_cast<uint32_t>(scratch);
+  OBU_READ_LITERAL_OR_FAIL(5);
+  info->buffer_removal_time_length = 1 + scratch;
+  OBU_READ_LITERAL_OR_FAIL(5);
+  info->frame_presentation_time_length = 1 + scratch;
+  return true;
+}
+
+bool ObuParser::ParseOperatingParameters(ObuSequenceHeader* sequence_header,
+                                         int index) {
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header->decoder_model_present_for_operating_point[index] =
+      scratch != 0;
+  if (!sequence_header->decoder_model_present_for_operating_point[index]) {
+    return true;
+  }
+  OperatingParameters* const params = &sequence_header->operating_parameters;
+  OBU_READ_LITERAL_OR_FAIL(
+      sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length);
+  params->decoder_buffer_delay[index] = static_cast<uint32_t>(scratch);
+  OBU_READ_LITERAL_OR_FAIL(
+      sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length);
+  params->encoder_buffer_delay[index] = static_cast<uint32_t>(scratch);
+  OBU_READ_BIT_OR_FAIL;
+  params->low_delay_mode_flag[index] = scratch != 0;
+  return true;
+}
+
+bool ObuParser::ParseSequenceHeader(bool seen_frame_header) {
+  ObuSequenceHeader sequence_header = {};
+  int64_t scratch;
+  OBU_READ_LITERAL_OR_FAIL(3);
+  if (scratch >= kMaxProfiles) {
+    LIBGAV1_DLOG(ERROR, "Invalid profile: %d.", static_cast<int>(scratch));
+    return false;
+  }
+  sequence_header.profile = static_cast<BitstreamProfile>(scratch);
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header.still_picture = scratch != 0;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header.reduced_still_picture_header = scratch != 0;
+  if (sequence_header.reduced_still_picture_header) {
+    if (!sequence_header.still_picture) {
+      LIBGAV1_DLOG(
+          ERROR, "reduced_still_picture_header is 1, but still_picture is 0.");
+      return false;
+    }
+    sequence_header.operating_points = 1;
+    sequence_header.operating_point_idc[0] = 0;
+    OBU_READ_LITERAL_OR_FAIL(5);
+    ParseBitStreamLevel(&sequence_header.level[0], scratch);
+  } else {
+    if (!ParseTimingInfo(&sequence_header) ||
+        !ParseDecoderModelInfo(&sequence_header)) {
+      return false;
+    }
+    OBU_READ_BIT_OR_FAIL;
+    const bool initial_display_delay_present_flag = scratch != 0;
+    OBU_READ_LITERAL_OR_FAIL(5);
+    sequence_header.operating_points = static_cast<int>(1 + scratch);
+    if (operating_point_ >= sequence_header.operating_points) {
+      LIBGAV1_DLOG(
+          ERROR,
+          "Invalid operating point: %d (valid range is [0,%d] inclusive).",
+          operating_point_, sequence_header.operating_points - 1);
+      return false;
+    }
+    for (int i = 0; i < sequence_header.operating_points; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(12);
+      sequence_header.operating_point_idc[i] = static_cast<int>(scratch);
+      for (int j = 0; j < i; ++j) {
+        if (sequence_header.operating_point_idc[i] ==
+            sequence_header.operating_point_idc[j]) {
+          LIBGAV1_DLOG(ERROR,
+                       "operating_point_idc[%d] (%d) is equal to "
+                       "operating_point_idc[%d] (%d).",
+                       i, sequence_header.operating_point_idc[i], j,
+                       sequence_header.operating_point_idc[j]);
+          return false;
+        }
+      }
+      OBU_READ_LITERAL_OR_FAIL(5);
+      ParseBitStreamLevel(&sequence_header.level[i], scratch);
+      if (sequence_header.level[i].major > 3) {
+        OBU_READ_BIT_OR_FAIL;
+        sequence_header.tier[i] = scratch;
+      }
+      if (sequence_header.decoder_model_info_present_flag &&
+          !ParseOperatingParameters(&sequence_header, i)) {
+        return false;
+      }
+      if (initial_display_delay_present_flag) {
+        OBU_READ_BIT_OR_FAIL;
+        if (scratch != 0) {
+          OBU_READ_LITERAL_OR_FAIL(4);
+          sequence_header.initial_display_delay[i] = 1 + scratch;
+        }
+      }
+    }
+  }
+  OBU_READ_LITERAL_OR_FAIL(4);
+  sequence_header.frame_width_bits = 1 + scratch;
+  OBU_READ_LITERAL_OR_FAIL(4);
+  sequence_header.frame_height_bits = 1 + scratch;
+  OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_width_bits);
+  sequence_header.max_frame_width = static_cast<int32_t>(1 + scratch);
+  OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_height_bits);
+  sequence_header.max_frame_height = static_cast<int32_t>(1 + scratch);
+  if (!sequence_header.reduced_still_picture_header) {
+    OBU_READ_BIT_OR_FAIL;
+    sequence_header.frame_id_numbers_present = scratch != 0;
+  }
+  if (sequence_header.frame_id_numbers_present) {
+    OBU_READ_LITERAL_OR_FAIL(4);
+    sequence_header.delta_frame_id_length_bits = 2 + scratch;
+    OBU_READ_LITERAL_OR_FAIL(3);
+    sequence_header.frame_id_length_bits =
+        sequence_header.delta_frame_id_length_bits + 1 + scratch;
+    // Section 6.8.2: It is a requirement of bitstream conformance that the
+    // number of bits needed to read display_frame_id does not exceed 16. This
+    // is equivalent to the constraint that idLen <= 16.
+    if (sequence_header.frame_id_length_bits > 16) {
+      LIBGAV1_DLOG(ERROR, "Invalid frame_id_length_bits: %d.",
+                   sequence_header.frame_id_length_bits);
+      return false;
+    }
+  }
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header.use_128x128_superblock = scratch != 0;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header.enable_filter_intra = scratch != 0;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header.enable_intra_edge_filter = scratch != 0;
+  if (sequence_header.reduced_still_picture_header) {
+    sequence_header.force_screen_content_tools = kSelectScreenContentTools;
+    sequence_header.force_integer_mv = kSelectIntegerMv;
+  } else {
+    OBU_READ_BIT_OR_FAIL;
+    sequence_header.enable_interintra_compound = scratch != 0;
+    OBU_READ_BIT_OR_FAIL;
+    sequence_header.enable_masked_compound = scratch != 0;
+    OBU_READ_BIT_OR_FAIL;
+    sequence_header.enable_warped_motion = scratch != 0;
+    OBU_READ_BIT_OR_FAIL;
+    sequence_header.enable_dual_filter = scratch != 0;
+    OBU_READ_BIT_OR_FAIL;
+    sequence_header.enable_order_hint = scratch != 0;
+    if (sequence_header.enable_order_hint) {
+      OBU_READ_BIT_OR_FAIL;
+      sequence_header.enable_jnt_comp = scratch != 0;
+      OBU_READ_BIT_OR_FAIL;
+      sequence_header.enable_ref_frame_mvs = scratch != 0;
+    }
+    OBU_READ_BIT_OR_FAIL;
+    sequence_header.choose_screen_content_tools = scratch != 0;
+    if (sequence_header.choose_screen_content_tools) {
+      sequence_header.force_screen_content_tools = kSelectScreenContentTools;
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      sequence_header.force_screen_content_tools = scratch;
+    }
+    if (sequence_header.force_screen_content_tools > 0) {
+      OBU_READ_BIT_OR_FAIL;
+      sequence_header.choose_integer_mv = scratch != 0;
+      if (sequence_header.choose_integer_mv) {
+        sequence_header.force_integer_mv = kSelectIntegerMv;
+      } else {
+        OBU_READ_BIT_OR_FAIL;
+        sequence_header.force_integer_mv = scratch;
+      }
+    } else {
+      sequence_header.force_integer_mv = kSelectIntegerMv;
+    }
+    if (sequence_header.enable_order_hint) {
+      OBU_READ_LITERAL_OR_FAIL(3);
+      sequence_header.order_hint_bits = 1 + scratch;
+      sequence_header.order_hint_shift_bits =
+          Mod32(32 - sequence_header.order_hint_bits);
+    }
+  }
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header.enable_superres = scratch != 0;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header.enable_cdef = scratch != 0;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header.enable_restoration = scratch != 0;
+  if (!ParseColorConfig(&sequence_header)) return false;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header.film_grain_params_present = scratch != 0;
+  // Compare new sequence header with old sequence header.
+  if (has_sequence_header_ &&
+      sequence_header.ParametersChanged(sequence_header_)) {
+    // Between the frame header OBU and the last tile group OBU of the frame,
+    // do not allow the sequence header to change.
+    if (seen_frame_header) {
+      LIBGAV1_DLOG(ERROR, "Sequence header changed in the middle of a frame.");
+      return false;
+    }
+    sequence_header_changed_ = true;
+    decoder_state_.ClearReferenceFrames();
+  }
+  sequence_header_ = sequence_header;
+  if (!has_sequence_header_) {
+    sequence_header_changed_ = true;
+  }
+  has_sequence_header_ = true;
+  // Section 6.4.1: It is a requirement of bitstream conformance that if
+  // OperatingPointIdc is equal to 0, then obu_extension_flag is equal to 0 for
+  // all OBUs that follow this sequence header until the next sequence header.
+  extension_disallowed_ =
+      (sequence_header_.operating_point_idc[operating_point_] == 0);
+  return true;
+}
+
+// Marks reference frames as invalid when they are too far in the past to be
+// referenced via the frame id mechanism.
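+//
+// For example, with frame_id_length_bits == 8 (frame ids are modulo 256) and
+// delta_frame_id_length_bits == 4, a current_frame_id of 5 wraps lower_bound
+// around to 245: ids in the open interval (5, 245) are marked invalid, while
+// 245..255 and 0..5 remain referenceable.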
+void ObuParser::MarkInvalidReferenceFrames() {
+  // The current lower bound of the frame ids for reference frames.
+  int lower_bound = decoder_state_.current_frame_id -
+                    (1 << sequence_header_.delta_frame_id_length_bits);
+  // True if lower_bound is smaller than current_frame_id. False if lower_bound
+  // wraps around (in modular arithmetic) to the other side of current_frame_id.
+  bool lower_bound_is_smaller = true;
+  if (lower_bound <= 0) {
+    lower_bound += 1 << sequence_header_.frame_id_length_bits;
+    lower_bound_is_smaller = false;
+  }
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const uint16_t reference_frame_id = decoder_state_.reference_frame_id[i];
+    if (lower_bound_is_smaller) {
+      if (reference_frame_id > decoder_state_.current_frame_id ||
+          reference_frame_id < lower_bound) {
+        decoder_state_.reference_frame[i] = nullptr;
+      }
+    } else {
+      if (reference_frame_id > decoder_state_.current_frame_id &&
+          reference_frame_id < lower_bound) {
+        decoder_state_.reference_frame[i] = nullptr;
+      }
+    }
+  }
+}
+
+bool ObuParser::ParseFrameSizeAndRenderSize() {
+  int64_t scratch;
+  // Frame Size.
+  if (frame_header_.frame_size_override_flag) {
+    OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_width_bits);
+    frame_header_.width = static_cast<int32_t>(1 + scratch);
+    OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_height_bits);
+    frame_header_.height = static_cast<int32_t>(1 + scratch);
+    if (frame_header_.width > sequence_header_.max_frame_width ||
+        frame_header_.height > sequence_header_.max_frame_height) {
+      LIBGAV1_DLOG(ERROR,
+                   "Frame dimensions are larger than the maximum values.");
+      return false;
+    }
+  } else {
+    frame_header_.width = sequence_header_.max_frame_width;
+    frame_header_.height = sequence_header_.max_frame_height;
+  }
+  if (!ParseSuperResParametersAndComputeImageSize()) return false;
+
+  // Render Size.
+  OBU_READ_BIT_OR_FAIL;
+  frame_header_.render_and_frame_size_different = scratch != 0;
+  if (frame_header_.render_and_frame_size_different) {
+    OBU_READ_LITERAL_OR_FAIL(16);
+    frame_header_.render_width = static_cast<int32_t>(1 + scratch);
+    OBU_READ_LITERAL_OR_FAIL(16);
+    frame_header_.render_height = static_cast<int32_t>(1 + scratch);
+  } else {
+    frame_header_.render_width = frame_header_.upscaled_width;
+    frame_header_.render_height = frame_header_.height;
+  }
+
+  return true;
+}
+
+bool ObuParser::ParseSuperResParametersAndComputeImageSize() {
+  int64_t scratch;
+  // SuperRes.
+  frame_header_.upscaled_width = frame_header_.width;
+  frame_header_.use_superres = false;
+  if (sequence_header_.enable_superres) {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.use_superres = scratch != 0;
+  }
+  if (frame_header_.use_superres) {
+    OBU_READ_LITERAL_OR_FAIL(3);
+    // 9 is the smallest value for the denominator.
+    frame_header_.superres_scale_denominator = scratch + 9;
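+    // The coded (downscaled) width is
+    // upscaled_width * kSuperResScaleNumerator / superres_scale_denominator,
+    // rounded to the nearest integer.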
+    frame_header_.width =
+        (frame_header_.upscaled_width * kSuperResScaleNumerator +
+         (frame_header_.superres_scale_denominator / 2)) /
+        frame_header_.superres_scale_denominator;
+  } else {
+    frame_header_.superres_scale_denominator = kSuperResScaleNumerator;
+  }
+  assert(frame_header_.width != 0);
+  assert(frame_header_.height != 0);
+  // Check if multiplying upscaled_width by height would overflow.
+  assert(frame_header_.upscaled_width >= frame_header_.width);
+  if (frame_header_.upscaled_width > INT32_MAX / frame_header_.height) {
+    LIBGAV1_DLOG(ERROR, "Frame dimensions too big: width=%d height=%d.",
+                 frame_header_.width, frame_header_.height);
+    return false;
+  }
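+  // The frame dimensions in 4x4 block units, rounded up to a whole 8x8 block,
+  // so both counts are even.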
+  frame_header_.columns4x4 = ((frame_header_.width + 7) >> 3) << 1;
+  frame_header_.rows4x4 = ((frame_header_.height + 7) >> 3) << 1;
+  return true;
+}
+
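+// Enforces the bitstream conformance requirement that an inter frame be at
+// least half and at most 16 times the size of each of its reference frames in
+// each dimension.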
+bool ObuParser::ValidateInterFrameSize() const {
+  for (int index : frame_header_.reference_frame_index) {
+    const RefCountedBuffer* reference_frame =
+        decoder_state_.reference_frame[index].get();
+    if (2 * frame_header_.width < reference_frame->upscaled_width() ||
+        2 * frame_header_.height < reference_frame->frame_height() ||
+        frame_header_.width > 16 * reference_frame->upscaled_width() ||
+        frame_header_.height > 16 * reference_frame->frame_height()) {
+      LIBGAV1_DLOG(ERROR,
+                   "Invalid inter frame size: width=%d, height=%d. Reference "
+                   "frame: index=%d, upscaled width=%d, height=%d.",
+                   frame_header_.width, frame_header_.height, index,
+                   reference_frame->upscaled_width(),
+                   reference_frame->frame_height());
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ObuParser::ParseReferenceOrderHint() {
+  if (!frame_header_.error_resilient_mode ||
+      !sequence_header_.enable_order_hint) {
+    return true;
+  }
+  int64_t scratch;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits);
+    frame_header_.reference_order_hint[i] = scratch;
+    if (frame_header_.reference_order_hint[i] !=
+        decoder_state_.reference_order_hint[i]) {
+      decoder_state_.reference_frame[i] = nullptr;
+    }
+  }
+  return true;
+}
+
+// static
+int ObuParser::FindLatestBackwardReference(
+    const int current_frame_hint,
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+    const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+  int ref = -1;
+  int latest_order_hint = INT_MIN;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (!used_frame[i] && hint >= current_frame_hint &&
+        hint >= latest_order_hint) {
+      ref = i;
+      latest_order_hint = hint;
+    }
+  }
+  return ref;
+}
+
+// static
+int ObuParser::FindEarliestBackwardReference(
+    const int current_frame_hint,
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+    const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+  int ref = -1;
+  int earliest_order_hint = INT_MAX;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (!used_frame[i] && hint >= current_frame_hint &&
+        hint < earliest_order_hint) {
+      ref = i;
+      earliest_order_hint = hint;
+    }
+  }
+  return ref;
+}
+
+// static
+int ObuParser::FindLatestForwardReference(
+    const int current_frame_hint,
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+    const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+  int ref = -1;
+  int latest_order_hint = INT_MIN;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (!used_frame[i] && hint < current_frame_hint &&
+        hint >= latest_order_hint) {
+      ref = i;
+      latest_order_hint = hint;
+    }
+  }
+  return ref;
+}
+
+// static
+int ObuParser::FindReferenceWithSmallestOutputOrder(
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints) {
+  int ref = -1;
+  int earliest_order_hint = INT_MAX;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (hint < earliest_order_hint) {
+      ref = i;
+      earliest_order_hint = hint;
+    }
+  }
+  return ref;
+}
+
+// Computes the elements in the frame_header_.reference_frame_index array
+// based on:
+// * the syntax elements last_frame_idx and gold_frame_idx, and
+// * the values stored within the decoder_state_.reference_order_hint array
+//   (these values represent the least significant bits of the expected output
+//   order of the frames).
+//
+// Frame type: {
+//       libgav1_name              spec_name              int
+//   kReferenceFrameLast,          LAST_FRAME              1
+//   kReferenceFrameLast2,         LAST2_FRAME             2
+//   kReferenceFrameLast3,         LAST3_FRAME             3
+//   kReferenceFrameGolden,        GOLDEN_FRAME            4
+//   kReferenceFrameBackward,      BWDREF_FRAME            5
+//   kReferenceFrameAlternate2,    ALTREF2_FRAME           6
+//   kReferenceFrameAlternate,     ALTREF_FRAME            7
+// }
+//
+// A typical case of a group of pictures (frames) in display order:
+// (However, more complex cases may also be allowed under bitstream
+// conformance.)
+//
+// |         |         |         |         |         |         |         |
+// |         |         |         |         |         |         |         |
+// |         |         |         |         |         |         |         |
+// |         |         |         |         |         |         |         |
+//
+// 4         3         2         1   current_frame   5         6         7
+//
+bool ObuParser::SetFrameReferences(const int8_t last_frame_idx,
+                                   const int8_t gold_frame_idx) {
+  // Set the ref_frame_idx entries for kReferenceFrameLast and
+  // kReferenceFrameGolden to last_frame_idx and gold_frame_idx. Initialize
+  // the other entries to -1.
+  for (int8_t& reference_frame_index : frame_header_.reference_frame_index) {
+    reference_frame_index = -1;
+  }
+  frame_header_
+      .reference_frame_index[kReferenceFrameLast - kReferenceFrameLast] =
+      last_frame_idx;
+  frame_header_
+      .reference_frame_index[kReferenceFrameGolden - kReferenceFrameLast] =
+      gold_frame_idx;
+
+  // used_frame records which reference frames have been used.
+  std::array<bool, kNumReferenceFrameTypes> used_frame;
+  used_frame.fill(false);
+  used_frame[last_frame_idx] = true;
+  used_frame[gold_frame_idx] = true;
+
+  assert(sequence_header_.order_hint_bits >= 1);
+  const int current_frame_hint = 1 << (sequence_header_.order_hint_bits - 1);
+  // shifted_order_hints contains the expected output order shifted such that
+  // the current frame has hint equal to current_frame_hint.
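+  // After this shift, references that precede the current frame in output
+  // order have hints less than current_frame_hint (forward references), and
+  // references at or after it have hints greater than or equal to
+  // current_frame_hint (backward references).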
+  std::array<int, kNumReferenceFrameTypes> shifted_order_hints;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int relative_distance = GetRelativeDistance(
+        decoder_state_.reference_order_hint[i], frame_header_.order_hint,
+        sequence_header_.order_hint_shift_bits);
+    shifted_order_hints[i] = current_frame_hint + relative_distance;
+  }
+
+  // The expected output orders for kReferenceFrameLast and
+  // kReferenceFrameGolden.
+  const int last_order_hint = shifted_order_hints[last_frame_idx];
+  const int gold_order_hint = shifted_order_hints[gold_frame_idx];
+
+  // Section 7.8: It is a requirement of bitstream conformance that
+  // lastOrderHint and goldOrderHint are strictly less than curFrameHint.
+  if (last_order_hint >= current_frame_hint ||
+      gold_order_hint >= current_frame_hint) {
+    return false;
+  }
+
+  // Find a backward reference to the frame with highest output order. If
+  // found, set the kReferenceFrameAlternate reference to that backward
+  // reference.
+  int ref = FindLatestBackwardReference(current_frame_hint, shifted_order_hints,
+                                        used_frame);
+  if (ref >= 0) {
+    frame_header_
+        .reference_frame_index[kReferenceFrameAlternate - kReferenceFrameLast] =
+        ref;
+    used_frame[ref] = true;
+  }
+
+  // Find a backward reference to the closest frame. If found, set the
+  // kReferenceFrameBackward reference to that backward reference.
+  ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints,
+                                      used_frame);
+  if (ref >= 0) {
+    frame_header_
+        .reference_frame_index[kReferenceFrameBackward - kReferenceFrameLast] =
+        ref;
+    used_frame[ref] = true;
+  }
+
+  // Set the kReferenceFrameAlternate2 reference to the next closest backward
+  // reference.
+  ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints,
+                                      used_frame);
+  if (ref >= 0) {
+    frame_header_.reference_frame_index[kReferenceFrameAlternate2 -
+                                        kReferenceFrameLast] = ref;
+    used_frame[ref] = true;
+  }
+
+  // The remaining references are set to be forward references in
+  // reverse chronological order.
+  static constexpr ReferenceFrameType
+      kRefFrameList[kNumInterReferenceFrameTypes - 2] = {
+          kReferenceFrameLast2, kReferenceFrameLast3, kReferenceFrameBackward,
+          kReferenceFrameAlternate2, kReferenceFrameAlternate};
+  for (const ReferenceFrameType ref_frame : kRefFrameList) {
+    if (frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] <
+        0) {
+      ref = FindLatestForwardReference(current_frame_hint, shifted_order_hints,
+                                       used_frame);
+      if (ref >= 0) {
+        frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] =
+            ref;
+        used_frame[ref] = true;
+      }
+    }
+  }
+
+  // Finally, any remaining references are set to the reference frame with
+  // smallest output order.
+  ref = FindReferenceWithSmallestOutputOrder(shifted_order_hints);
+  assert(ref >= 0);
+  for (int8_t& reference_frame_index : frame_header_.reference_frame_index) {
+    if (reference_frame_index < 0) {
+      reference_frame_index = ref;
+    }
+  }
+
+  return true;
+}
+
+bool ObuParser::ParseLoopFilterParameters() {
+  LoopFilter* const loop_filter = &frame_header_.loop_filter;
+  if (frame_header_.coded_lossless || frame_header_.allow_intrabc) {
+    SetDefaultRefDeltas(loop_filter);
+    return true;
+  }
+  // IsIntraFrame implies kPrimaryReferenceNone.
+  assert(!IsIntraFrame(frame_header_.frame_type) ||
+         frame_header_.primary_reference_frame == kPrimaryReferenceNone);
+  if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+    // Part of the setup_past_independence() function in the spec. It is not
+    // necessary to set loop_filter->delta_enabled to true. See
+    // https://crbug.com/aomedia/2305.
+    SetDefaultRefDeltas(loop_filter);
+  } else {
+    // Part of the load_previous() function in the spec.
+    const int prev_frame_index =
+        frame_header_
+            .reference_frame_index[frame_header_.primary_reference_frame];
+    const RefCountedBuffer* prev_frame =
+        decoder_state_.reference_frame[prev_frame_index].get();
+    loop_filter->ref_deltas = prev_frame->loop_filter_ref_deltas();
+    loop_filter->mode_deltas = prev_frame->loop_filter_mode_deltas();
+  }
+  int64_t scratch;
+  for (int i = 0; i < 2; ++i) {
+    OBU_READ_LITERAL_OR_FAIL(6);
+    loop_filter->level[i] = scratch;
+  }
+  if (!sequence_header_.color_config.is_monochrome &&
+      (loop_filter->level[0] != 0 || loop_filter->level[1] != 0)) {
+    for (int i = 2; i < 4; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(6);
+      loop_filter->level[i] = scratch;
+    }
+  }
+  OBU_READ_LITERAL_OR_FAIL(3);
+  loop_filter->sharpness = scratch;
+  OBU_READ_BIT_OR_FAIL;
+  loop_filter->delta_enabled = scratch != 0;
+  if (loop_filter->delta_enabled) {
+    OBU_READ_BIT_OR_FAIL;
+    loop_filter->delta_update = scratch != 0;
+    if (loop_filter->delta_update) {
+      for (auto& ref_delta : loop_filter->ref_deltas) {
+        OBU_READ_BIT_OR_FAIL;
+        const bool update_ref_delta = scratch != 0;
+        if (update_ref_delta) {
+          int scratch_int;
+          if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+            LIBGAV1_DLOG(ERROR, "Not enough bits.");
+            return false;
+          }
+          ref_delta = scratch_int;
+        }
+      }
+      for (auto& mode_delta : loop_filter->mode_deltas) {
+        OBU_READ_BIT_OR_FAIL;
+        const bool update_mode_delta = scratch != 0;
+        if (update_mode_delta) {
+          int scratch_int;
+          if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+            LIBGAV1_DLOG(ERROR, "Not enough bits.");
+            return false;
+          }
+          mode_delta = scratch_int;
+        }
+      }
+    }
+  } else {
+    loop_filter->delta_update = false;
+  }
+  return true;
+}
+
+bool ObuParser::ParseDeltaQuantizer(int8_t* const delta) {
+  int64_t scratch;
+  *delta = 0;
+  OBU_READ_BIT_OR_FAIL;
+  const bool delta_coded = scratch != 0;
+  if (delta_coded) {
+    int scratch_int;
+    if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+      LIBGAV1_DLOG(ERROR, "Not enough bits.");
+      return false;
+    }
+    *delta = scratch_int;
+  }
+  return true;
+}
+
+bool ObuParser::ParseQuantizerParameters() {
+  int64_t scratch;
+  QuantizerParameters* const quantizer = &frame_header_.quantizer;
+  OBU_READ_LITERAL_OR_FAIL(8);
+  quantizer->base_index = scratch;
+  if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneY])) return false;
+  if (!sequence_header_.color_config.is_monochrome) {
+    bool diff_uv_delta = false;
+    if (sequence_header_.color_config.separate_uv_delta_q) {
+      OBU_READ_BIT_OR_FAIL;
+      diff_uv_delta = scratch != 0;
+    }
+    if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneU]) ||
+        !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneU])) {
+      return false;
+    }
+    if (diff_uv_delta) {
+      if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneV]) ||
+          !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneV])) {
+        return false;
+      }
+    } else {
+      quantizer->delta_dc[kPlaneV] = quantizer->delta_dc[kPlaneU];
+      quantizer->delta_ac[kPlaneV] = quantizer->delta_ac[kPlaneU];
+    }
+  }
+  OBU_READ_BIT_OR_FAIL;
+  quantizer->use_matrix = scratch != 0;
+  if (quantizer->use_matrix) {
+    OBU_READ_LITERAL_OR_FAIL(4);
+    quantizer->matrix_level[kPlaneY] = scratch;
+    OBU_READ_LITERAL_OR_FAIL(4);
+    quantizer->matrix_level[kPlaneU] = scratch;
+    if (sequence_header_.color_config.separate_uv_delta_q) {
+      OBU_READ_LITERAL_OR_FAIL(4);
+      quantizer->matrix_level[kPlaneV] = scratch;
+    } else {
+      quantizer->matrix_level[kPlaneV] = quantizer->matrix_level[kPlaneU];
+    }
+  }
+  return true;
+}
+
+// This method implements the following functions in the spec:
+// - segmentation_params()
+// - part of setup_past_independence(): Set the FeatureData and FeatureEnabled
+//   arrays to all 0.
+// - part of load_previous(): Call load_segmentation_params().
+//
+// A careful analysis of the spec shows that the setup_past_independence()
+// part can be optimized away and that the load_previous() part only needs to
+// be invoked under a specific condition. Although the logic looks different
+// from the spec, it is equivalent and more efficient.
+bool ObuParser::ParseSegmentationParameters() {
+  int64_t scratch;
+  Segmentation* const segmentation = &frame_header_.segmentation;
+  OBU_READ_BIT_OR_FAIL;
+  segmentation->enabled = scratch != 0;
+  if (!segmentation->enabled) return true;
+  if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+    segmentation->update_map = true;
+    segmentation->update_data = true;
+  } else {
+    OBU_READ_BIT_OR_FAIL;
+    segmentation->update_map = scratch != 0;
+    if (segmentation->update_map) {
+      OBU_READ_BIT_OR_FAIL;
+      segmentation->temporal_update = scratch != 0;
+    }
+    OBU_READ_BIT_OR_FAIL;
+    segmentation->update_data = scratch != 0;
+    if (!segmentation->update_data) {
+      // Part of the load_previous() function in the spec.
+      const int prev_frame_index =
+          frame_header_
+              .reference_frame_index[frame_header_.primary_reference_frame];
+      decoder_state_.reference_frame[prev_frame_index]
+          ->GetSegmentationParameters(segmentation);
+      return true;
+    }
+  }
+  for (int8_t i = 0; i < kMaxSegments; ++i) {
+    for (int8_t j = 0; j < kSegmentFeatureMax; ++j) {
+      OBU_READ_BIT_OR_FAIL;
+      segmentation->feature_enabled[i][j] = scratch != 0;
+      if (segmentation->feature_enabled[i][j]) {
+        if (Segmentation::FeatureSigned(static_cast<SegmentFeature>(j))) {
+          int scratch_int;
+          if (!bit_reader_->ReadInverseSignedLiteral(
+                  kSegmentationFeatureBits[j], &scratch_int)) {
+            LIBGAV1_DLOG(ERROR, "Not enough bits.");
+            return false;
+          }
+          segmentation->feature_data[i][j] =
+              Clip3(scratch_int, -kSegmentationFeatureMaxValues[j],
+                    kSegmentationFeatureMaxValues[j]);
+        } else {
+          if (kSegmentationFeatureBits[j] > 0) {
+            OBU_READ_LITERAL_OR_FAIL(kSegmentationFeatureBits[j]);
+            segmentation->feature_data[i][j] = Clip3(
+                static_cast<int>(scratch), 0, kSegmentationFeatureMaxValues[j]);
+          } else {
+            segmentation->feature_data[i][j] = 0;
+          }
+        }
+        segmentation->last_active_segment_id = i;
+        if (j >= kSegmentFeatureReferenceFrame) {
+          segmentation->segment_id_pre_skip = true;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+bool ObuParser::ParseQuantizerIndexDeltaParameters() {
+  int64_t scratch;
+  if (frame_header_.quantizer.base_index > 0) {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.delta_q.present = scratch != 0;
+    if (frame_header_.delta_q.present) {
+      OBU_READ_LITERAL_OR_FAIL(2);
+      frame_header_.delta_q.scale = scratch;
+    }
+  }
+  return true;
+}
+
+bool ObuParser::ParseLoopFilterDeltaParameters() {
+  int64_t scratch;
+  if (frame_header_.delta_q.present) {
+    if (!frame_header_.allow_intrabc) {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.delta_lf.present = scratch != 0;
+    }
+    if (frame_header_.delta_lf.present) {
+      OBU_READ_LITERAL_OR_FAIL(2);
+      frame_header_.delta_lf.scale = scratch;
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.delta_lf.multi = scratch != 0;
+    }
+  }
+  return true;
+}
+
+void ObuParser::ComputeSegmentLosslessAndQIndex() {
+  frame_header_.coded_lossless = true;
+  Segmentation* const segmentation = &frame_header_.segmentation;
+  const QuantizerParameters* const quantizer = &frame_header_.quantizer;
+  for (int i = 0; i < kMaxSegments; ++i) {
+    segmentation->qindex[i] =
+        GetQIndex(*segmentation, i, quantizer->base_index);
+    segmentation->lossless[i] =
+        segmentation->qindex[i] == 0 && quantizer->delta_dc[kPlaneY] == 0 &&
+        quantizer->delta_dc[kPlaneU] == 0 &&
+        quantizer->delta_ac[kPlaneU] == 0 &&
+        quantizer->delta_dc[kPlaneV] == 0 && quantizer->delta_ac[kPlaneV] == 0;
+    if (!segmentation->lossless[i]) frame_header_.coded_lossless = false;
+    // The spec calls for setting up a two-dimensional SegQMLevel array here.
+    // We avoid the SegQMLevel array by using segmentation->lossless[i] and
+    // quantizer->matrix_level[plane] directly in the reconstruct process of
+    // Section 7.12.3.
+  }
+  frame_header_.upscaled_lossless =
+      frame_header_.coded_lossless &&
+      frame_header_.width == frame_header_.upscaled_width;
+}
+
+bool ObuParser::ParseCdefParameters() {
+  const int coeff_shift = sequence_header_.color_config.bitdepth - 8;
+  if (frame_header_.coded_lossless || frame_header_.allow_intrabc ||
+      !sequence_header_.enable_cdef) {
+    frame_header_.cdef.damping = 3 + coeff_shift;
+    return true;
+  }
+  Cdef* const cdef = &frame_header_.cdef;
+  int64_t scratch;
+  OBU_READ_LITERAL_OR_FAIL(2);
+  cdef->damping = scratch + 3 + coeff_shift;
+  OBU_READ_LITERAL_OR_FAIL(2);
+  cdef->bits = scratch;
+  for (int i = 0; i < (1 << cdef->bits); ++i) {
+    OBU_READ_LITERAL_OR_FAIL(4);
+    cdef->y_primary_strength[i] = scratch << coeff_shift;
+    OBU_READ_LITERAL_OR_FAIL(2);
+    cdef->y_secondary_strength[i] = scratch;
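+    // A signaled secondary strength of 3 means 4, so the possible values
+    // (before scaling by coeff_shift) are 0, 1, 2 and 4.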
+    if (cdef->y_secondary_strength[i] == 3) ++cdef->y_secondary_strength[i];
+    cdef->y_secondary_strength[i] <<= coeff_shift;
+    if (sequence_header_.color_config.is_monochrome) continue;
+    OBU_READ_LITERAL_OR_FAIL(4);
+    cdef->uv_primary_strength[i] = scratch << coeff_shift;
+    OBU_READ_LITERAL_OR_FAIL(2);
+    cdef->uv_secondary_strength[i] = scratch;
+    if (cdef->uv_secondary_strength[i] == 3) ++cdef->uv_secondary_strength[i];
+    cdef->uv_secondary_strength[i] <<= coeff_shift;
+  }
+  return true;
+}
+
+bool ObuParser::ParseLoopRestorationParameters() {
+  if (frame_header_.upscaled_lossless || frame_header_.allow_intrabc ||
+      !sequence_header_.enable_restoration) {
+    return true;
+  }
+  int64_t scratch;
+  bool uses_loop_restoration = false;
+  bool uses_chroma_loop_restoration = false;
+  LoopRestoration* const loop_restoration = &frame_header_.loop_restoration;
+  const int num_planes = sequence_header_.color_config.is_monochrome
+                             ? kMaxPlanesMonochrome
+                             : kMaxPlanes;
+  for (int i = 0; i < num_planes; ++i) {
+    OBU_READ_LITERAL_OR_FAIL(2);
+    loop_restoration->type[i] = static_cast<LoopRestorationType>(scratch);
+    if (loop_restoration->type[i] != kLoopRestorationTypeNone) {
+      uses_loop_restoration = true;
+      if (i > 0) uses_chroma_loop_restoration = true;
+    }
+  }
+  if (uses_loop_restoration) {
+    uint8_t unit_shift;
+    if (sequence_header_.use_128x128_superblock) {
+      OBU_READ_BIT_OR_FAIL;
+      unit_shift = scratch + 1;
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      unit_shift = scratch;
+      if (unit_shift != 0) {
+        OBU_READ_BIT_OR_FAIL;
+        const uint8_t unit_extra_shift = scratch;
+        unit_shift += unit_extra_shift;
+      }
+    }
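+    // unit_shift is at most 2, so the luma restoration unit size,
+    // 1 << (6 + unit_shift), is 64, 128 or 256.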
+    loop_restoration->unit_size_log2[kPlaneY] = 6 + unit_shift;
+    uint8_t uv_shift = 0;
+    if (sequence_header_.color_config.subsampling_x != 0 &&
+        sequence_header_.color_config.subsampling_y != 0 &&
+        uses_chroma_loop_restoration) {
+      OBU_READ_BIT_OR_FAIL;
+      uv_shift = scratch;
+    }
+    loop_restoration->unit_size_log2[kPlaneU] =
+        loop_restoration->unit_size_log2[kPlaneV] =
+            loop_restoration->unit_size_log2[0] - uv_shift;
+  }
+  return true;
+}
+
+bool ObuParser::ParseTxModeSyntax() {
+  if (frame_header_.coded_lossless) {
+    frame_header_.tx_mode = kTxModeOnly4x4;
+    return true;
+  }
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  frame_header_.tx_mode = (scratch == 1) ? kTxModeSelect : kTxModeLargest;
+  return true;
+}
+
+bool ObuParser::ParseFrameReferenceModeSyntax() {
+  int64_t scratch;
+  if (!IsIntraFrame(frame_header_.frame_type)) {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.reference_mode_select = scratch != 0;
+  }
+  return true;
+}
+
+bool ObuParser::IsSkipModeAllowed() {
+  if (IsIntraFrame(frame_header_.frame_type) ||
+      !frame_header_.reference_mode_select ||
+      !sequence_header_.enable_order_hint) {
+    return false;
+  }
+  // Identify the nearest forward and backward references.
+  int forward_index = -1;
+  int backward_index = -1;
+  int forward_hint = -1;
+  int backward_hint = -1;
+  for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+    const unsigned int reference_hint =
+        decoder_state_
+            .reference_order_hint[frame_header_.reference_frame_index[i]];
+    // TODO(linfengz): |relative_distance| equals
+    // current_frame_->reference_info()->
+    //     relative_distance_from[i + kReferenceFrameLast];
+    // However, the unit test ObuParserTest.SkipModeParameters() would fail.
+    // Will figure out how to initialize |current_frame_.reference_info_| in the
+    // RefCountedBuffer later.
+    const int relative_distance =
+        GetRelativeDistance(reference_hint, frame_header_.order_hint,
+                            sequence_header_.order_hint_shift_bits);
+    if (relative_distance < 0) {
+      if (forward_index < 0 ||
+          GetRelativeDistance(reference_hint, forward_hint,
+                              sequence_header_.order_hint_shift_bits) > 0) {
+        forward_index = i;
+        forward_hint = reference_hint;
+      }
+    } else if (relative_distance > 0) {
+      if (backward_index < 0 ||
+          GetRelativeDistance(reference_hint, backward_hint,
+                              sequence_header_.order_hint_shift_bits) < 0) {
+        backward_index = i;
+        backward_hint = reference_hint;
+      }
+    }
+  }
+  if (forward_index < 0) return false;
+  if (backward_index >= 0) {
+    // Bidirectional prediction.
+    frame_header_.skip_mode_frame[0] = static_cast<ReferenceFrameType>(
+        kReferenceFrameLast + std::min(forward_index, backward_index));
+    frame_header_.skip_mode_frame[1] = static_cast<ReferenceFrameType>(
+        kReferenceFrameLast + std::max(forward_index, backward_index));
+    return true;
+  }
+  // Forward prediction only. Identify the second nearest forward reference.
+  int second_forward_index = -1;
+  int second_forward_hint = -1;
+  for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+    const unsigned int reference_hint =
+        decoder_state_
+            .reference_order_hint[frame_header_.reference_frame_index[i]];
+    if (GetRelativeDistance(reference_hint, forward_hint,
+                            sequence_header_.order_hint_shift_bits) < 0) {
+      if (second_forward_index < 0 ||
+          GetRelativeDistance(reference_hint, second_forward_hint,
+                              sequence_header_.order_hint_shift_bits) > 0) {
+        second_forward_index = i;
+        second_forward_hint = reference_hint;
+      }
+    }
+  }
+  if (second_forward_index < 0) return false;
+  frame_header_.skip_mode_frame[0] = static_cast<ReferenceFrameType>(
+      kReferenceFrameLast + std::min(forward_index, second_forward_index));
+  frame_header_.skip_mode_frame[1] = static_cast<ReferenceFrameType>(
+      kReferenceFrameLast + std::max(forward_index, second_forward_index));
+  return true;
+}
+
+bool ObuParser::ParseSkipModeParameters() {
+  if (!IsSkipModeAllowed()) return true;
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  frame_header_.skip_mode_present = scratch != 0;
+  return true;
+}
+
+// Sets frame_header_.global_motion[ref].params[index].
+bool ObuParser::ParseGlobalParamSyntax(
+    int ref, int index,
+    const std::array<GlobalMotion, kNumReferenceFrameTypes>&
+        prev_global_motions) {
+  GlobalMotion* const global_motion = &frame_header_.global_motion[ref];
+  const GlobalMotion* const prev_global_motion = &prev_global_motions[ref];
+  int abs_bits = kGlobalMotionAlphaBits;
+  int precision_bits = kGlobalMotionAlphaPrecisionBits;
+  if (index < 2) {
+    if (global_motion->type == kGlobalMotionTransformationTypeTranslation) {
+      const auto high_precision_mv_factor =
+          static_cast<int>(!frame_header_.allow_high_precision_mv);
+      abs_bits = kGlobalMotionTranslationOnlyBits - high_precision_mv_factor;
+      precision_bits =
+          kGlobalMotionTranslationOnlyPrecisionBits - high_precision_mv_factor;
+    } else {
+      abs_bits = kGlobalMotionTranslationBits;
+      precision_bits = kGlobalMotionTranslationPrecisionBits;
+    }
+  }
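+  // params[2] and params[5] (index % 3 == 2) are the diagonal terms of the
+  // warp matrix and are coded relative to the identity value
+  // 1 << kWarpedModelPrecisionBits; |sub| and |round| perform that
+  // re-centering.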
+  const int precision_diff = kWarpedModelPrecisionBits - precision_bits;
+  const int round = (index % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+  const int sub = (index % 3 == 2) ? 1 << precision_bits : 0;
+  const int mx = 1 << abs_bits;
+  const int reference =
+      (prev_global_motion->params[index] >> precision_diff) - sub;
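+  // Worked example for index == 2 (a diagonal term of a non-translation
+  // model), assuming the spec constants GM_ABS_ALPHA_BITS = 12 and
+  // GM_ALPHA_PREC_BITS = 15: precision_diff = 16 - 15 = 1, round = 1 << 16,
+  // sub = 1 << 15 and mx = 1 << 12. With the default previous value of
+  // 1 << 16 the subexp reference is (65536 >> 1) - 32768 = 0, and a decoded
+  // value v in [-4096, 4096] yields params[2] = (v << 1) + (1 << 16).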
+  int scratch;
+  if (!bit_reader_->DecodeSignedSubexpWithReference(
+          -mx, mx + 1, reference, kGlobalMotionReadControl, &scratch)) {
+    LIBGAV1_DLOG(ERROR, "Not enough bits.");
+    return false;
+  }
+  global_motion->params[index] = LeftShift(scratch, precision_diff) + round;
+  return true;
+}
+
+bool ObuParser::ParseGlobalMotionParameters() {
+  for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+    frame_header_.global_motion[ref].type =
+        kGlobalMotionTransformationTypeIdentity;
+    for (int i = 0; i < 6; ++i) {
+      frame_header_.global_motion[ref].params[i] =
+          (i % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+    }
+  }
+  if (IsIntraFrame(frame_header_.frame_type)) return true;
+  const std::array<GlobalMotion, kNumReferenceFrameTypes>* prev_global_motions =
+      nullptr;
+  if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+    // Part of the setup_past_independence() function in the spec. The value
+    // that the spec says PrevGmParams[ref][i] should be set to is exactly
+    // the value frame_header_.global_motion[ref].params[i] is set to by the
+    // for loop above. Therefore prev_global_motions can simply point to
+    // frame_header_.global_motion.
+    prev_global_motions = &frame_header_.global_motion;
+  } else {
+    // Part of the load_previous() function in the spec.
+    const int prev_frame_index =
+        frame_header_
+            .reference_frame_index[frame_header_.primary_reference_frame];
+    prev_global_motions =
+        &decoder_state_.reference_frame[prev_frame_index]->GlobalMotions();
+  }
+  for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+    GlobalMotion* const global_motion = &frame_header_.global_motion[ref];
+    int64_t scratch;
+    OBU_READ_BIT_OR_FAIL;
+    const bool is_global = scratch != 0;
+    if (is_global) {
+      OBU_READ_BIT_OR_FAIL;
+      const bool is_rot_zoom = scratch != 0;
+      if (is_rot_zoom) {
+        global_motion->type = kGlobalMotionTransformationTypeRotZoom;
+      } else {
+        OBU_READ_BIT_OR_FAIL;
+        const bool is_translation = scratch != 0;
+        global_motion->type = is_translation
+                                  ? kGlobalMotionTransformationTypeTranslation
+                                  : kGlobalMotionTransformationTypeAffine;
+      }
+    } else {
+      global_motion->type = kGlobalMotionTransformationTypeIdentity;
+    }
+    if (global_motion->type >= kGlobalMotionTransformationTypeRotZoom) {
+      if (!ParseGlobalParamSyntax(ref, 2, *prev_global_motions) ||
+          !ParseGlobalParamSyntax(ref, 3, *prev_global_motions)) {
+        return false;
+      }
+      if (global_motion->type == kGlobalMotionTransformationTypeAffine) {
+        if (!ParseGlobalParamSyntax(ref, 4, *prev_global_motions) ||
+            !ParseGlobalParamSyntax(ref, 5, *prev_global_motions)) {
+          return false;
+        }
+      } else {
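+        // ROTZOOM constrains the non-translational part of the warp model to
+        // a scaled rotation, i.e. the 2x2 matrix
+        //   [  params[2]  params[3] ]
+        //   [ -params[3]  params[2] ],
+        // so the remaining two parameters are derived rather than coded.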
+        global_motion->params[4] = -global_motion->params[3];
+        global_motion->params[5] = global_motion->params[2];
+      }
+    }
+    if (global_motion->type >= kGlobalMotionTransformationTypeTranslation) {
+      if (!ParseGlobalParamSyntax(ref, 0, *prev_global_motions) ||
+          !ParseGlobalParamSyntax(ref, 1, *prev_global_motions)) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool ObuParser::ParseFilmGrainParameters() {
+  if (!sequence_header_.film_grain_params_present ||
+      (!frame_header_.show_frame && !frame_header_.showable_frame)) {
+    // frame_header_.film_grain_params is already zero-initialized.
+    return true;
+  }
+
+  FilmGrainParams& film_grain_params = frame_header_.film_grain_params;
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  film_grain_params.apply_grain = scratch != 0;
+  if (!film_grain_params.apply_grain) {
+    // film_grain_params is already zero-initialized.
+    return true;
+  }
+
+  OBU_READ_LITERAL_OR_FAIL(16);
+  film_grain_params.grain_seed = static_cast<int>(scratch);
+  film_grain_params.update_grain = true;
+  if (frame_header_.frame_type == kFrameInter) {
+    OBU_READ_BIT_OR_FAIL;
+    film_grain_params.update_grain = scratch != 0;
+  }
+  if (!film_grain_params.update_grain) {
+    OBU_READ_LITERAL_OR_FAIL(3);
+    film_grain_params.reference_index = static_cast<int>(scratch);
+    bool found = false;
+    for (const auto index : frame_header_.reference_frame_index) {
+      if (film_grain_params.reference_index == index) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      static_assert(sizeof(frame_header_.reference_frame_index) /
+                            sizeof(frame_header_.reference_frame_index[0]) ==
+                        7,
+                    "");
+      LIBGAV1_DLOG(ERROR,
+                   "Invalid value for film_grain_params_ref_idx (%d). "
+                   "ref_frame_idx = {%d, %d, %d, %d, %d, %d, %d}",
+                   film_grain_params.reference_index,
+                   frame_header_.reference_frame_index[0],
+                   frame_header_.reference_frame_index[1],
+                   frame_header_.reference_frame_index[2],
+                   frame_header_.reference_frame_index[3],
+                   frame_header_.reference_frame_index[4],
+                   frame_header_.reference_frame_index[5],
+                   frame_header_.reference_frame_index[6]);
+      return false;
+    }
+    const RefCountedBuffer* grain_params_reference_frame =
+        decoder_state_.reference_frame[film_grain_params.reference_index].get();
+    if (grain_params_reference_frame == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame",
+                   film_grain_params.reference_index);
+      return false;
+    }
+    const int temp_grain_seed = film_grain_params.grain_seed;
+    const bool temp_update_grain = film_grain_params.update_grain;
+    const int temp_reference_index = film_grain_params.reference_index;
+    film_grain_params = grain_params_reference_frame->film_grain_params();
+    film_grain_params.grain_seed = temp_grain_seed;
+    film_grain_params.update_grain = temp_update_grain;
+    film_grain_params.reference_index = temp_reference_index;
+    return true;
+  }
+
+  OBU_READ_LITERAL_OR_FAIL(4);
+  film_grain_params.num_y_points = scratch;
+  if (film_grain_params.num_y_points > 14) {
+    LIBGAV1_DLOG(ERROR, "Invalid value for num_y_points (%d).",
+                 film_grain_params.num_y_points);
+    return false;
+  }
+  for (int i = 0; i < film_grain_params.num_y_points; ++i) {
+    OBU_READ_LITERAL_OR_FAIL(8);
+    film_grain_params.point_y_value[i] = scratch;
+    if (i != 0 && film_grain_params.point_y_value[i - 1] >=
+                      film_grain_params.point_y_value[i]) {
+      LIBGAV1_DLOG(ERROR, "point_y_value[%d] (%d) >= point_y_value[%d] (%d).",
+                   i - 1, film_grain_params.point_y_value[i - 1], i,
+                   film_grain_params.point_y_value[i]);
+      return false;
+    }
+    OBU_READ_LITERAL_OR_FAIL(8);
+    film_grain_params.point_y_scaling[i] = scratch;
+  }
+  if (sequence_header_.color_config.is_monochrome) {
+    film_grain_params.chroma_scaling_from_luma = false;
+  } else {
+    OBU_READ_BIT_OR_FAIL;
+    film_grain_params.chroma_scaling_from_luma = scratch != 0;
+  }
+  if (sequence_header_.color_config.is_monochrome ||
+      film_grain_params.chroma_scaling_from_luma ||
+      (sequence_header_.color_config.subsampling_x == 1 &&
+       sequence_header_.color_config.subsampling_y == 1 &&
+       film_grain_params.num_y_points == 0)) {
+    film_grain_params.num_u_points = 0;
+    film_grain_params.num_v_points = 0;
+  } else {
+    OBU_READ_LITERAL_OR_FAIL(4);
+    film_grain_params.num_u_points = scratch;
+    if (film_grain_params.num_u_points > 10) {
+      LIBGAV1_DLOG(ERROR, "Invalid value for num_u_points (%d).",
+                   film_grain_params.num_u_points);
+      return false;
+    }
+    for (int i = 0; i < film_grain_params.num_u_points; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.point_u_value[i] = scratch;
+      if (i != 0 && film_grain_params.point_u_value[i - 1] >=
+                        film_grain_params.point_u_value[i]) {
+        LIBGAV1_DLOG(ERROR, "point_u_value[%d] (%d) >= point_u_value[%d] (%d).",
+                     i - 1, film_grain_params.point_u_value[i - 1], i,
+                     film_grain_params.point_u_value[i]);
+        return false;
+      }
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.point_u_scaling[i] = scratch;
+    }
+    OBU_READ_LITERAL_OR_FAIL(4);
+    film_grain_params.num_v_points = scratch;
+    if (film_grain_params.num_v_points > 10) {
+      LIBGAV1_DLOG(ERROR, "Invalid value for num_v_points (%d).",
+                   film_grain_params.num_v_points);
+      return false;
+    }
+    if (sequence_header_.color_config.subsampling_x == 1 &&
+        sequence_header_.color_config.subsampling_y == 1 &&
+        (film_grain_params.num_u_points == 0) !=
+            (film_grain_params.num_v_points == 0)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Invalid values for num_u_points (%d) and num_v_points (%d) "
+                   "for 4:2:0 chroma subsampling.",
+                   film_grain_params.num_u_points,
+                   film_grain_params.num_v_points);
+      return false;
+    }
+    for (int i = 0; i < film_grain_params.num_v_points; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.point_v_value[i] = scratch;
+      if (i != 0 && film_grain_params.point_v_value[i - 1] >=
+                        film_grain_params.point_v_value[i]) {
+        LIBGAV1_DLOG(ERROR, "point_v_value[%d] (%d) >= point_v_value[%d] (%d).",
+                     i - 1, film_grain_params.point_v_value[i - 1], i,
+                     film_grain_params.point_v_value[i]);
+        return false;
+      }
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.point_v_scaling[i] = scratch;
+    }
+  }
+  OBU_READ_LITERAL_OR_FAIL(2);
+  film_grain_params.chroma_scaling = scratch + 8;
+  OBU_READ_LITERAL_OR_FAIL(2);
+  film_grain_params.auto_regression_coeff_lag = scratch;
+
+  const int num_pos_y =
+      MultiplyBy2(film_grain_params.auto_regression_coeff_lag) *
+      (film_grain_params.auto_regression_coeff_lag + 1);
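+  // num_pos_y = 2 * lag * (lag + 1): e.g. a lag of 3 gives 24 luma AR
+  // coefficients. Chroma planes code one extra coefficient (applied to the
+  // collocated luma grain sample) whenever luma grain is present.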
+  int num_pos_uv = num_pos_y;
+  if (film_grain_params.num_y_points > 0) {
+    ++num_pos_uv;
+    for (int i = 0; i < num_pos_y; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.auto_regression_coeff_y[i] =
+          static_cast<int8_t>(scratch - 128);
+    }
+  }
+  if (film_grain_params.chroma_scaling_from_luma ||
+      film_grain_params.num_u_points > 0) {
+    for (int i = 0; i < num_pos_uv; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.auto_regression_coeff_u[i] =
+          static_cast<int8_t>(scratch - 128);
+    }
+  }
+  if (film_grain_params.chroma_scaling_from_luma ||
+      film_grain_params.num_v_points > 0) {
+    for (int i = 0; i < num_pos_uv; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.auto_regression_coeff_v[i] =
+          static_cast<int8_t>(scratch - 128);
+    }
+  }
+  OBU_READ_LITERAL_OR_FAIL(2);
+  film_grain_params.auto_regression_shift = static_cast<uint8_t>(scratch + 6);
+  OBU_READ_LITERAL_OR_FAIL(2);
+  film_grain_params.grain_scale_shift = static_cast<int>(scratch);
+  if (film_grain_params.num_u_points > 0) {
+    OBU_READ_LITERAL_OR_FAIL(8);
+    film_grain_params.u_multiplier = static_cast<int8_t>(scratch - 128);
+    OBU_READ_LITERAL_OR_FAIL(8);
+    film_grain_params.u_luma_multiplier = static_cast<int8_t>(scratch - 128);
+    OBU_READ_LITERAL_OR_FAIL(9);
+    film_grain_params.u_offset = static_cast<int16_t>(scratch - 256);
+  }
+  if (film_grain_params.num_v_points > 0) {
+    OBU_READ_LITERAL_OR_FAIL(8);
+    film_grain_params.v_multiplier = static_cast<int8_t>(scratch - 128);
+    OBU_READ_LITERAL_OR_FAIL(8);
+    film_grain_params.v_luma_multiplier = static_cast<int8_t>(scratch - 128);
+    OBU_READ_LITERAL_OR_FAIL(9);
+    film_grain_params.v_offset = static_cast<int16_t>(scratch - 256);
+  }
+  OBU_READ_BIT_OR_FAIL;
+  film_grain_params.overlap_flag = scratch != 0;
+  OBU_READ_BIT_OR_FAIL;
+  film_grain_params.clip_to_restricted_range = scratch != 0;
+  return true;
+}
+
+bool ObuParser::ParseTileInfoSyntax() {
+  TileInfo* const tile_info = &frame_header_.tile_info;
+  const int sb_columns = sequence_header_.use_128x128_superblock
+                             ? ((frame_header_.columns4x4 + 31) >> 5)
+                             : ((frame_header_.columns4x4 + 15) >> 4);
+  const int sb_rows = sequence_header_.use_128x128_superblock
+                          ? ((frame_header_.rows4x4 + 31) >> 5)
+                          : ((frame_header_.rows4x4 + 15) >> 4);
+  tile_info->sb_columns = sb_columns;
+  tile_info->sb_rows = sb_rows;
+  const int sb_shift = sequence_header_.use_128x128_superblock ? 5 : 4;
+  const int sb_size = 2 + sb_shift;
+  const int sb_max_tile_width = kMaxTileWidth >> sb_size;
+  const int sb_max_tile_area = kMaxTileArea >> MultiplyBy2(sb_size);
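+  // sb_size is the log2 superblock dimension in pixels (6 or 7). Per the
+  // spec limits MAX_TILE_WIDTH = 4096 and MAX_TILE_AREA = 4096 * 2304
+  // (mirrored by kMaxTileWidth and kMaxTileArea), 64x64 superblocks give
+  // sb_max_tile_width = 4096 >> 6 = 64 superblocks.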
+  const int minlog2_tile_columns = TileLog2(sb_max_tile_width, sb_columns);
+  const int maxlog2_tile_columns =
+      CeilLog2(std::min(sb_columns, static_cast<int>(kMaxTileColumns)));
+  const int maxlog2_tile_rows =
+      CeilLog2(std::min(sb_rows, static_cast<int>(kMaxTileRows)));
+  const int min_log2_tiles = std::max(
+      minlog2_tile_columns, TileLog2(sb_max_tile_area, sb_rows * sb_columns));
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  tile_info->uniform_spacing = scratch != 0;
+  if (tile_info->uniform_spacing) {
+    // Read tile columns.
+    tile_info->tile_columns_log2 = minlog2_tile_columns;
+    while (tile_info->tile_columns_log2 < maxlog2_tile_columns) {
+      OBU_READ_BIT_OR_FAIL;
+      if (scratch == 0) break;
+      ++tile_info->tile_columns_log2;
+    }
+
+    // Compute tile column starts.
+    const int sb_tile_width =
+        (sb_columns + (1 << tile_info->tile_columns_log2) - 1) >>
+        tile_info->tile_columns_log2;
+    if (sb_tile_width <= 0) return false;
+    int i = 0;
+    for (int sb_start = 0; sb_start < sb_columns; sb_start += sb_tile_width) {
+      if (i >= kMaxTileColumns) {
+        LIBGAV1_DLOG(ERROR,
+                     "tile_columns would be greater than kMaxTileColumns.");
+        return false;
+      }
+      tile_info->tile_column_start[i++] = sb_start << sb_shift;
+    }
+    tile_info->tile_column_start[i] = frame_header_.columns4x4;
+    tile_info->tile_columns = i;
+
+    // Read tile rows.
+    const int minlog2_tile_rows =
+        std::max(min_log2_tiles - tile_info->tile_columns_log2, 0);
+    tile_info->tile_rows_log2 = minlog2_tile_rows;
+    while (tile_info->tile_rows_log2 < maxlog2_tile_rows) {
+      OBU_READ_BIT_OR_FAIL;
+      if (scratch == 0) break;
+      ++tile_info->tile_rows_log2;
+    }
+
+    // Compute tile row starts.
+    const int sb_tile_height =
+        (sb_rows + (1 << tile_info->tile_rows_log2) - 1) >>
+        tile_info->tile_rows_log2;
+    if (sb_tile_height <= 0) return false;
+    i = 0;
+    for (int sb_start = 0; sb_start < sb_rows; sb_start += sb_tile_height) {
+      if (i >= kMaxTileRows) {
+        LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows.");
+        return false;
+      }
+      tile_info->tile_row_start[i++] = sb_start << sb_shift;
+    }
+    tile_info->tile_row_start[i] = frame_header_.rows4x4;
+    tile_info->tile_rows = i;
+  } else {
+    int widest_tile_sb = 1;
+    int i = 0;
+    for (int sb_start = 0; sb_start < sb_columns; ++i) {
+      if (i >= kMaxTileColumns) {
+        LIBGAV1_DLOG(ERROR,
+                     "tile_columns would be greater than kMaxTileColumns.");
+        return false;
+      }
+      tile_info->tile_column_start[i] = sb_start << sb_shift;
+      const int max_width =
+          std::min(sb_columns - sb_start, static_cast<int>(sb_max_tile_width));
+      if (!bit_reader_->DecodeUniform(
+              max_width, &tile_info->tile_column_width_in_superblocks[i])) {
+        LIBGAV1_DLOG(ERROR, "Not enough bits.");
+        return false;
+      }
+      ++tile_info->tile_column_width_in_superblocks[i];
+      widest_tile_sb = std::max(tile_info->tile_column_width_in_superblocks[i],
+                                widest_tile_sb);
+      sb_start += tile_info->tile_column_width_in_superblocks[i];
+    }
+    tile_info->tile_column_start[i] = frame_header_.columns4x4;
+    tile_info->tile_columns = i;
+    tile_info->tile_columns_log2 = CeilLog2(tile_info->tile_columns);
+
+    int max_tile_area_sb = sb_rows * sb_columns;
+    if (min_log2_tiles > 0) max_tile_area_sb >>= min_log2_tiles + 1;
+    const int max_tile_height_sb =
+        std::max(max_tile_area_sb / widest_tile_sb, 1);
+
+    i = 0;
+    for (int sb_start = 0; sb_start < sb_rows; ++i) {
+      if (i >= kMaxTileRows) {
+        LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows.");
+        return false;
+      }
+      tile_info->tile_row_start[i] = sb_start << sb_shift;
+      const int max_height = std::min(sb_rows - sb_start, max_tile_height_sb);
+      if (!bit_reader_->DecodeUniform(
+              max_height, &tile_info->tile_row_height_in_superblocks[i])) {
+        LIBGAV1_DLOG(ERROR, "Not enough bits.");
+        return false;
+      }
+      ++tile_info->tile_row_height_in_superblocks[i];
+      sb_start += tile_info->tile_row_height_in_superblocks[i];
+    }
+    tile_info->tile_row_start[i] = frame_header_.rows4x4;
+    tile_info->tile_rows = i;
+    tile_info->tile_rows_log2 = CeilLog2(tile_info->tile_rows);
+  }
+  tile_info->tile_count = tile_info->tile_rows * tile_info->tile_columns;
+  if (!tile_buffers_.reserve(tile_info->tile_count)) {
+    LIBGAV1_DLOG(ERROR, "Unable to allocate memory for tile_buffers_.");
+    return false;
+  }
+  tile_info->context_update_id = 0;
+  const int tile_bits =
+      tile_info->tile_columns_log2 + tile_info->tile_rows_log2;
+  if (tile_bits != 0) {
+    OBU_READ_LITERAL_OR_FAIL(tile_bits);
+    tile_info->context_update_id = static_cast<int16_t>(scratch);
+    if (tile_info->context_update_id >= tile_info->tile_count) {
+      LIBGAV1_DLOG(ERROR, "Invalid context_update_tile_id (%d) >= %d.",
+                   tile_info->context_update_id, tile_info->tile_count);
+      return false;
+    }
+    OBU_READ_LITERAL_OR_FAIL(2);
+    tile_info->tile_size_bytes = 1 + scratch;
+  }
+  return true;
+}
+
+bool ObuParser::ReadAllowWarpedMotion() {
+  if (IsIntraFrame(frame_header_.frame_type) ||
+      frame_header_.error_resilient_mode ||
+      !sequence_header_.enable_warped_motion) {
+    return true;
+  }
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  frame_header_.allow_warped_motion = scratch != 0;
+  return true;
+}
+
+bool ObuParser::ParseFrameParameters() {
+  int64_t scratch;
+  if (sequence_header_.reduced_still_picture_header) {
+    frame_header_.show_frame = true;
+    if (!EnsureCurrentFrameIsNotNull()) return false;
+  } else {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.show_existing_frame = scratch != 0;
+    if (frame_header_.show_existing_frame) {
+      OBU_READ_LITERAL_OR_FAIL(3);
+      frame_header_.frame_to_show = scratch;
+      if (sequence_header_.decoder_model_info_present_flag &&
+          !sequence_header_.timing_info.equal_picture_interval) {
+        OBU_READ_LITERAL_OR_FAIL(
+            sequence_header_.decoder_model_info.frame_presentation_time_length);
+        frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch);
+      }
+      if (sequence_header_.frame_id_numbers_present) {
+        OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits);
+        frame_header_.display_frame_id = static_cast<uint16_t>(scratch);
+        // Section 6.8.2: It is a requirement of bitstream conformance that
+        // whenever display_frame_id is read, the value matches
+        // RefFrameId[ frame_to_show_map_idx ] ..., and that
+        // RefValid[ frame_to_show_map_idx ] is equal to 1.
+        //
+        // The current_frame_ == nullptr check below is equivalent to checking
+        // if RefValid[ frame_to_show_map_idx ] is equal to 1.
+        if (frame_header_.display_frame_id !=
+            decoder_state_.reference_frame_id[frame_header_.frame_to_show]) {
+          LIBGAV1_DLOG(ERROR,
+                       "Reference buffer %d has a frame id number mismatch.",
+                       frame_header_.frame_to_show);
+          return false;
+        }
+      }
+      // Section 7.18.2. Note: This is also needed for Section 7.21 if
+      // frame_type is kFrameKey.
+      current_frame_ =
+          decoder_state_.reference_frame[frame_header_.frame_to_show];
+      if (current_frame_ == nullptr) {
+        LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame",
+                     frame_header_.frame_to_show);
+        return false;
+      }
+      // Section 6.8.2: It is a requirement of bitstream conformance that
+      // when show_existing_frame is used to show a previous frame, that the
+      // value of showable_frame for the previous frame was equal to 1.
+      if (!current_frame_->showable_frame()) {
+        LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a showable frame",
+                     frame_header_.frame_to_show);
+        return false;
+      }
+      if (current_frame_->frame_type() == kFrameKey) {
+        frame_header_.refresh_frame_flags = 0xff;
+        // Section 6.8.2: It is a requirement of bitstream conformance that
+        // when show_existing_frame is used to show a previous frame with
+        // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that
+        // the frame is output via the show_existing_frame mechanism at most
+        // once.
+        current_frame_->set_showable_frame(false);
+
+        // Section 7.21. Note: decoder_state_.current_frame_id must be set
+        // only when frame_type is kFrameKey per the spec. Among all the
+        // variables set in Section 7.21, current_frame_id is the only one
+        // whose value lives across frames. (PrevFrameID is set equal to the
+        // current_frame_id value for the previous frame.)
+        decoder_state_.current_frame_id =
+            decoder_state_.reference_frame_id[frame_header_.frame_to_show];
+        decoder_state_.order_hint =
+            decoder_state_.reference_order_hint[frame_header_.frame_to_show];
+      }
+      return true;
+    }
+    if (!EnsureCurrentFrameIsNotNull()) return false;
+    OBU_READ_LITERAL_OR_FAIL(2);
+    frame_header_.frame_type = static_cast<FrameType>(scratch);
+    current_frame_->set_frame_type(frame_header_.frame_type);
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.show_frame = scratch != 0;
+    if (frame_header_.show_frame &&
+        sequence_header_.decoder_model_info_present_flag &&
+        !sequence_header_.timing_info.equal_picture_interval) {
+      OBU_READ_LITERAL_OR_FAIL(
+          sequence_header_.decoder_model_info.frame_presentation_time_length);
+      frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch);
+    }
+    if (frame_header_.show_frame) {
+      frame_header_.showable_frame = (frame_header_.frame_type != kFrameKey);
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.showable_frame = scratch != 0;
+    }
+    current_frame_->set_showable_frame(frame_header_.showable_frame);
+    if (frame_header_.frame_type == kFrameSwitch ||
+        (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) {
+      frame_header_.error_resilient_mode = true;
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.error_resilient_mode = scratch != 0;
+    }
+  }
+  if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) {
+    decoder_state_.reference_order_hint.fill(0);
+    decoder_state_.reference_frame.fill(nullptr);
+  }
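+  // The syntax element read here is disable_cdf_update; the stored flag is
+  // its negation.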
+  OBU_READ_BIT_OR_FAIL;
+  frame_header_.enable_cdf_update = scratch == 0;
+  if (sequence_header_.force_screen_content_tools ==
+      kSelectScreenContentTools) {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.allow_screen_content_tools = scratch != 0;
+  } else {
+    frame_header_.allow_screen_content_tools =
+        sequence_header_.force_screen_content_tools != 0;
+  }
+  if (frame_header_.allow_screen_content_tools) {
+    if (sequence_header_.force_integer_mv == kSelectIntegerMv) {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.force_integer_mv = scratch;
+    } else {
+      frame_header_.force_integer_mv = sequence_header_.force_integer_mv;
+    }
+  } else {
+    frame_header_.force_integer_mv = 0;
+  }
+  if (IsIntraFrame(frame_header_.frame_type)) {
+    frame_header_.force_integer_mv = 1;
+  }
+  if (sequence_header_.frame_id_numbers_present) {
+    OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits);
+    frame_header_.current_frame_id = static_cast<uint16_t>(scratch);
+    const int previous_frame_id = decoder_state_.current_frame_id;
+    decoder_state_.current_frame_id = frame_header_.current_frame_id;
+    if (frame_header_.frame_type != kFrameKey || !frame_header_.show_frame) {
+      if (previous_frame_id >= 0) {
+        // Section 6.8.2: ..., it is a requirement of bitstream conformance
+        // that all of the following conditions are true:
+        //   * current_frame_id is not equal to PrevFrameID,
+        //   * DiffFrameID is less than 1 << ( idLen - 1 )
+        int diff_frame_id = decoder_state_.current_frame_id - previous_frame_id;
+        const int id_length_max_value =
+            1 << sequence_header_.frame_id_length_bits;
+        if (diff_frame_id <= 0) {
+          diff_frame_id += id_length_max_value;
+        }
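+        // When the two ids are equal, diff_frame_id is id_length_max_value
+        // here, so the check below (DivideBy2(id_length_max_value) equals
+        // 1 << (idLen - 1)) also rejects current_frame_id == PrevFrameID.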
+        if (diff_frame_id >= DivideBy2(id_length_max_value)) {
+          LIBGAV1_DLOG(ERROR,
+                       "current_frame_id (%d) equals or differs too much from "
+                       "previous_frame_id (%d).",
+                       decoder_state_.current_frame_id, previous_frame_id);
+          return false;
+        }
+      }
+      MarkInvalidReferenceFrames();
+    }
+  } else {
+    frame_header_.current_frame_id = 0;
+    decoder_state_.current_frame_id = frame_header_.current_frame_id;
+  }
+  if (frame_header_.frame_type == kFrameSwitch) {
+    frame_header_.frame_size_override_flag = true;
+  } else if (!sequence_header_.reduced_still_picture_header) {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.frame_size_override_flag = scratch != 0;
+  }
+  if (sequence_header_.order_hint_bits > 0) {
+    OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits);
+    frame_header_.order_hint = scratch;
+  }
+  decoder_state_.order_hint = frame_header_.order_hint;
+  if (IsIntraFrame(frame_header_.frame_type) ||
+      frame_header_.error_resilient_mode) {
+    frame_header_.primary_reference_frame = kPrimaryReferenceNone;
+  } else {
+    OBU_READ_LITERAL_OR_FAIL(3);
+    frame_header_.primary_reference_frame = scratch;
+  }
+  if (sequence_header_.decoder_model_info_present_flag) {
+    OBU_READ_BIT_OR_FAIL;
+    const bool buffer_removal_time_present = scratch != 0;
+    if (buffer_removal_time_present) {
+      for (int i = 0; i < sequence_header_.operating_points; ++i) {
+        if (!sequence_header_.decoder_model_present_for_operating_point[i]) {
+          continue;
+        }
+        const int index = sequence_header_.operating_point_idc[i];
+        if (index == 0 ||
+            (InTemporalLayer(index, obu_headers_.back().temporal_id) &&
+             InSpatialLayer(index, obu_headers_.back().spatial_id))) {
+          OBU_READ_LITERAL_OR_FAIL(
+              sequence_header_.decoder_model_info.buffer_removal_time_length);
+          frame_header_.buffer_removal_time[i] = static_cast<uint32_t>(scratch);
+        }
+      }
+    }
+  }
+  if (frame_header_.frame_type == kFrameSwitch ||
+      (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) {
+    frame_header_.refresh_frame_flags = 0xff;
+  } else {
+    OBU_READ_LITERAL_OR_FAIL(8);
+    frame_header_.refresh_frame_flags = scratch;
+    // Section 6.8.2: If frame_type is equal to INTRA_ONLY_FRAME, it is a
+    // requirement of bitstream conformance that refresh_frame_flags is not
+    // equal to 0xff.
+    if (frame_header_.frame_type == kFrameIntraOnly &&
+        frame_header_.refresh_frame_flags == 0xff) {
+      LIBGAV1_DLOG(ERROR, "Intra only frames cannot have refresh flags 0xFF.");
+      return false;
+    }
+  }
+  if ((!IsIntraFrame(frame_header_.frame_type) ||
+       frame_header_.refresh_frame_flags != 0xff) &&
+      !ParseReferenceOrderHint()) {
+    return false;
+  }
+  if (IsIntraFrame(frame_header_.frame_type)) {
+    if (!ParseFrameSizeAndRenderSize()) return false;
+    if (frame_header_.allow_screen_content_tools &&
+        frame_header_.width == frame_header_.upscaled_width) {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.allow_intrabc = scratch != 0;
+    }
+  } else {
+    if (!sequence_header_.enable_order_hint) {
+      frame_header_.frame_refs_short_signaling = false;
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.frame_refs_short_signaling = scratch != 0;
+      if (frame_header_.frame_refs_short_signaling) {
+        OBU_READ_LITERAL_OR_FAIL(3);
+        const int8_t last_frame_idx = scratch;
+        OBU_READ_LITERAL_OR_FAIL(3);
+        const int8_t gold_frame_idx = scratch;
+        if (!SetFrameReferences(last_frame_idx, gold_frame_idx)) {
+          return false;
+        }
+      }
+    }
+    for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+      if (!frame_header_.frame_refs_short_signaling) {
+        OBU_READ_LITERAL_OR_FAIL(3);
+        frame_header_.reference_frame_index[i] = scratch;
+      }
+      const int reference_frame_index = frame_header_.reference_frame_index[i];
+      assert(reference_frame_index >= 0);
+      // Section 6.8.2: It is a requirement of bitstream conformance that
+      // RefValid[ ref_frame_idx[ i ] ] is equal to 1 ...
+      // The remainder of the statement is handled by ParseSequenceHeader().
+      // Note if support for Annex C: Error resilience behavior is added this
+      // check should be omitted per C.5 Decoder consequences of processable
+      // frames.
+      if (decoder_state_.reference_frame[reference_frame_index] == nullptr) {
+        LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i,
+                     reference_frame_index);
+        return false;
+      }
+      if (sequence_header_.frame_id_numbers_present) {
+        OBU_READ_LITERAL_OR_FAIL(sequence_header_.delta_frame_id_length_bits);
+        const int delta_frame_id = static_cast<int>(1 + scratch);
+        const int id_length_max_value =
+            1 << sequence_header_.frame_id_length_bits;
+        frame_header_.expected_frame_id[i] =
+            (frame_header_.current_frame_id + id_length_max_value -
+             delta_frame_id) %
+            id_length_max_value;
+        // Section 6.8.2: It is a requirement of bitstream conformance that
+        // whenever expectedFrameId[ i ] is calculated, the value matches
+        // RefFrameId[ ref_frame_idx[ i ] ] ...
+        if (frame_header_.expected_frame_id[i] !=
+            decoder_state_.reference_frame_id[reference_frame_index]) {
+          LIBGAV1_DLOG(ERROR,
+                       "Reference buffer %d has a frame id number mismatch.",
+                       reference_frame_index);
+          return false;
+        }
+      }
+    }
+    if (frame_header_.frame_size_override_flag &&
+        !frame_header_.error_resilient_mode) {
+      // Section 5.9.7.
+      for (int index : frame_header_.reference_frame_index) {
+        OBU_READ_BIT_OR_FAIL;
+        frame_header_.found_reference = scratch != 0;
+        if (frame_header_.found_reference) {
+          const RefCountedBuffer* reference_frame =
+              decoder_state_.reference_frame[index].get();
+          // frame_header_.upscaled_width will be set in the
+          // ParseSuperResParametersAndComputeImageSize() call below.
+          frame_header_.width = reference_frame->upscaled_width();
+          frame_header_.height = reference_frame->frame_height();
+          frame_header_.render_width = reference_frame->render_width();
+          frame_header_.render_height = reference_frame->render_height();
+          if (!ParseSuperResParametersAndComputeImageSize()) return false;
+          break;
+        }
+      }
+      if (!frame_header_.found_reference && !ParseFrameSizeAndRenderSize()) {
+        return false;
+      }
+    } else {
+      if (!ParseFrameSizeAndRenderSize()) return false;
+    }
+    if (!ValidateInterFrameSize()) return false;
+    if (frame_header_.force_integer_mv != 0) {
+      frame_header_.allow_high_precision_mv = false;
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.allow_high_precision_mv = scratch != 0;
+    }
+    OBU_READ_BIT_OR_FAIL;
+    const bool is_filter_switchable = scratch != 0;
+    if (is_filter_switchable) {
+      frame_header_.interpolation_filter = kInterpolationFilterSwitchable;
+    } else {
+      OBU_READ_LITERAL_OR_FAIL(2);
+      frame_header_.interpolation_filter =
+          static_cast<InterpolationFilter>(scratch);
+    }
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.is_motion_mode_switchable = scratch != 0;
+    if (frame_header_.error_resilient_mode ||
+        !sequence_header_.enable_ref_frame_mvs) {
+      frame_header_.use_ref_frame_mvs = false;
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.use_ref_frame_mvs = scratch != 0;
+    }
+  }
+  // At this point, we have parsed the frame and render sizes and computed
+  // the image size, whether it's an intra or inter frame. So we can save
+  // the sizes in the current frame now.
+  if (!current_frame_->SetFrameDimensions(frame_header_)) {
+    LIBGAV1_DLOG(ERROR, "Setting current frame dimensions failed.");
+    return false;
+  }
+  if (!IsIntraFrame(frame_header_.frame_type)) {
+    // Initialize the kReferenceFrameIntra type reference frame information to
+    // simplify the frame type validation in motion field projection.
+    // Set the kReferenceFrameIntra type |order_hint_| to
+    // |frame_header_.order_hint|. This guarantees that SIMD implementations
+    // can correctly initialize the remaining kReferenceFrameIntra entries by
+    // running the loop below with |frame_header_.order_hint| as the |hint|.
+    ReferenceInfo* const reference_info = current_frame_->reference_info();
+    reference_info->order_hint[kReferenceFrameIntra] = frame_header_.order_hint;
+    reference_info->relative_distance_from[kReferenceFrameIntra] = 0;
+    reference_info->relative_distance_to[kReferenceFrameIntra] = 0;
+    reference_info->skip_references[kReferenceFrameIntra] = true;
+    reference_info->projection_divisions[kReferenceFrameIntra] = 0;
+
+    for (int i = kReferenceFrameLast; i <= kNumInterReferenceFrameTypes; ++i) {
+      const auto reference_frame = static_cast<ReferenceFrameType>(i);
+      const uint8_t hint =
+          decoder_state_.reference_order_hint
+              [frame_header_.reference_frame_index[i - kReferenceFrameLast]];
+      reference_info->order_hint[reference_frame] = hint;
+      const int relative_distance_from =
+          GetRelativeDistance(hint, frame_header_.order_hint,
+                              sequence_header_.order_hint_shift_bits);
+      const int relative_distance_to =
+          GetRelativeDistance(frame_header_.order_hint, hint,
+                              sequence_header_.order_hint_shift_bits);
+      reference_info->relative_distance_from[reference_frame] =
+          relative_distance_from;
+      reference_info->relative_distance_to[reference_frame] =
+          relative_distance_to;
+      reference_info->skip_references[reference_frame] =
+          relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0;
+      reference_info->projection_divisions[reference_frame] =
+          reference_info->skip_references[reference_frame]
+              ? 0
+              : kProjectionMvDivisionLookup[relative_distance_to];
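+      // A positive distance from the current frame means the reference is
+      // ahead of it in display order; such references get sign bias 1.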
+      decoder_state_.reference_frame_sign_bias[reference_frame] =
+          relative_distance_from > 0;
+    }
+  }
+  if (frame_header_.enable_cdf_update &&
+      !sequence_header_.reduced_still_picture_header) {
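+    // The syntax element read here is disable_frame_end_update_cdf.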
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.enable_frame_end_update_cdf = scratch == 0;
+  } else {
+    frame_header_.enable_frame_end_update_cdf = false;
+  }
+  return true;
+}
+
+bool ObuParser::ParseFrameHeader() {
+  // Section 6.8.1: It is a requirement of bitstream conformance that a
+  // sequence header OBU has been received before a frame header OBU.
+  if (!has_sequence_header_) return false;
+  if (!ParseFrameParameters()) return false;
+  if (frame_header_.show_existing_frame) return true;
+  assert(!obu_headers_.empty());
+  current_frame_->set_spatial_id(obu_headers_.back().spatial_id);
+  current_frame_->set_temporal_id(obu_headers_.back().temporal_id);
+  bool status = ParseTileInfoSyntax() && ParseQuantizerParameters() &&
+                ParseSegmentationParameters();
+  if (!status) return false;
+  current_frame_->SetSegmentationParameters(frame_header_.segmentation);
+  status =
+      ParseQuantizerIndexDeltaParameters() && ParseLoopFilterDeltaParameters();
+  if (!status) return false;
+  ComputeSegmentLosslessAndQIndex();
+  // Section 6.8.2: It is a requirement of bitstream conformance that
+  // delta_q_present is equal to 0 when CodedLossless is equal to 1.
+  if (frame_header_.coded_lossless && frame_header_.delta_q.present) {
+    return false;
+  }
+  status = ParseLoopFilterParameters();
+  if (!status) return false;
+  current_frame_->SetLoopFilterDeltas(frame_header_.loop_filter);
+  status = ParseCdefParameters() && ParseLoopRestorationParameters() &&
+           ParseTxModeSyntax() && ParseFrameReferenceModeSyntax() &&
+           ParseSkipModeParameters() && ReadAllowWarpedMotion();
+  if (!status) return false;
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  frame_header_.reduced_tx_set = scratch != 0;
+  status = ParseGlobalMotionParameters();
+  if (!status) return false;
+  current_frame_->SetGlobalMotions(frame_header_.global_motion);
+  status = ParseFilmGrainParameters();
+  if (!status) return false;
+  if (sequence_header_.film_grain_params_present) {
+    current_frame_->set_film_grain_params(frame_header_.film_grain_params);
+  }
+  return true;
+}
+
+bool ObuParser::ParsePadding(const uint8_t* data, size_t size) {
+  // The spec allows a padding OBU to be header-only (i.e., |size| = 0). So
+  // check trailing bits only if |size| > 0.
+  if (size == 0) return true;
+  // The payload of a padding OBU is byte aligned. Therefore the first
+  // trailing byte should be 0x80. See https://crbug.com/aomedia/2393.
+  const int i = GetLastNonzeroByteIndex(data, size);
+  if (i < 0) {
+    LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
+    return false;
+  }
+  if (data[i] != 0x80) {
+    LIBGAV1_DLOG(
+        ERROR,
+        "The last nonzero byte of the payload data is 0x%x, should be 0x80.",
+        data[i]);
+    return false;
+  }
+  // Skip all bits before the trailing bit.
+  bit_reader_->SkipBytes(i);
+  return true;
+}
+
+bool ObuParser::ParseMetadataScalability() {
+  int64_t scratch;
+  // scalability_mode_idc
+  OBU_READ_LITERAL_OR_FAIL(8);
+  const auto scalability_mode_idc = static_cast<int>(scratch);
+  if (scalability_mode_idc == kScalabilitySS) {
+    // Parse scalability_structure().
+    // spatial_layers_cnt_minus_1
+    OBU_READ_LITERAL_OR_FAIL(2);
+    const auto spatial_layers_count = static_cast<int>(scratch) + 1;
+    // spatial_layer_dimensions_present_flag
+    OBU_READ_BIT_OR_FAIL;
+    const auto spatial_layer_dimensions_present_flag = scratch != 0;
+    // spatial_layer_description_present_flag
+    OBU_READ_BIT_OR_FAIL;
+    const auto spatial_layer_description_present_flag = scratch != 0;
+    // temporal_group_description_present_flag
+    OBU_READ_BIT_OR_FAIL;
+    const auto temporal_group_description_present_flag = scratch != 0;
+    // scalability_structure_reserved_3bits
+    OBU_READ_LITERAL_OR_FAIL(3);
+    if (scratch != 0) {
+      LIBGAV1_DLOG(WARNING,
+                   "scalability_structure_reserved_3bits is not zero.");
+    }
+    if (spatial_layer_dimensions_present_flag) {
+      for (int i = 0; i < spatial_layers_count; ++i) {
+        // spatial_layer_max_width[i]
+        OBU_READ_LITERAL_OR_FAIL(16);
+        // spatial_layer_max_height[i]
+        OBU_READ_LITERAL_OR_FAIL(16);
+      }
+    }
+    if (spatial_layer_description_present_flag) {
+      for (int i = 0; i < spatial_layers_count; ++i) {
+        // spatial_layer_ref_id[i]
+        OBU_READ_LITERAL_OR_FAIL(8);
+      }
+    }
+    if (temporal_group_description_present_flag) {
+      // temporal_group_size
+      OBU_READ_LITERAL_OR_FAIL(8);
+      const auto temporal_group_size = static_cast<int>(scratch);
+      for (int i = 0; i < temporal_group_size; ++i) {
+        // temporal_group_temporal_id[i]
+        OBU_READ_LITERAL_OR_FAIL(3);
+        // temporal_group_temporal_switching_up_point_flag[i]
+        OBU_READ_BIT_OR_FAIL;
+        // temporal_group_spatial_switching_up_point_flag[i]
+        OBU_READ_BIT_OR_FAIL;
+        // temporal_group_ref_cnt[i]
+        OBU_READ_LITERAL_OR_FAIL(3);
+        const auto temporal_group_ref_count = static_cast<int>(scratch);
+        for (int j = 0; j < temporal_group_ref_count; ++j) {
+          // temporal_group_ref_pic_diff[i][j]
+          OBU_READ_LITERAL_OR_FAIL(8);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+bool ObuParser::ParseMetadataTimecode() {
+  int64_t scratch;
+  // counting_type: should be the same for all pictures in the coded video
+  // sequence. 7..31 are reserved.
+  OBU_READ_LITERAL_OR_FAIL(5);
+  // full_timestamp_flag
+  OBU_READ_BIT_OR_FAIL;
+  const bool full_timestamp_flag = scratch != 0;
+  // discontinuity_flag
+  OBU_READ_BIT_OR_FAIL;
+  // cnt_dropped_flag
+  OBU_READ_BIT_OR_FAIL;
+  // n_frames
+  OBU_READ_LITERAL_OR_FAIL(9);
+  if (full_timestamp_flag) {
+    // seconds_value
+    OBU_READ_LITERAL_OR_FAIL(6);
+    const auto seconds_value = static_cast<int>(scratch);
+    if (seconds_value > 59) {
+      LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value);
+      return false;
+    }
+    // minutes_value
+    OBU_READ_LITERAL_OR_FAIL(6);
+    const auto minutes_value = static_cast<int>(scratch);
+    if (minutes_value > 59) {
+      LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value);
+      return false;
+    }
+    // hours_value
+    OBU_READ_LITERAL_OR_FAIL(5);
+    const auto hours_value = static_cast<int>(scratch);
+    if (hours_value > 23) {
+      LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value);
+      return false;
+    }
+  } else {
+    // seconds_flag
+    OBU_READ_BIT_OR_FAIL;
+    const bool seconds_flag = scratch != 0;
+    if (seconds_flag) {
+      // seconds_value
+      OBU_READ_LITERAL_OR_FAIL(6);
+      const auto seconds_value = static_cast<int>(scratch);
+      if (seconds_value > 59) {
+        LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value);
+        return false;
+      }
+      // minutes_flag
+      OBU_READ_BIT_OR_FAIL;
+      const bool minutes_flag = scratch != 0;
+      if (minutes_flag) {
+        // minutes_value
+        OBU_READ_LITERAL_OR_FAIL(6);
+        const auto minutes_value = static_cast<int>(scratch);
+        if (minutes_value > 59) {
+          LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value);
+          return false;
+        }
+        // hours_flag
+        OBU_READ_BIT_OR_FAIL;
+        const bool hours_flag = scratch != 0;
+        if (hours_flag) {
+          // hours_value
+          OBU_READ_LITERAL_OR_FAIL(5);
+          const auto hours_value = static_cast<int>(scratch);
+          if (hours_value > 23) {
+            LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value);
+            return false;
+          }
+        }
+      }
+    }
+  }
+  // time_offset_length: should be the same for all pictures in the coded
+  // video sequence.
+  OBU_READ_LITERAL_OR_FAIL(5);
+  const auto time_offset_length = static_cast<int>(scratch);
+  if (time_offset_length > 0) {
+    // time_offset_value
+    OBU_READ_LITERAL_OR_FAIL(time_offset_length);
+  }
+  // Note on clockTimestamp (not computed here). Section 6.7.7:
+  //   When timing_info_present_flag is equal to 1 and discontinuity_flag is
+  //   equal to 0, the value of clockTimestamp shall be greater than or equal
+  //   to the value of clockTimestamp for the previous set of clock timestamp
+  //   syntax elements in output order.
+  return true;
+}
+
+bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) {
+  const size_t start_offset = bit_reader_->byte_offset();
+  size_t metadata_type;
+  if (!bit_reader_->ReadUnsignedLeb128(&metadata_type)) {
+    LIBGAV1_DLOG(ERROR, "Could not read metadata_type.");
+    return false;
+  }
+  const size_t metadata_type_size = bit_reader_->byte_offset() - start_offset;
+  if (size < metadata_type_size) {
+    LIBGAV1_DLOG(
+        ERROR, "metadata_type is longer than metadata OBU payload %zu vs %zu.",
+        metadata_type_size, size);
+    return false;
+  }
+  data += metadata_type_size;
+  size -= metadata_type_size;
+  int64_t scratch;
+  switch (metadata_type) {
+    case kMetadataTypeHdrContentLightLevel: {
+      ObuMetadataHdrCll hdr_cll;
+      OBU_READ_LITERAL_OR_FAIL(16);
+      hdr_cll.max_cll = scratch;
+      OBU_READ_LITERAL_OR_FAIL(16);
+      hdr_cll.max_fall = scratch;
+      if (!EnsureCurrentFrameIsNotNull()) return false;
+      current_frame_->set_hdr_cll(hdr_cll);
+      break;
+    }
+    case kMetadataTypeHdrMasteringDisplayColorVolume: {
+      ObuMetadataHdrMdcv hdr_mdcv;
+      for (int i = 0; i < 3; ++i) {
+        OBU_READ_LITERAL_OR_FAIL(16);
+        hdr_mdcv.primary_chromaticity_x[i] = scratch;
+        OBU_READ_LITERAL_OR_FAIL(16);
+        hdr_mdcv.primary_chromaticity_y[i] = scratch;
+      }
+      OBU_READ_LITERAL_OR_FAIL(16);
+      hdr_mdcv.white_point_chromaticity_x = scratch;
+      OBU_READ_LITERAL_OR_FAIL(16);
+      hdr_mdcv.white_point_chromaticity_y = scratch;
+      OBU_READ_LITERAL_OR_FAIL(32);
+      hdr_mdcv.luminance_max = static_cast<uint32_t>(scratch);
+      OBU_READ_LITERAL_OR_FAIL(32);
+      hdr_mdcv.luminance_min = static_cast<uint32_t>(scratch);
+      if (!EnsureCurrentFrameIsNotNull()) return false;
+      current_frame_->set_hdr_mdcv(hdr_mdcv);
+      break;
+    }
+    case kMetadataTypeScalability:
+      if (!ParseMetadataScalability()) return false;
+      break;
+    case kMetadataTypeItutT35: {
+      ObuMetadataItutT35 itut_t35;
+      OBU_READ_LITERAL_OR_FAIL(8);
+      itut_t35.country_code = static_cast<uint8_t>(scratch);
+      ++data;
+      --size;
+      if (itut_t35.country_code == 0xFF) {
+        OBU_READ_LITERAL_OR_FAIL(8);
+        itut_t35.country_code_extension_byte = static_cast<uint8_t>(scratch);
+        ++data;
+        --size;
+      }
+      // Read itut_t35.payload_bytes. Section 6.7.2 of the spec says:
+      //   itut_t35.payload_bytes shall be bytes containing data registered as
+      //   specified in Recommendation ITU-T T.35.
+      // Therefore itut_t35.payload_bytes is byte aligned and the first trailing
+      // byte should be 0x80. Since the exact syntax of itut_t35.payload_bytes
+      // is not defined in the AV1 spec, identify the end of
+      // itut_t35.payload_bytes by searching for the trailing bit.
+      const int i = GetLastNonzeroByteIndex(data, size);
+      if (i < 0) {
+        LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
+        return false;
+      }
+      if (data[i] != 0x80) {
+        LIBGAV1_DLOG(
+            ERROR,
+            "itut_t35.payload_bytes is not byte aligned. The last nonzero byte "
+            "of the payload data is 0x%x, should be 0x80.",
+            data[i]);
+        return false;
+      }
+      itut_t35.payload_size = i;
+      if (!EnsureCurrentFrameIsNotNull() ||
+          !current_frame_->set_itut_t35(itut_t35, data)) {
+        return false;
+      }
+      // Skip all bits before the trailing bit.
+      bit_reader_->SkipBytes(i);
+      break;
+    }
+    case kMetadataTypeTimecode:
+      if (!ParseMetadataTimecode()) return false;
+      break;
+    default: {
+      // metadata_type is equal to a value reserved for future use or a user
+      // private value.
+      //
+      // The Note in Section 5.8.1 says "Decoders should ignore the entire OBU
+      // if they do not understand the metadata_type." Find the trailing bit
+      // and skip all bits before the trailing bit.
+      const int i = GetLastNonzeroByteIndex(data, size);
+      if (i >= 0) {
+        // The last 1 bit in the last nonzero byte is the trailing bit. Skip
+        // all bits before the trailing bit.
+        const int n = CountTrailingZeros(data[i]);
+        bit_reader_->SkipBits(i * 8 + 7 - n);
+      }
+      break;
+    }
+  }
+  return true;
+}
+
+bool ObuParser::AddTileBuffers(int start, int end, size_t total_size,
+                               size_t tg_header_size,
+                               size_t bytes_consumed_so_far) {
+  // Validate that the tile group start and end are within the allowed range.
+  if (start != next_tile_group_start_ || start > end ||
+      end >= frame_header_.tile_info.tile_count) {
+    LIBGAV1_DLOG(ERROR,
+                 "Invalid tile group start %d or end %d: expected tile group "
+                 "start %d, tile_count %d.",
+                 start, end, next_tile_group_start_,
+                 frame_header_.tile_info.tile_count);
+    return false;
+  }
+  next_tile_group_start_ = end + 1;
+
+  if (total_size < tg_header_size) {
+    LIBGAV1_DLOG(ERROR, "total_size (%zu) is less than tg_header_size (%zu).)",
+                 total_size, tg_header_size);
+    return false;
+  }
+  size_t bytes_left = total_size - tg_header_size;
+  const uint8_t* data = data_ + bytes_consumed_so_far + tg_header_size;
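+  // Tile group payload layout: every tile except the last is preceded by a
+  // little-endian tile_size_bytes field holding (tile size - 1); the last
+  // tile occupies all of the remaining bytes.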
+  for (int tile_number = start; tile_number <= end; ++tile_number) {
+    size_t tile_size = 0;
+    if (tile_number != end) {
+      RawBitReader bit_reader(data, bytes_left);
+      if (!bit_reader.ReadLittleEndian(frame_header_.tile_info.tile_size_bytes,
+                                       &tile_size)) {
+        LIBGAV1_DLOG(ERROR, "Could not read tile size for tile #%d",
+                     tile_number);
+        return false;
+      }
+      ++tile_size;
+      data += frame_header_.tile_info.tile_size_bytes;
+      bytes_left -= frame_header_.tile_info.tile_size_bytes;
+      if (tile_size > bytes_left) {
+        LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
+                     tile_number);
+        return false;
+      }
+    } else {
+      tile_size = bytes_left;
+      if (tile_size == 0) {
+        LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
+                     tile_number);
+        return false;
+      }
+    }
+    // The memory for this has been allocated in ParseTileInfoSyntax(). So it is
+    // safe to use push_back_unchecked here.
+    tile_buffers_.push_back_unchecked({data, tile_size});
+    data += tile_size;
+    bytes_left -= tile_size;
+  }
+  bit_reader_->SkipBytes(total_size - tg_header_size);
+  return true;
+}
+
+bool ObuParser::ParseTileGroup(size_t size, size_t bytes_consumed_so_far) {
+  const TileInfo* const tile_info = &frame_header_.tile_info;
+  const size_t start_offset = bit_reader_->byte_offset();
+  const int tile_bits =
+      tile_info->tile_columns_log2 + tile_info->tile_rows_log2;
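+  // tile_bits == 0 implies a single tile, in which case the tile group
+  // header is empty: neither tile_start_and_end_present_flag nor
+  // tg_start/tg_end is coded.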
+  if (tile_bits == 0) {
+    return AddTileBuffers(0, 0, size, 0, bytes_consumed_so_far);
+  }
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  const bool tile_start_and_end_present_flag = scratch != 0;
+  if (!tile_start_and_end_present_flag) {
+    if (!bit_reader_->AlignToNextByte()) {
+      LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+      return false;
+    }
+    return AddTileBuffers(0, tile_info->tile_count - 1, size, 1,
+                          bytes_consumed_so_far);
+  }
+  if (obu_headers_.back().type == kObuFrame) {
+    // 6.10.1: If obu_type is equal to OBU_FRAME, it is a requirement of
+    // bitstream conformance that the value of tile_start_and_end_present_flag
+    // is equal to 0.
+    LIBGAV1_DLOG(ERROR,
+                 "tile_start_and_end_present_flag must be 0 in Frame OBU");
+    return false;
+  }
+  OBU_READ_LITERAL_OR_FAIL(tile_bits);
+  const int start = static_cast<int>(scratch);
+  OBU_READ_LITERAL_OR_FAIL(tile_bits);
+  const int end = static_cast<int>(scratch);
+  if (!bit_reader_->AlignToNextByte()) {
+    LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+    return false;
+  }
+  const size_t tg_header_size = bit_reader_->byte_offset() - start_offset;
+  return AddTileBuffers(start, end, size, tg_header_size,
+                        bytes_consumed_so_far);
+}
+
+bool ObuParser::ParseHeader() {
+  ObuHeader obu_header;
+  int64_t scratch = bit_reader_->ReadBit();
+  if (scratch != 0) {
+    LIBGAV1_DLOG(ERROR, "forbidden_bit is not zero.");
+    return false;
+  }
+  OBU_READ_LITERAL_OR_FAIL(4);
+  obu_header.type = static_cast<libgav1::ObuType>(scratch);
+  OBU_READ_BIT_OR_FAIL;
+  const bool extension_flag = scratch != 0;
+  OBU_READ_BIT_OR_FAIL;
+  obu_header.has_size_field = scratch != 0;
+  OBU_READ_BIT_OR_FAIL;  // reserved.
+  if (scratch != 0) {
+    LIBGAV1_DLOG(WARNING, "obu_reserved_1bit is not zero.");
+  }
+  obu_header.has_extension = extension_flag;
+  if (extension_flag) {
+    if (extension_disallowed_) {
+      LIBGAV1_DLOG(ERROR,
+                   "OperatingPointIdc is 0, but obu_extension_flag is 1.");
+      return false;
+    }
+    OBU_READ_LITERAL_OR_FAIL(3);
+    obu_header.temporal_id = scratch;
+    OBU_READ_LITERAL_OR_FAIL(2);
+    obu_header.spatial_id = scratch;
+    OBU_READ_LITERAL_OR_FAIL(3);  // reserved.
+    if (scratch != 0) {
+      LIBGAV1_DLOG(WARNING, "extension_header_reserved_3bits is not zero.");
+    }
+  } else {
+    obu_header.temporal_id = 0;
+    obu_header.spatial_id = 0;
+  }
+  return obu_headers_.push_back(obu_header);
+}
+
+#undef OBU_READ_UVLC_OR_FAIL
+#undef OBU_READ_LITERAL_OR_FAIL
+#undef OBU_READ_BIT_OR_FAIL
+#undef OBU_PARSER_FAIL
+#undef OBU_LOG_AND_RETURN_FALSE
+
+bool ObuParser::InitBitReader(const uint8_t* const data, size_t size) {
+  bit_reader_.reset(new (std::nothrow) RawBitReader(data, size));
+  return bit_reader_ != nullptr;
+}
+
+bool ObuParser::EnsureCurrentFrameIsNotNull() {
+  if (current_frame_ != nullptr) return true;
+  current_frame_ = buffer_pool_->GetFreeBuffer();
+  if (current_frame_ == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
+    return false;
+  }
+  return true;
+}
+
+bool ObuParser::HasData() const { return size_ > 0; }
+
+StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) {
+  if (data_ == nullptr || size_ == 0) return kStatusInvalidArgument;
+
+  assert(current_frame_ == nullptr);
+  // This is used to release any references held in case of parsing failure.
+  RefCountedBufferPtrCleanup current_frame_cleanup(&current_frame_);
+
+  const uint8_t* data = data_;
+  size_t size = size_;
+
+  // Clear everything except the sequence header.
+  obu_headers_.clear();
+  frame_header_ = {};
+  tile_buffers_.clear();
+  next_tile_group_start_ = 0;
+  sequence_header_changed_ = false;
+
+  bool parsed_one_full_frame = false;
+  bool seen_frame_header = false;
+  const uint8_t* frame_header = nullptr;
+  size_t frame_header_size_in_bits = 0;
+  while (size > 0 && !parsed_one_full_frame) {
+    if (!InitBitReader(data, size)) {
+      LIBGAV1_DLOG(ERROR, "Failed to initialize bit reader.");
+      return kStatusOutOfMemory;
+    }
+    if (!ParseHeader()) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse OBU Header.");
+      return kStatusBitstreamError;
+    }
+    const ObuHeader& obu_header = obu_headers_.back();
+    if (!obu_header.has_size_field) {
+      LIBGAV1_DLOG(
+          ERROR,
+          "has_size_field is zero. libgav1 does not support such streams.");
+      return kStatusUnimplemented;
+    }
+    const size_t obu_header_size = bit_reader_->byte_offset();
+    size_t obu_size;
+    if (!bit_reader_->ReadUnsignedLeb128(&obu_size)) {
+      LIBGAV1_DLOG(ERROR, "Could not read OBU size.");
+      return kStatusBitstreamError;
+    }
+    const size_t obu_length_size = bit_reader_->byte_offset() - obu_header_size;
+    if (size - bit_reader_->byte_offset() < obu_size) {
+      LIBGAV1_DLOG(ERROR, "Not enough bits left to parse OBU %zu vs %zu.",
+                   size - bit_reader_->bit_offset(), obu_size);
+      return kStatusBitstreamError;
+    }
+
+    const ObuType obu_type = obu_header.type;
+    if (obu_type != kObuSequenceHeader && obu_type != kObuTemporalDelimiter &&
+        has_sequence_header_ &&
+        sequence_header_.operating_point_idc[operating_point_] != 0 &&
+        obu_header.has_extension &&
+        (!InTemporalLayer(
+             sequence_header_.operating_point_idc[operating_point_],
+             obu_header.temporal_id) ||
+         !InSpatialLayer(sequence_header_.operating_point_idc[operating_point_],
+                         obu_header.spatial_id))) {
+      obu_headers_.pop_back();
+      bit_reader_->SkipBytes(obu_size);
+      data += bit_reader_->byte_offset();
+      size -= bit_reader_->byte_offset();
+      continue;
+    }
+
+    const size_t obu_start_position = bit_reader_->bit_offset();
+    // The bit_reader_ is byte aligned after reading obu_header and obu_size.
+    // Therefore the byte offset can be computed as obu_start_position >> 3
+    // below.
+    assert((obu_start_position & 7) == 0);
+    bool obu_skipped = false;
+    switch (obu_type) {
+      case kObuTemporalDelimiter:
+        break;
+      case kObuSequenceHeader:
+        if (!ParseSequenceHeader(seen_frame_header)) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse SequenceHeader OBU.");
+          return kStatusBitstreamError;
+        }
+        if (sequence_header_.color_config.bitdepth > LIBGAV1_MAX_BITDEPTH) {
+          LIBGAV1_DLOG(
+              ERROR,
+              "Bitdepth %d is not supported. The maximum bitdepth is %d.",
+              sequence_header_.color_config.bitdepth, LIBGAV1_MAX_BITDEPTH);
+          return kStatusUnimplemented;
+        }
+        break;
+      case kObuFrameHeader:
+        if (seen_frame_header) {
+          LIBGAV1_DLOG(ERROR,
+                       "Frame header found but frame header was already seen.");
+          return kStatusBitstreamError;
+        }
+        if (!ParseFrameHeader()) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader OBU.");
+          return kStatusBitstreamError;
+        }
+        frame_header = &data[obu_start_position >> 3];
+        frame_header_size_in_bits =
+            bit_reader_->bit_offset() - obu_start_position;
+        seen_frame_header = true;
+        parsed_one_full_frame = frame_header_.show_existing_frame;
+        break;
+      case kObuRedundantFrameHeader: {
+        if (!seen_frame_header) {
+          LIBGAV1_DLOG(ERROR,
+                       "Redundant frame header found but frame header was not "
+                       "yet seen.");
+          return kStatusBitstreamError;
+        }
+        const size_t fh_size = (frame_header_size_in_bits + 7) >> 3;
+        if (obu_size < fh_size ||
+            memcmp(frame_header, &data[obu_start_position >> 3], fh_size) !=
+                0) {
+          LIBGAV1_DLOG(ERROR,
+                       "Redundant frame header differs from frame header.");
+          return kStatusBitstreamError;
+        }
+        bit_reader_->SkipBits(frame_header_size_in_bits);
+        break;
+      }
+      case kObuFrame: {
+        const size_t fh_start_offset = bit_reader_->byte_offset();
+        if (seen_frame_header) {
+          LIBGAV1_DLOG(ERROR,
+                       "Frame header found but frame header was already seen.");
+          return kStatusBitstreamError;
+        }
+        if (!ParseFrameHeader()) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader in Frame OBU.");
+          return kStatusBitstreamError;
+        }
+        // Section 6.8.2: If obu_type is equal to OBU_FRAME, it is a
+        // requirement of bitstream conformance that show_existing_frame is
+        // equal to 0.
+        if (frame_header_.show_existing_frame) {
+          LIBGAV1_DLOG(ERROR, "Frame OBU cannot set show_existing_frame to 1.");
+          return kStatusBitstreamError;
+        }
+        if (!bit_reader_->AlignToNextByte()) {
+          LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+          return kStatusBitstreamError;
+        }
+        const size_t fh_size = bit_reader_->byte_offset() - fh_start_offset;
+        if (fh_size >= obu_size) {
+          LIBGAV1_DLOG(ERROR, "Frame header size (%zu) >= obu_size (%zu).",
+                       fh_size, obu_size);
+          return kStatusBitstreamError;
+        }
+        if (!ParseTileGroup(obu_size - fh_size,
+                            size_ - size + bit_reader_->byte_offset())) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup in Frame OBU.");
+          return kStatusBitstreamError;
+        }
+        parsed_one_full_frame = true;
+        break;
+      }
+      case kObuTileGroup:
+        if (!ParseTileGroup(obu_size,
+                            size_ - size + bit_reader_->byte_offset())) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup OBU.");
+          return kStatusBitstreamError;
+        }
+        parsed_one_full_frame =
+            (next_tile_group_start_ == frame_header_.tile_info.tile_count);
+        break;
+      case kObuTileList:
+        LIBGAV1_DLOG(ERROR, "Decoding of tile list OBUs is not supported.");
+        return kStatusUnimplemented;
+      case kObuPadding:
+        if (!ParsePadding(&data[obu_start_position >> 3], obu_size)) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse Padding OBU.");
+          return kStatusBitstreamError;
+        }
+        break;
+      case kObuMetadata:
+        if (!ParseMetadata(&data[obu_start_position >> 3], obu_size)) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse Metadata OBU.");
+          return kStatusBitstreamError;
+        }
+        break;
+      default:
+        // Skip reserved OBUs. Section 6.2.2: Reserved units are for future
+        // use and shall be ignored by AV1 decoders.
+        bit_reader_->SkipBytes(obu_size);
+        obu_skipped = true;
+        break;
+    }
+    if (obu_size > 0 && !obu_skipped && obu_type != kObuFrame &&
+        obu_type != kObuTileGroup) {
+      const size_t parsed_obu_size_in_bits =
+          bit_reader_->bit_offset() - obu_start_position;
+      if (obu_size * 8 < parsed_obu_size_in_bits) {
+        LIBGAV1_DLOG(
+            ERROR,
+            "Parsed OBU size (%zu bits) is greater than expected OBU size "
+            "(%zu bytes) for obu_type: %d.",
+            parsed_obu_size_in_bits, obu_size, obu_type);
+        return kStatusBitstreamError;
+      }
+      if (!bit_reader_->VerifyAndSkipTrailingBits(obu_size * 8 -
+                                                  parsed_obu_size_in_bits)) {
+        LIBGAV1_DLOG(ERROR,
+                     "Error when verifying trailing bits for obu type: %d",
+                     obu_type);
+        return kStatusBitstreamError;
+      }
+    }
+    const size_t bytes_consumed = bit_reader_->byte_offset();
+    const size_t consumed_obu_size =
+        bytes_consumed - obu_length_size - obu_header_size;
+    if (consumed_obu_size != obu_size) {
+      LIBGAV1_DLOG(ERROR,
+                   "OBU size (%zu) and consumed size (%zu) do not match for "
+                   "obu_type: %d.",
+                   obu_size, consumed_obu_size, obu_type);
+      return kStatusBitstreamError;
+    }
+    data += bytes_consumed;
+    size -= bytes_consumed;
+  }
+  if (!parsed_one_full_frame && seen_frame_header) {
+    LIBGAV1_DLOG(ERROR, "The last tile group in the frame was not received.");
+    return kStatusBitstreamError;
+  }
+  data_ = data;
+  size_ = size;
+  *current_frame = std::move(current_frame_);
+  return kStatusOk;
+}
+
+// AV1CodecConfigurationBox specification:
+// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox.
+// static
+std::unique_ptr<uint8_t[]> ObuParser::GetAV1CodecConfigurationBox(
+    const uint8_t* data, size_t size, size_t* const av1c_size) {
+  if (data == nullptr || av1c_size == nullptr) return nullptr;
+
+  ObuSequenceHeader sequence_header;
+  size_t sequence_header_offset;
+  size_t sequence_header_size;
+  const StatusCode status =
+      ParseBasicStreamInfo(data, size, &sequence_header,
+                           &sequence_header_offset, &sequence_header_size);
+  if (status != kStatusOk) {
+    *av1c_size = 0;
+    return nullptr;
+  }
+
+  *av1c_size = 4 + sequence_header_size;
+  std::unique_ptr<uint8_t[]> av1c_ptr(new (std::nothrow) uint8_t[*av1c_size]);
+  if (av1c_ptr == nullptr) {
+    *av1c_size = 0;
+    return nullptr;
+  }
+  uint8_t* av1c = av1c_ptr.get();
+  // unsigned int (1) marker = 1;
+  // unsigned int (7) version = 1;
+  av1c[0] = 0x81;
+
+  // unsigned int (3) seq_profile;
+  // unsigned int (5) seq_level_idx_0;
+  const uint8_t seq_level_idx_0 = ((sequence_header.level[0].major - 2) << 2) |
+                                  sequence_header.level[0].minor;
+  av1c[1] = (sequence_header.profile << 5) | seq_level_idx_0;
+
+  // unsigned int (1) seq_tier_0;
+  // unsigned int (1) high_bitdepth;
+  // unsigned int (1) twelve_bit;
+  // unsigned int (1) monochrome;
+  // unsigned int (1) chroma_subsampling_x;
+  // unsigned int (1) chroma_subsampling_y;
+  // unsigned int (2) chroma_sample_position;
+  const auto high_bitdepth =
+      static_cast<uint8_t>(sequence_header.color_config.bitdepth > 8);
+  const auto twelve_bit =
+      static_cast<uint8_t>(sequence_header.color_config.bitdepth == 12);
+  av1c[2] =
+      (sequence_header.tier[0] << 7) | (high_bitdepth << 6) |
+      (twelve_bit << 5) |
+      (static_cast<uint8_t>(sequence_header.color_config.is_monochrome) << 4) |
+      (sequence_header.color_config.subsampling_x << 3) |
+      (sequence_header.color_config.subsampling_y << 2) |
+      sequence_header.color_config.chroma_sample_position;
+
+  // unsigned int (3) reserved = 0;
+  // unsigned int (1) initial_presentation_delay_present;
+  // if (initial_presentation_delay_present) {
+  //   unsigned int (4) initial_presentation_delay_minus_one;
+  // } else {
+  //   unsigned int (4) reserved = 0;
+  // }
+  av1c[3] = 0;
+
+  // unsigned int (8) configOBUs[];
+  memcpy(av1c + 4, data + sequence_header_offset, sequence_header_size);
+
+  return av1c_ptr;
+}
+
+// static
+StatusCode ObuParser::ParseBasicStreamInfo(const uint8_t* data, size_t size,
+                                           ObuSequenceHeader* sequence_header,
+                                           size_t* sequence_header_offset,
+                                           size_t* sequence_header_size) {
+  DecoderState state;
+  ObuParser parser(nullptr, 0, 0, nullptr, &state);
+  if (!parser.InitBitReader(data, size)) {
+    LIBGAV1_DLOG(ERROR, "Failed to initialize bit reader.");
+    return kStatusOutOfMemory;
+  }
+  while (!parser.bit_reader_->Finished()) {
+    const size_t obu_start_offset = parser.bit_reader_->byte_offset();
+    if (!parser.ParseHeader()) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse OBU Header.");
+      return kStatusBitstreamError;
+    }
+    const ObuHeader& obu_header = parser.obu_headers_.back();
+    if (!obu_header.has_size_field) {
+      LIBGAV1_DLOG(
+          ERROR,
+          "has_size_field is zero. libgav1 does not support such streams.");
+      return kStatusUnimplemented;
+    }
+    size_t obu_size;
+    if (!parser.bit_reader_->ReadUnsignedLeb128(&obu_size)) {
+      LIBGAV1_DLOG(ERROR, "Could not read OBU size.");
+      return kStatusBitstreamError;
+    }
+    if (size - parser.bit_reader_->byte_offset() < obu_size) {
+      LIBGAV1_DLOG(ERROR, "Not enough bits left to parse OBU %zu vs %zu.",
+                   size - parser.bit_reader_->bit_offset(), obu_size);
+      return kStatusBitstreamError;
+    }
+    if (obu_header.type != kObuSequenceHeader) {
+      parser.obu_headers_.pop_back();
+      parser.bit_reader_->SkipBytes(obu_size);
+      continue;
+    }
+    const size_t obu_start_position = parser.bit_reader_->bit_offset();
+    if (!parser.ParseSequenceHeader(false)) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse SequenceHeader OBU.");
+      return kStatusBitstreamError;
+    }
+    const size_t parsed_obu_size_in_bits =
+        parser.bit_reader_->bit_offset() - obu_start_position;
+    const uint64_t obu_size_in_bits = static_cast<uint64_t>(obu_size) * 8;
+    if (obu_size_in_bits < parsed_obu_size_in_bits) {
+      LIBGAV1_DLOG(
+          ERROR,
+          "Parsed OBU size (%zu bits) is greater than expected OBU size "
+          "(%zu bytes).",
+          parsed_obu_size_in_bits, obu_size);
+      return kStatusBitstreamError;
+    }
+    if (!parser.bit_reader_->VerifyAndSkipTrailingBits(
+            static_cast<size_t>(obu_size_in_bits - parsed_obu_size_in_bits))) {
+      LIBGAV1_DLOG(
+          ERROR, "Error when verifying trailing bits for the sequence header.");
+      return kStatusBitstreamError;
+    }
+    *sequence_header = parser.sequence_header_;
+    *sequence_header_offset = obu_start_offset;
+    *sequence_header_size =
+        parser.bit_reader_->byte_offset() - obu_start_offset;
+    return kStatusOk;
+  }
+  // Sequence header was never found.
+  return kStatusBitstreamError;
+}
+
+}  // namespace libgav1
diff --git a/src/obu_parser.h b/src/obu_parser.h
new file mode 100644 (file)
index 0000000..594e86b
--- /dev/null
@@ -0,0 +1,412 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_OBU_PARSER_H_
+#define LIBGAV1_SRC_OBU_PARSER_H_
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/status_code.h"
+#include "src/quantizer.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+// Structs and enums related to Open Bitstream Units (OBUs).
+
+enum {
+  kMinimumMajorBitstreamLevel = 2,
+  kSelectScreenContentTools = 2,
+  kSelectIntegerMv = 2,
+  kLoopRestorationTileSizeMax = 256,
+  kGlobalMotionAlphaBits = 12,
+  kGlobalMotionTranslationBits = 12,
+  kGlobalMotionTranslationOnlyBits = 9,
+  kGlobalMotionAlphaPrecisionBits = 15,
+  kGlobalMotionTranslationPrecisionBits = 6,
+  kGlobalMotionTranslationOnlyPrecisionBits = 3,
+  kMaxTileWidth = 4096,
+  kMaxTileArea = 4096 * 2304,
+  kPrimaryReferenceNone = 7,
+  // A special value of the scalability_mode_idc syntax element that indicates
+  // the picture prediction structure is specified in scalability_structure().
+  kScalabilitySS = 14
+};  // anonymous enum
+
+struct ObuHeader {
+  ObuType type;
+  bool has_extension;
+  bool has_size_field;
+  int8_t temporal_id;
+  int8_t spatial_id;
+};
+
+enum BitstreamProfile : uint8_t {
+  kProfile0,
+  kProfile1,
+  kProfile2,
+  kMaxProfiles
+};
+
+// In the bitstream the level is encoded in five bits: the first three bits
+// encode |major| - 2 and the last two bits encode |minor|.
+//
+// If the mapped level (major.minor) is in the tables in Annex A.3, there are
+// bitstream conformance requirements on the maximum or minimum values of
+// several variables. The encoded value of 31 (which corresponds to the mapped
+// level 9.3) is the "maximum parameters" level and imposes no level-based
+// constraints on the bitstream.
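+//
+// For example, level 5.1 (major = 5, minor = 1) is encoded as
+// ((5 - 2) << 2) | 1 = 13, and the encoded value 31 decodes to
+// major = (31 >> 2) + 2 = 9 and minor = 31 & 3 = 3, i.e. level 9.3.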
+struct BitStreamLevel {
+  uint8_t major;  // Range: 2-9.
+  uint8_t minor;  // Range: 0-3.
+};
+
+struct ColorConfig {
+  int8_t bitdepth;
+  bool is_monochrome;
+  ColorPrimary color_primary;
+  TransferCharacteristics transfer_characteristics;
+  MatrixCoefficients matrix_coefficients;
+  // A binary value (0 or 1) that is associated with the VideoFullRangeFlag
+  // variable specified in ISO/IEC 23091-4/ITU-T H.273.
+  // * 0: the studio swing representation.
+  // * 1: the full swing representation.
+  ColorRange color_range;
+  int8_t subsampling_x;
+  int8_t subsampling_y;
+  ChromaSamplePosition chroma_sample_position;
+  bool separate_uv_delta_q;
+};
+
+struct TimingInfo {
+  uint32_t num_units_in_tick;
+  uint32_t time_scale;
+  bool equal_picture_interval;
+  uint32_t num_ticks_per_picture;
+};
+
+struct DecoderModelInfo {
+  uint8_t encoder_decoder_buffer_delay_length;
+  uint32_t num_units_in_decoding_tick;
+  uint8_t buffer_removal_time_length;
+  uint8_t frame_presentation_time_length;
+};
+
+struct OperatingParameters {
+  uint32_t decoder_buffer_delay[kMaxOperatingPoints];
+  uint32_t encoder_buffer_delay[kMaxOperatingPoints];
+  bool low_delay_mode_flag[kMaxOperatingPoints];
+};
+
+struct ObuSequenceHeader {
+  // Section 7.5:
+  //   Within a particular coded video sequence, the contents of
+  //   sequence_header_obu must be bit-identical each time the sequence header
+  //   appears except for the contents of operating_parameters_info. A new
+  //   coded video sequence is required if the sequence header parameters
+  //   change.
+  //
+  // IMPORTANT: ParametersChanged() is implemented with a memcmp() call. For
+  // this to work, this object and the |old| object must be initialized with
+  // an empty brace-enclosed list, which initializes any padding to zero bits.
+  // See https://en.cppreference.com/w/cpp/language/zero_initialization.
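+  //
+  // A minimal sketch of the intended comparison (note the {} initializers):
+  //   ObuSequenceHeader old_header = {};
+  //   ObuSequenceHeader new_header = {};
+  //   // (populate both headers from the bitstream)
+  //   const bool changed = new_header.ParametersChanged(old_header);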
+  bool ParametersChanged(const ObuSequenceHeader& old) const;
+
+  BitstreamProfile profile;
+  bool still_picture;
+  bool reduced_still_picture_header;
+  int operating_points;
+  int operating_point_idc[kMaxOperatingPoints];
+  BitStreamLevel level[kMaxOperatingPoints];
+  int8_t tier[kMaxOperatingPoints];
+  int8_t frame_width_bits;
+  int8_t frame_height_bits;
+  int32_t max_frame_width;
+  int32_t max_frame_height;
+  bool frame_id_numbers_present;
+  int8_t frame_id_length_bits;
+  int8_t delta_frame_id_length_bits;
+  bool use_128x128_superblock;
+  bool enable_filter_intra;
+  bool enable_intra_edge_filter;
+  bool enable_interintra_compound;
+  bool enable_masked_compound;
+  bool enable_warped_motion;
+  bool enable_dual_filter;
+  bool enable_order_hint;
+  // If enable_order_hint is true, order_hint_bits is in the range [1, 8].
+  // If enable_order_hint is false, order_hint_bits is 0.
+  int8_t order_hint_bits;
+  // order_hint_shift_bits equals (32 - order_hint_bits) % 32.
+  // This is used frequently in GetRelativeDistance().
+  uint8_t order_hint_shift_bits;
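+  // For example, with order_hint_bits = 7, order_hint_shift_bits is
+  // (32 - 7) % 32 = 25.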
+  bool enable_jnt_comp;
+  bool enable_ref_frame_mvs;
+  bool choose_screen_content_tools;
+  int8_t force_screen_content_tools;
+  bool choose_integer_mv;
+  int8_t force_integer_mv;
+  bool enable_superres;
+  bool enable_cdef;
+  bool enable_restoration;
+  ColorConfig color_config;
+  bool timing_info_present_flag;
+  TimingInfo timing_info;
+  bool decoder_model_info_present_flag;
+  DecoderModelInfo decoder_model_info;
+  bool decoder_model_present_for_operating_point[kMaxOperatingPoints];
+  bool initial_display_delay_present_flag;
+  uint8_t initial_display_delay[kMaxOperatingPoints];
+  bool film_grain_params_present;
+
+  // IMPORTANT: the operating_parameters member must be at the end of the
+  // struct so that ParametersChanged() can be implemented with a memcmp()
+  // call.
+  OperatingParameters operating_parameters;
+};
+// Verify it is safe to use offsetof with ObuSequenceHeader and to use memcmp
+// to compare two ObuSequenceHeader objects.
+static_assert(std::is_standard_layout<ObuSequenceHeader>::value, "");
+// Verify operating_parameters is the last member of ObuSequenceHeader. The
+// second assertion assumes that ObuSequenceHeader has no padding after the
+// operating_parameters field. The first assertion is a sufficient condition
+// for ObuSequenceHeader to have no padding after the operating_parameters
+// field.
+static_assert(alignof(ObuSequenceHeader) == alignof(OperatingParameters), "");
+static_assert(sizeof(ObuSequenceHeader) ==
+                  offsetof(ObuSequenceHeader, operating_parameters) +
+                      sizeof(OperatingParameters),
+              "");
+
+struct TileBuffer {
+  const uint8_t* data;
+  size_t size;
+};
+
+enum MetadataType : uint8_t {
+  // 0 is reserved for AOM use.
+  kMetadataTypeHdrContentLightLevel = 1,
+  kMetadataTypeHdrMasteringDisplayColorVolume = 2,
+  kMetadataTypeScalability = 3,
+  kMetadataTypeItutT35 = 4,
+  kMetadataTypeTimecode = 5,
+  // 6-31 are unregistered user private.
+  // 32 and greater are reserved for AOM use.
+};
+
+class ObuParser : public Allocable {
+ public:
+  ObuParser(const uint8_t* const data, size_t size, int operating_point,
+            BufferPool* const buffer_pool, DecoderState* const decoder_state)
+      : data_(data),
+        size_(size),
+        operating_point_(operating_point),
+        buffer_pool_(buffer_pool),
+        decoder_state_(*decoder_state) {}
+
+  // Not copyable or movable.
+  ObuParser(const ObuParser& rhs) = delete;
+  ObuParser& operator=(const ObuParser& rhs) = delete;
+
+  // Returns true if there is more data that needs to be parsed.
+  bool HasData() const;
+
+  // Parses a sequence of Open Bitstream Units until a decodable frame is found
+  // (or until the end of stream is reached). A decodable frame is considered to
+  // be found when one of the following happens:
+  //   * A kObuFrame is seen.
+  //   * The kObuTileGroup containing the last tile is seen.
+  //   * A kObuFrameHeader with show_existing_frame = true is seen.
+  //
+  // Returns kStatusOk on success, an error status otherwise. The parsed
+  // fields are valid only if the return value is kStatusOk, in which case
+  // |current_frame| will also be populated with a valid frame buffer.
+  StatusCode ParseOneFrame(RefCountedBufferPtr* current_frame);
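+  //
+  // A sketch of the typical calling pattern (error handling elided; |parser|
+  // is an ObuParser constructed over the input stream):
+  //   RefCountedBufferPtr current_frame;
+  //   while (parser.HasData()) {
+  //     if (parser.ParseOneFrame(&current_frame) != kStatusOk) break;
+  //     // frame_header(), tile_buffers(), etc. are now valid for this frame.
+  //   }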
+
+  // Get the AV1CodecConfigurationBox as described in
+  // https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox. This
+  // does minimal bitstream parsing to obtain the necessary information to
+  // generate the av1c box. Returns a std::unique_ptr that contains the av1c
+  // data on success, nullptr otherwise. |av1c_size| must not be nullptr and
+  // will contain the size of the buffer pointed to by the std::unique_ptr.
+  static std::unique_ptr<uint8_t[]> GetAV1CodecConfigurationBox(
+      const uint8_t* data, size_t size, size_t* av1c_size);
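+  //
+  // A minimal usage sketch (assuming |data| points to |size| bytes of a raw
+  // AV1 bitstream containing a sequence header):
+  //   size_t av1c_size;
+  //   const std::unique_ptr<uint8_t[]> av1c =
+  //       ObuParser::GetAV1CodecConfigurationBox(data, size, &av1c_size);
+  //   if (av1c != nullptr) {
+  //     // |av1c_size| bytes starting at av1c.get() hold the box payload.
+  //   }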
+
+  // Getters. Only valid if ParseOneFrame() completes successfully.
+  const Vector<ObuHeader>& obu_headers() const { return obu_headers_; }
+  const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+  const ObuFrameHeader& frame_header() const { return frame_header_; }
+  const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; }
+  // Returns true if the last call to ParseOneFrame() encountered a sequence
+  // header change.
+  bool sequence_header_changed() const { return sequence_header_changed_; }
+
+  // Setters.
+  void set_sequence_header(const ObuSequenceHeader& sequence_header) {
+    sequence_header_ = sequence_header;
+    has_sequence_header_ = true;
+  }
+
+  // Moves |tile_buffers_| into |tile_buffers|.
+  void MoveTileBuffers(Vector<TileBuffer>* tile_buffers) {
+    *tile_buffers = std::move(tile_buffers_);
+  }
+
+ private:
+  // Initializes the bit reader. This is a separate function to make unit
+  // testing of private functions simpler.
+  LIBGAV1_MUST_USE_RESULT bool InitBitReader(const uint8_t* data, size_t size);
+
+  // Parse helper functions.
+  bool ParseHeader();  // 5.3.2 and 5.3.3.
+  bool ParseColorConfig(ObuSequenceHeader* sequence_header);       // 5.5.2.
+  bool ParseTimingInfo(ObuSequenceHeader* sequence_header);        // 5.5.3.
+  bool ParseDecoderModelInfo(ObuSequenceHeader* sequence_header);  // 5.5.4.
+  bool ParseOperatingParameters(ObuSequenceHeader* sequence_header,
+                                int index);          // 5.5.5.
+  bool ParseSequenceHeader(bool seen_frame_header);  // 5.5.1.
+  bool ParseFrameParameters();                       // 5.9.2, 5.9.7 and 5.9.10.
+  void MarkInvalidReferenceFrames();                 // 5.9.4.
+  bool ParseFrameSizeAndRenderSize();                // 5.9.5 and 5.9.6.
+  bool ParseSuperResParametersAndComputeImageSize();  // 5.9.8 and 5.9.9.
+  // Checks the bitstream conformance requirement in Section 6.8.6.
+  bool ValidateInterFrameSize() const;
+  bool ParseReferenceOrderHint();
+  static int FindLatestBackwardReference(
+      const int current_frame_hint,
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+      const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+  static int FindEarliestBackwardReference(
+      const int current_frame_hint,
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+      const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+  static int FindLatestForwardReference(
+      const int current_frame_hint,
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+      const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+  static int FindReferenceWithSmallestOutputOrder(
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints);
+  bool SetFrameReferences(int8_t last_frame_idx,
+                          int8_t gold_frame_idx);  // 7.8.
+  bool ParseLoopFilterParameters();                // 5.9.11.
+  bool ParseDeltaQuantizer(int8_t* delta);         // 5.9.13.
+  bool ParseQuantizerParameters();                 // 5.9.12.
+  bool ParseSegmentationParameters();              // 5.9.14.
+  bool ParseQuantizerIndexDeltaParameters();       // 5.9.17.
+  bool ParseLoopFilterDeltaParameters();           // 5.9.18.
+  void ComputeSegmentLosslessAndQIndex();
+  bool ParseCdefParameters();             // 5.9.19.
+  bool ParseLoopRestorationParameters();  // 5.9.20.
+  bool ParseTxModeSyntax();               // 5.9.21.
+  bool ParseFrameReferenceModeSyntax();   // 5.9.23.
+  // Returns whether skip mode is allowed. When it returns true, it also sets
+  // the frame_header_.skip_mode_frame array.
+  bool IsSkipModeAllowed();
+  bool ParseSkipModeParameters();  // 5.9.22.
+  bool ReadAllowWarpedMotion();
+  bool ParseGlobalParamSyntax(
+      int ref, int index,
+      const std::array<GlobalMotion, kNumReferenceFrameTypes>&
+          prev_global_motions);        // 5.9.25.
+  bool ParseGlobalMotionParameters();  // 5.9.24.
+  bool ParseFilmGrainParameters();     // 5.9.30.
+  bool ParseTileInfoSyntax();          // 5.9.15.
+  bool ParseFrameHeader();             // 5.9.
+  // |data| and |size| specify the payload data of the padding OBU.
+  // NOTE: Although the payload data is available in the bit_reader_ member,
+  // it is also passed to ParsePadding() as function parameters so that
+  // ParsePadding() can find the trailing bit of the OBU and skip over the
+  // payload data as an opaque chunk of data.
+  bool ParsePadding(const uint8_t* data, size_t size);  // 5.7.
+  bool ParseMetadataScalability();                      // 5.8.5 and 5.8.6.
+  bool ParseMetadataTimecode();                         // 5.8.7.
+  // |data| and |size| specify the payload data of the metadata OBU.
+  // NOTE: Although the payload data is available in the bit_reader_ member,
+  // it is also passed to ParseMetadata() as function parameters so that
+  // ParseMetadata() can find the trailing bit of the OBU and either extract
+  // or skip over the payload data as an opaque chunk of data.
+  bool ParseMetadata(const uint8_t* data, size_t size);  // 5.8.
+  // Adds and populates the TileBuffer for each tile in the tile group and
+  // updates |next_tile_group_start_|.
+  bool AddTileBuffers(int start, int end, size_t total_size,
+                      size_t tg_header_size, size_t bytes_consumed_so_far);
+  bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far);  // 5.11.1.
+
+  // Populates |current_frame_| from the |buffer_pool_| if |current_frame_| is
+  // nullptr. Does not do anything otherwise. Returns true on success, false
+  // otherwise.
+  bool EnsureCurrentFrameIsNotNull();
+
+  // Parses the basic bitstream information from the given AV1 stream in |data|.
+  // This is used for generating the AV1CodecConfigurationBox.
+  static StatusCode ParseBasicStreamInfo(const uint8_t* data, size_t size,
+                                         ObuSequenceHeader* sequence_header,
+                                         size_t* sequence_header_offset,
+                                         size_t* sequence_header_size);
+
+  // Parser elements.
+  std::unique_ptr<RawBitReader> bit_reader_;
+  const uint8_t* data_;
+  size_t size_;
+  const int operating_point_;
+
+  // OBU elements. Only valid if ParseOneFrame() completes successfully.
+  Vector<ObuHeader> obu_headers_;
+  ObuSequenceHeader sequence_header_ = {};
+  ObuFrameHeader frame_header_ = {};
+  Vector<TileBuffer> tile_buffers_;
+  // The expected starting tile number of the next Tile Group.
+  int next_tile_group_start_ = 0;
+  // If true, the sequence_header_ field is valid.
+  bool has_sequence_header_ = false;
+  // If true, it means that the last call to ParseOneFrame() encountered a
+  // sequence header change.
+  bool sequence_header_changed_ = false;
+  // If true, the obu_extension_flag syntax element in the OBU header must be
+  // 0. Set to true when parsing a sequence header if OperatingPointIdc is 0.
+  bool extension_disallowed_ = false;
+
+  BufferPool* const buffer_pool_;
+  DecoderState& decoder_state_;
+  // Used by ParseOneFrame() to populate the current frame that is being
+  // decoded. The invariant maintained is that this variable will be nullptr at
+  // the beginning and at the end of each call to ParseOneFrame(). This ensures
+  // that the ObuParser is not holding on to any references to the current
+  // frame once the ParseOneFrame() call is complete.
+  RefCountedBufferPtr current_frame_;
+
+  // For unit testing private functions.
+  friend class ObuParserTest;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_OBU_PARSER_H_
diff --git a/src/obu_parser_test.cc b/src/obu_parser_test.cc
new file mode 100644 (file)
index 0000000..a471037
--- /dev/null
@@ -0,0 +1,2677 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/obu_parser.h"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/decoder_state.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/status_code.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+// Note that the following test classes access private functions/members of
+// ObuParser. To be declared friends of ObuParser they must not have internal
+// linkage (i.e., they must be outside the anonymous namespace).
+namespace libgav1 {
+
+// Helper class to manipulate individual bits and generate a byte string.
+class BytesAndBits {
+ public:
+  // Append a bit to the end.
+  void AppendBit(uint8_t bit) { bits_.push_back(bit != 0); }
+
+  // Append a byte to the end.
+  void AppendByte(uint8_t byte) {
+    for (int i = 0; i < 8; ++i) {
+      AppendBit(GetNthBit(byte, i, 8));
+    }
+  }
+
+  // Append a literal of size |bits| to the end.
+  void AppendLiteral(int bits, int value) {
+    InsertLiteral(static_cast<int>(bits_.size()), bits, value);
+  }
+
+  // Append an inverse signed literal to the end. |bits + 1| bits are appended.
+  void AppendInverseSignedLiteral(int bits, int value) {
+    InsertInverseSignedLiteral(static_cast<int>(bits_.size()), bits, value);
+  }
+
+  // Append a sequence of bytes to the end.
+  void AppendBytes(const std::vector<uint8_t>& bytes) {
+    for (const auto& byte : bytes) {
+      AppendByte(byte);
+    }
+  }
+
+  // Insert |bit| at |offset|. Moves all other bits to the right by 1.
+  void InsertBit(int offset, uint8_t bit) {
+    auto iterator = bits_.begin();
+    bits_.insert(iterator + offset, bit != 0);
+  }
+
+  // Insert |value| of size |bits| at offset |offset|. Moves all other bits to
+  // the right by |bits|.
+  void InsertLiteral(int offset, int bits, int value) {
+    for (int i = 0; i < bits; ++i) {
+      InsertBit(i + offset, GetNthBit(value, i, bits));
+    }
+  }
+
+  // Insert |value| of size |bits| at offset |offset| as an inverse signed
+  // literal. Move all other bits to the right by |bits + 1|.
+  //
+  // Note: This is denoted su(1+bits) in the spec.
+  void InsertInverseSignedLiteral(int offset, int bits, int value) {
+    InsertBit(offset, (value >= 0) ? 0 : 1);
+    InsertLiteral(offset + 1, bits, value);
+  }
+
+  // Insert |value| at |offset| as an unsigned variable length code (uvlc).
+  // Return the number of bits inserted.
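+  // For example, |value| 2 is inserted as the three bits 011: one leading
+  // zero, then |value| + 1 = 3 coded in two bits.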
+  int InsertUvlc(int offset, int value) {
+    int leading_zeros = 1;
+    int shift_value = ++value;
+    while ((shift_value >>= 1) != 0) leading_zeros += 2;
+    int bits = 0;
+    InsertLiteral(offset, leading_zeros >> 1, 0);
+    bits += leading_zeros >> 1;
+    InsertLiteral(offset + bits, (leading_zeros + 1) >> 1, value);
+    bits += (leading_zeros + 1) >> 1;
+    return bits;
+  }
+
+  // Set the bit at |offset| to |bit|. The bit should already exist.
+  void SetBit(int offset, uint8_t bit) { bits_[offset] = bit != 0; }
+
+  // Set |bits| starting at |offset| to |value|. The bits should already exist.
+  void SetLiteral(int offset, int bits, int value) {
+    for (int i = 0; i < bits; ++i) {
+      SetBit(offset + i, GetNthBit(value, i, bits));
+    }
+  }
+
+  // Remove the bit at |offset|. Moves all following bits to the left by 1.
+  void RemoveBit(int offset) { RemoveLiteral(offset, 1); }
+
+  // Remove a literal of size |bits| starting at |offset|. Moves all
+  // following bits to the left by |bits|.
+  void RemoveLiteral(int offset, int bits) {
+    bits_.erase(bits_.begin() + offset, bits_.begin() + offset + bits);
+  }
+
+  // Remove all bits after |offset|.
+  void RemoveAllBitsAfter(int offset) {
+    RemoveLiteral(offset, static_cast<int>(bits_.size()) - offset);
+  }
+
+  // Clear all the bits stored.
+  void Clear() { bits_.clear(); }
+
+  // Generate the data vector from the bits. Pads the last byte with zero
+  // bits if necessary.
+  const std::vector<uint8_t>& GenerateData() {
+    data_.clear();
+    for (size_t i = 0; i < bits_.size(); i += 8) {
+      uint8_t byte = 0;
+      for (int j = 0; j < 8; ++j) {
+        const uint8_t bit =
+            ((i + j) < bits_.size()) ? static_cast<uint8_t>(bits_[i + j]) : 0;
+        byte |= bit << (7 - j);
+      }
+      data_.push_back(byte);
+    }
+    return data_;
+  }
+
+ private:
+  // Get the |n|th MSB from |value| with the assumption that |value| has |size|
+  // bits.
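+  // For example, GetNthBit(0b101, 0, 3) returns 1 (the most significant bit)
+  // and GetNthBit(0b101, 1, 3) returns 0.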
+  static uint8_t GetNthBit(int value, int n, int size) {
+    return (value >> (size - n - 1)) & 0x01;
+  }
+
+  std::vector<uint8_t> data_;
+  std::vector<bool> bits_;
+};
+
+class ObuParserTest : public testing::Test {
+ protected:
+  // Constants for unit tests.
+  static constexpr int kFrameWidthBits = 9;
+  static constexpr int kFrameHeightBits = 8;
+  static constexpr int kHeight = 240;
+  static constexpr int kWidth = 426;
+  static constexpr int kRows4x4 = 60;
+  static constexpr int kColumns4x4 = 108;
+  static constexpr int kFrameToShow = 2;
+  static constexpr int kDisplayFrameId = 10;
+  static constexpr int kFrameIdLengthBits = 15;
+  static constexpr int kDeltaFrameIdLengthBits = 14;
+
+  // Bitstreams for testing. These may contain trailing bits, and tests may
+  // have to remove some of them to keep the byte-boundary alignment.
+  const std::vector<uint8_t> kDefaultTemporalDelimiter = {0x12, 0x00};
+  // Bits  Syntax element                  Value
+  // 1     obu_forbidden_bit               0
+  // 4     obu_type                        2 (OBU_TEMPORAL_DELIMITER)
+  // 1     obu_extension_flag              1
+  // 1     obu_has_size_field              1
+  // 1     obu_reserved_1bit               0
+  // 3     temporal_id                     6
+  // 2     spatial_id                      2
+  // 3     extension_header_reserved_3bits 0
+  // 8     obu_size                        0
+  const std::vector<uint8_t> kDefaultTemporalDelimiterWithExtension = {
+      0x16, 0xd0, 0x00};
+  const std::vector<uint8_t> kDefaultHeaderWithoutSizeField = {0x10};
+  // Offset  Bits  Syntax element                     Value
+  // 0       3     seq_profile                        0
+  // 3       1     still_picture                      0
+  // 4       1     reduced_still_picture_header       0
+  // 5       1     timing_info_present_flag           0
+  // 6       1     initial_display_delay_present_flag 0
+  // 7       5     operating_points_cnt_minus_1       0
+  // 12      12    operating_point_idc[ 0 ]           0
+  // 24      5     seq_level_idx[ 0 ]                 0
+  // 29      4     frame_width_bits_minus_1           8
+  // 33      4     frame_height_bits_minus_1          7
+  // 37      9     max_frame_width_minus_1            425
+  // 46      8     max_frame_height_minus_1           239
+  // 54      1     frame_id_numbers_present_flag      0
+  // 55      1     use_128x128_superblock             1
+  // 56      1     enable_filter_intra                1
+  // 57      1     enable_intra_edge_filter           1
+  // 58      1     enable_interintra_compound         1
+  // 59      1     enable_masked_compound             1
+  // 60      1     enable_warped_motion               0
+  // 61      1     enable_dual_filter                 1
+  // 62      1     enable_order_hint                  1
+  // 63      1     enable_jnt_comp                    1
+  // 64      1     enable_ref_frame_mvs               1
+  // 65      1     seq_choose_screen_content_tools    1
+  // 66      1     seq_choose_integer_mv              1
+  // 67      3     order_hint_bits_minus_1            6
+  // 70      1     enable_superres                    0
+  // 71      1     enable_cdef                        1
+  // 72      1     enable_restoration                 1
+  // ...
+  const std::vector<uint8_t> kDefaultSequenceHeader = {
+      0x00, 0x00, 0x00, 0x04, 0x3e, 0xa7, 0xbd, 0xf7, 0xf9, 0x80, 0x40};
+  const std::vector<uint8_t> kDefaultFrameHeaderKeyFrame = {0x10, 0x00};
+  // Bits  Syntax element           Value
+  // 1     show_existing_frame      0
+  // 2     frame_type               2 (kFrameIntraOnly)
+  // 1     show_frame               1
+  // 1     error_resilient_mode     0
+  // 1     disable_cdf_update       0
+  // 1     frame_size_override_flag 0
+  // 8     refresh_frame_flags      4
+  // ...
+  const std::vector<uint8_t> kDefaultFrameHeaderIntraOnlyFrame = {0x50, 0x08,
+                                                                  0x00};
+  // Bits  Syntax element           Value
+  // 1     show_existing_frame      0
+  // 2     frame_type               1 (kFrameInter)
+  // 1     show_frame               1
+  // 1     error_resilient_mode     0
+  // 1     disable_cdf_update       0
+  // 1     frame_size_override_flag 0
+  // 3     primary_ref_frame        1
+  // 8     refresh_frame_flags      4
+  // 3     ref_frame_idx[0]         0
+  // 3     ref_frame_idx[1]         1
+  // 3     ref_frame_idx[2]         2
+  // 3     ref_frame_idx[3]         3
+  // 3     ref_frame_idx[4]         4
+  // 3     ref_frame_idx[5]         5
+  // 3     ref_frame_idx[6]         6
+  // ...
+  const std::vector<uint8_t> kDefaultFrameHeaderInterFrame = {0x30, 0x41, 0x01,
+                                                              0x4e, 0x5c, 0x60};
+  const std::vector<uint8_t> kDefaultGlobalMotionParametersRotZoom = {
+      0xff, 0x50, 0x77, 0x7e, 0x1f, 0xcd};
+  const std::vector<uint8_t> kDefaultGlobalMotionParametersAffine = {
+      0x3f, 0x50, 0x77, 0x7b, 0xbf, 0xa8, 0x3e, 0x1f, 0xcd};
+
+  void SetUp() override {
+    buffer_pool_.reset(new (std::nothrow)
+                           BufferPool(nullptr, nullptr, nullptr, nullptr));
+    ASSERT_NE(buffer_pool_, nullptr);
+  }
+
+  bool Init() {
+    obu_.reset(new (std::nothrow) ObuParser(nullptr, 0, 0, buffer_pool_.get(),
+                                            &decoder_state_));
+    if (obu_ == nullptr) return false;
+    obu_headers_ = &obu_->obu_headers_;
+    obu_frame_header_ = &obu_->frame_header_;
+    obu_sequence_header_ = &obu_->sequence_header_;
+    return true;
+  }
+
+  bool Init(const std::vector<uint8_t>& data, bool init_bit_reader = true) {
+    obu_.reset(new (std::nothrow) ObuParser(
+        data.data(), data.size(), 0, buffer_pool_.get(), &decoder_state_));
+    if (obu_ == nullptr) return false;
+    obu_headers_ = &obu_->obu_headers_;
+    obu_frame_header_ = &obu_->frame_header_;
+    obu_sequence_header_ = &obu_->sequence_header_;
+    return init_bit_reader ? obu_->InitBitReader(data.data(), data.size())
+                           : true;
+  }
+
+  bool Parse(const std::string& input,
+             const ObuSequenceHeader* const sequence_header = nullptr) {
+    std::vector<uint8_t> data(input.begin(), input.end());
+    return Parse(data, sequence_header);
+  }
+
+  bool Parse(const std::vector<uint8_t>& data,
+             const ObuSequenceHeader* const sequence_header = nullptr) {
+    EXPECT_TRUE(Init(data, false));
+    if (sequence_header != nullptr) obu_->set_sequence_header(*sequence_header);
+    return obu_->ParseOneFrame(&current_frame_) == kStatusOk;
+  }
+
+  bool ParseSequenceHeader(const std::vector<uint8_t>& data) {
+    EXPECT_TRUE(Init(data));
+    return obu_->ParseSequenceHeader(/*seen_frame_header=*/false);
+  }
+
+  bool ParseFrameParameters(const std::vector<uint8_t>& data,
+                            bool id_bits_present = false,
+                            int force_screen_content_tools = 0,
+                            int force_integer_mv = 0,
+                            bool enable_superres = false) {
+    EXPECT_TRUE(Init(data));
+    if (id_bits_present) {
+      obu_->sequence_header_.frame_id_numbers_present = true;
+      obu_->sequence_header_.frame_id_length_bits = kFrameIdLengthBits;
+      obu_->sequence_header_.delta_frame_id_length_bits =
+          kDeltaFrameIdLengthBits;
+    }
+    obu_->sequence_header_.force_screen_content_tools =
+        force_screen_content_tools;
+    obu_->sequence_header_.force_integer_mv = force_integer_mv;
+    obu_->sequence_header_.enable_superres = enable_superres;
+    obu_->sequence_header_.frame_width_bits = kFrameWidthBits;
+    obu_->sequence_header_.frame_height_bits = kFrameHeightBits;
+    obu_->sequence_header_.max_frame_width = kWidth;
+    obu_->sequence_header_.max_frame_height = kHeight;
+    return obu_->ParseFrameParameters();
+  }
+
+  bool ParseSegmentationParameters(const std::vector<uint8_t>& data,
+                                   int primary_reference_frame,
+                                   int prev_frame_index) {
+    EXPECT_TRUE(Init(data));
+    obu_->frame_header_.primary_reference_frame = primary_reference_frame;
+    if (primary_reference_frame != kPrimaryReferenceNone) {
+      obu_->frame_header_.reference_frame_index[primary_reference_frame] =
+          prev_frame_index;
+    }
+    return obu_->ParseSegmentationParameters();
+  }
+
+  bool ParseFrameReferenceModeSyntax(const std::vector<uint8_t>& data,
+                                     FrameType frame_type) {
+    EXPECT_TRUE(Init(data));
+    obu_->frame_header_.frame_type = frame_type;
+    return obu_->ParseFrameReferenceModeSyntax();
+  }
+
+  bool ParseGlobalMotionParameters(const std::vector<uint8_t>& data,
+                                   FrameType frame_type) {
+    EXPECT_TRUE(Init(data));
+    obu_->frame_header_.frame_type = frame_type;
+    obu_->frame_header_.primary_reference_frame = kPrimaryReferenceNone;
+    return obu_->ParseGlobalMotionParameters();
+  }
+
+  bool ParseFilmGrainParameters(const std::vector<uint8_t>& data,
+                                const ObuSequenceHeader& sequence_header,
+                                const ObuFrameHeader& frame_header) {
+    EXPECT_TRUE(Init(data));
+    obu_->set_sequence_header(sequence_header);
+    obu_->frame_header_ = frame_header;
+    return obu_->ParseFilmGrainParameters();
+  }
+
+  bool ParseTileInfoSyntax(const std::vector<uint8_t>& data, int columns4x4,
+                           int rows4x4, bool use_128x128_superblock) {
+    EXPECT_TRUE(Init(data));
+    obu_->frame_header_.columns4x4 = columns4x4;
+    obu_->frame_header_.rows4x4 = rows4x4;
+    obu_->sequence_header_.use_128x128_superblock = use_128x128_superblock;
+    return obu_->ParseTileInfoSyntax();
+  }
+
+  bool ParseMetadata(const std::vector<uint8_t>& data) {
+    EXPECT_TRUE(Init(data));
+    return obu_->ParseMetadata(data.data(), data.size());
+  }
+
+  void DefaultSequenceHeader(ObuSequenceHeader* const gold) {
+    memset(gold, 0, sizeof(*gold));
+    gold->profile = kProfile0;
+    gold->level[0].major = kMinimumMajorBitstreamLevel;
+    gold->operating_points = 1;
+    gold->max_frame_width = kWidth;
+    gold->max_frame_height = kHeight;
+    gold->frame_width_bits = kFrameWidthBits;
+    gold->frame_height_bits = kFrameHeightBits;
+    gold->use_128x128_superblock = true;
+    gold->enable_filter_intra = true;
+    gold->enable_intra_edge_filter = true;
+    gold->enable_interintra_compound = true;
+    gold->enable_masked_compound = true;
+    gold->enable_dual_filter = true;
+    gold->enable_order_hint = true;
+    gold->enable_jnt_comp = true;
+    gold->enable_ref_frame_mvs = true;
+    gold->choose_screen_content_tools = true;
+    gold->force_screen_content_tools = 2;
+    gold->choose_integer_mv = true;
+    gold->force_integer_mv = 2;
+    gold->order_hint_bits = 7;
+    gold->enable_cdef = true;
+    gold->enable_restoration = true;
+    gold->color_config.bitdepth = 8;
+    gold->color_config.color_primary = kColorPrimaryUnspecified;
+    gold->color_config.transfer_characteristics =
+        kTransferCharacteristicsUnspecified;
+    gold->color_config.matrix_coefficients = kMatrixCoefficientsUnspecified;
+    gold->color_config.subsampling_x = 1;
+    gold->color_config.subsampling_y = 1;
+  }
+
+  void DefaultFrameHeader(ObuFrameHeader* const gold, FrameType frame_type) {
+    memset(gold, 0, sizeof(*gold));
+    gold->frame_type = frame_type;
+    gold->show_frame = true;
+    gold->showable_frame = (frame_type != kFrameKey);
+    gold->enable_cdf_update = true;
+    gold->width = kWidth;
+    gold->height = kHeight;
+    gold->render_width = kWidth;
+    gold->render_height = kHeight;
+    gold->upscaled_width = kWidth;
+    gold->primary_reference_frame = kPrimaryReferenceNone;
+    gold->enable_frame_end_update_cdf = true;
+    gold->rows4x4 = kRows4x4;
+    gold->columns4x4 = kColumns4x4;
+    if (frame_type == kFrameKey) {
+      gold->refresh_frame_flags = 0xff;
+      gold->error_resilient_mode = true;
+      gold->force_integer_mv = 1;
+    } else if (frame_type == kFrameIntraOnly) {
+      gold->refresh_frame_flags = 4;
+      gold->force_integer_mv = 1;
+    } else if (frame_type == kFrameInter) {
+      gold->refresh_frame_flags = 4;
+      gold->primary_reference_frame = 1;
+      for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+        gold->reference_frame_index[i] = i;
+      }
+      gold->is_motion_mode_switchable = true;
+    }
+  }
+
+  void OverrideFrameSize(BytesAndBits* const data, ObuFrameHeader* const gold,
+                         int flag_offset, int size_offset) {
+    data->SetBit(flag_offset, 1);  // frame_size_override_flag.
+    data->InsertLiteral(size_offset, kFrameWidthBits,
+                        kWidth - 2);  // frame_width_minus_1.
+    data->InsertLiteral(size_offset + kFrameWidthBits, kFrameHeightBits,
+                        kHeight - 2);  // frame_height_minus_1.
+    gold->frame_size_override_flag = true;
+    gold->width = kWidth - 1;
+    gold->height = kHeight - 1;
+    gold->render_width = gold->width;
+    gold->render_height = gold->height;
+    gold->upscaled_width = gold->width;
+  }
+
+  void OverrideRenderSize(BytesAndBits* const data, ObuFrameHeader* const gold,
+                          int flag_offset) {
+    data->SetBit(flag_offset, 1);  // render_and_frame_size_different.
+    data->InsertLiteral(flag_offset + 1, 16,
+                        kWidth - 10);  // render_width_minus_1.
+    data->InsertLiteral(flag_offset + 17, 16,
+                        kHeight - 10);  // render_height_minus_1.
+    gold->render_width = kWidth - 9;
+    gold->render_height = kHeight - 9;
+    gold->render_and_frame_size_different = true;
+  }
+
+  void OverrideSegmentation(BytesAndBits* const data, Segmentation* const gold,
+                            int offset) {
+    gold->update_data = true;
+    data->SetBit(offset++, static_cast<uint8_t>(gold->update_data));
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    gold->segment_id_pre_skip = false;
+    gold->last_active_segment_id = 0;
+    for (int i = 0; i < kMaxSegments; ++i) {
+      for (int j = 0; j < kSegmentFeatureMax; ++j) {
+        gold->feature_enabled[i][j] = static_cast<bool>(rnd.Rand8() & 1);
+        data->InsertBit(offset++,
+                        static_cast<uint8_t>(gold->feature_enabled[i][j]));
+        if (gold->feature_enabled[i][j]) {
+          gold->feature_data[i][j] = rnd(1 << kSegmentationFeatureBits[j]);
+          if (Segmentation::FeatureSigned(static_cast<SegmentFeature>(j))) {
+            if (static_cast<bool>(rnd.Rand8() & 1)) {
+              gold->feature_data[i][j] *= -1;
+            }
+            data->InsertInverseSignedLiteral(
+                offset, kSegmentationFeatureBits[j], gold->feature_data[i][j]);
+            offset += kSegmentationFeatureBits[j] + 1;
+          } else {
+            data->InsertLiteral(offset, kSegmentationFeatureBits[j],
+                                gold->feature_data[i][j]);
+            offset += kSegmentationFeatureBits[j];
+          }
+          gold->last_active_segment_id = i;
+          if (j >= kSegmentFeatureReferenceFrame) {
+            gold->segment_id_pre_skip = true;
+          }
+        }
+      }
+    }
+  }
+
+  void VerifyObuHeader(bool extension) {
+    EXPECT_EQ(obu_->obu_headers().back().temporal_id, extension ? 6 : 0);
+    EXPECT_EQ(obu_->obu_headers().back().spatial_id, extension ? 2 : 0);
+  }
+
+#define OBU_TEST_COMPARE(x) EXPECT_EQ(expected.x, actual.x)
+  void VerifyFrameParameters(const ObuFrameHeader& expected,
+                             bool id_bits_present = false) {
+    const ObuFrameHeader& actual = obu_->frame_header();
+    OBU_TEST_COMPARE(show_existing_frame);
+    if (actual.show_existing_frame) {
+      OBU_TEST_COMPARE(frame_to_show);
+      OBU_TEST_COMPARE(frame_presentation_time);
+      if (id_bits_present) {
+        OBU_TEST_COMPARE(display_frame_id);
+      }
+      return;
+    }
+    OBU_TEST_COMPARE(frame_type);
+    OBU_TEST_COMPARE(show_frame);
+    OBU_TEST_COMPARE(frame_presentation_time);
+    OBU_TEST_COMPARE(showable_frame);
+    OBU_TEST_COMPARE(error_resilient_mode);
+    OBU_TEST_COMPARE(enable_cdf_update);
+    OBU_TEST_COMPARE(current_frame_id);
+    OBU_TEST_COMPARE(frame_size_override_flag);
+    OBU_TEST_COMPARE(order_hint);
+    for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+      OBU_TEST_COMPARE(reference_order_hint[i]);
+    }
+    OBU_TEST_COMPARE(primary_reference_frame);
+    OBU_TEST_COMPARE(width);
+    OBU_TEST_COMPARE(height);
+    OBU_TEST_COMPARE(render_and_frame_size_different);
+    OBU_TEST_COMPARE(render_width);
+    OBU_TEST_COMPARE(render_height);
+    OBU_TEST_COMPARE(upscaled_width);
+    OBU_TEST_COMPARE(coded_lossless);
+    OBU_TEST_COMPARE(upscaled_lossless);
+    OBU_TEST_COMPARE(allow_screen_content_tools);
+    OBU_TEST_COMPARE(is_motion_mode_switchable);
+    OBU_TEST_COMPARE(refresh_frame_flags);
+    OBU_TEST_COMPARE(enable_frame_end_update_cdf);
+    OBU_TEST_COMPARE(force_integer_mv);
+    if (actual.frame_type == kFrameInter) {
+      for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+        OBU_TEST_COMPARE(reference_frame_index[i]);
+      }
+    }
+    OBU_TEST_COMPARE(use_superres);
+    OBU_TEST_COMPARE(rows4x4);
+    OBU_TEST_COMPARE(columns4x4);
+  }
+
+  void VerifyLoopFilterParameters(const LoopFilter& expected) {
+    const LoopFilter& actual = obu_->frame_header().loop_filter;
+    for (int i = 0; i < 4; ++i) {
+      OBU_TEST_COMPARE(level[i]);
+    }
+    OBU_TEST_COMPARE(sharpness);
+    OBU_TEST_COMPARE(delta_enabled);
+    OBU_TEST_COMPARE(delta_update);
+    for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+      OBU_TEST_COMPARE(ref_deltas[i]);
+    }
+    for (int i = 0; i < kLoopFilterMaxModeDeltas; ++i) {
+      OBU_TEST_COMPARE(mode_deltas[i]);
+    }
+  }
+
+  void VerifyQuantizerParameters(const QuantizerParameters& expected) {
+    const QuantizerParameters& actual = obu_->frame_header().quantizer;
+    OBU_TEST_COMPARE(base_index);
+    OBU_TEST_COMPARE(delta_dc[kPlaneY]);
+    OBU_TEST_COMPARE(delta_dc[kPlaneU]);
+    OBU_TEST_COMPARE(delta_dc[kPlaneV]);
+    EXPECT_EQ(0, actual.delta_ac[kPlaneY]);
+    OBU_TEST_COMPARE(delta_ac[kPlaneY]);
+    OBU_TEST_COMPARE(delta_ac[kPlaneU]);
+    OBU_TEST_COMPARE(delta_ac[kPlaneV]);
+    OBU_TEST_COMPARE(use_matrix);
+    OBU_TEST_COMPARE(matrix_level[kPlaneY]);
+    OBU_TEST_COMPARE(matrix_level[kPlaneU]);
+    OBU_TEST_COMPARE(matrix_level[kPlaneV]);
+  }
+
+  void VerifySegmentationParameters(const Segmentation& expected) {
+    const Segmentation& actual = obu_->frame_header().segmentation;
+    OBU_TEST_COMPARE(enabled);
+    OBU_TEST_COMPARE(update_map);
+    OBU_TEST_COMPARE(update_data);
+    OBU_TEST_COMPARE(temporal_update);
+    OBU_TEST_COMPARE(segment_id_pre_skip);
+    OBU_TEST_COMPARE(last_active_segment_id);
+    for (int i = 0; i < kMaxSegments; ++i) {
+      for (int j = 0; j < kSegmentFeatureMax; ++j) {
+        OBU_TEST_COMPARE(feature_enabled[i][j]);
+        OBU_TEST_COMPARE(feature_data[i][j]);
+      }
+    }
+  }
+
+  void VerifyDeltaParameters(const Delta& expected, const Delta& actual) {
+    OBU_TEST_COMPARE(present);
+    OBU_TEST_COMPARE(scale);
+    OBU_TEST_COMPARE(multi);
+  }
+
+  void VerifyCdefParameters(const Cdef& expected) {
+    const Cdef& actual = obu_->frame_header().cdef;
+    OBU_TEST_COMPARE(damping);
+    OBU_TEST_COMPARE(bits);
+    for (int i = 0; i < (1 << actual.bits); ++i) {
+      OBU_TEST_COMPARE(y_primary_strength[i]);
+      OBU_TEST_COMPARE(y_secondary_strength[i]);
+      OBU_TEST_COMPARE(uv_primary_strength[i]);
+      OBU_TEST_COMPARE(uv_secondary_strength[i]);
+    }
+  }
+
+  void VerifyLoopRestorationParameters(const LoopRestoration& expected) {
+    const LoopRestoration& actual = obu_->frame_header().loop_restoration;
+    for (int i = 0; i < kMaxPlanes; ++i) {
+      OBU_TEST_COMPARE(type[i]);
+      OBU_TEST_COMPARE(unit_size_log2[i]);
+    }
+  }
+
+  void VerifyGlobalMotionParameters(
+      const std::array<GlobalMotion, kNumReferenceFrameTypes>& gold) {
+    for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) {
+      const GlobalMotion& expected = gold[i];
+      const GlobalMotion& actual = obu_->frame_header().global_motion[i];
+      OBU_TEST_COMPARE(type) << " i: " << i;
+      for (int j = 0; j < 6; ++j) {
+        OBU_TEST_COMPARE(params[j]) << " i: " << i << " j: " << j;
+      }
+    }
+  }
+
+  void VerifyFilmGrainParameters(const FilmGrainParams& expected) {
+    const FilmGrainParams& actual = obu_->frame_header().film_grain_params;
+    OBU_TEST_COMPARE(apply_grain);
+    OBU_TEST_COMPARE(update_grain);
+    OBU_TEST_COMPARE(chroma_scaling_from_luma);
+    OBU_TEST_COMPARE(overlap_flag);
+    OBU_TEST_COMPARE(clip_to_restricted_range);
+    OBU_TEST_COMPARE(num_y_points);
+    OBU_TEST_COMPARE(num_u_points);
+    OBU_TEST_COMPARE(num_v_points);
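+    // The AV1 film grain arrays hold up to 14 luma points, 10 points per
+    // chroma plane, 24 luma AR coefficients and 25 chroma AR coefficients.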
+    for (int i = 0; i < 14; ++i) {
+      OBU_TEST_COMPARE(point_y_value[i]);
+      OBU_TEST_COMPARE(point_y_scaling[i]);
+    }
+    for (int i = 0; i < 10; ++i) {
+      OBU_TEST_COMPARE(point_u_value[i]);
+      OBU_TEST_COMPARE(point_u_scaling[i]);
+    }
+    for (int i = 0; i < 10; ++i) {
+      OBU_TEST_COMPARE(point_v_value[i]);
+      OBU_TEST_COMPARE(point_v_scaling[i]);
+    }
+    OBU_TEST_COMPARE(chroma_scaling);
+    OBU_TEST_COMPARE(auto_regression_coeff_lag);
+    for (int i = 0; i < 24; ++i) {
+      OBU_TEST_COMPARE(auto_regression_coeff_y[i]);
+    }
+    for (int i = 0; i < 25; ++i) {
+      OBU_TEST_COMPARE(auto_regression_coeff_u[i]);
+    }
+    for (int i = 0; i < 25; ++i) {
+      OBU_TEST_COMPARE(auto_regression_coeff_v[i]);
+    }
+    OBU_TEST_COMPARE(auto_regression_shift);
+    OBU_TEST_COMPARE(grain_seed);
+    OBU_TEST_COMPARE(reference_index);
+    OBU_TEST_COMPARE(grain_scale_shift);
+    OBU_TEST_COMPARE(u_multiplier);
+    OBU_TEST_COMPARE(u_luma_multiplier);
+    OBU_TEST_COMPARE(u_offset);
+    OBU_TEST_COMPARE(v_multiplier);
+    OBU_TEST_COMPARE(v_luma_multiplier);
+    OBU_TEST_COMPARE(v_offset);
+  }
+
+  void VerifyTileInfoParameters(const TileInfo& expected) {
+    const TileInfo& actual = obu_->frame_header().tile_info;
+    OBU_TEST_COMPARE(uniform_spacing);
+    OBU_TEST_COMPARE(tile_columns_log2);
+    OBU_TEST_COMPARE(tile_columns);
+    for (int i = 0; i < kMaxTileColumns + 1; ++i) {
+      OBU_TEST_COMPARE(tile_column_start[i]) << "tile_column: " << i;
+      OBU_TEST_COMPARE(tile_column_width_in_superblocks[i])
+          << "tile_column: " << i;
+    }
+    OBU_TEST_COMPARE(tile_rows_log2);
+    OBU_TEST_COMPARE(tile_rows);
+    for (int i = 0; i < kMaxTileRows + 1; ++i) {
+      OBU_TEST_COMPARE(tile_row_start[i]) << "tile_row: " << i;
+      OBU_TEST_COMPARE(tile_row_height_in_superblocks[i]) << "tile_row: " << i;
+    }
+    OBU_TEST_COMPARE(tile_count);
+    OBU_TEST_COMPARE(context_update_id);
+    OBU_TEST_COMPARE(tile_size_bytes);
+  }
+
+  void VerifySequenceHeader(const ObuSequenceHeader& expected) {
+    EXPECT_TRUE(obu_->sequence_header_changed());
+    const ObuSequenceHeader& actual = obu_->sequence_header();
+    OBU_TEST_COMPARE(profile);
+    OBU_TEST_COMPARE(still_picture);
+    OBU_TEST_COMPARE(reduced_still_picture_header);
+    OBU_TEST_COMPARE(operating_points);
+    for (int i = 0; i < actual.operating_points; ++i) {
+      OBU_TEST_COMPARE(operating_point_idc[i]) << "i: " << i;
+      OBU_TEST_COMPARE(level[i].major) << "i: " << i;
+      OBU_TEST_COMPARE(level[i].minor) << "i: " << i;
+      OBU_TEST_COMPARE(tier[i]) << "i: " << i;
+    }
+    OBU_TEST_COMPARE(frame_width_bits);
+    OBU_TEST_COMPARE(frame_height_bits);
+    OBU_TEST_COMPARE(max_frame_width);
+    OBU_TEST_COMPARE(max_frame_height);
+    OBU_TEST_COMPARE(frame_id_numbers_present);
+    if (actual.frame_id_numbers_present) {
+      OBU_TEST_COMPARE(frame_id_length_bits);
+      OBU_TEST_COMPARE(delta_frame_id_length_bits);
+    }
+    OBU_TEST_COMPARE(use_128x128_superblock);
+    OBU_TEST_COMPARE(enable_filter_intra);
+    OBU_TEST_COMPARE(enable_intra_edge_filter);
+    OBU_TEST_COMPARE(enable_interintra_compound);
+    OBU_TEST_COMPARE(enable_masked_compound);
+    OBU_TEST_COMPARE(enable_warped_motion);
+    OBU_TEST_COMPARE(enable_dual_filter);
+    OBU_TEST_COMPARE(enable_order_hint);
+    OBU_TEST_COMPARE(enable_jnt_comp);
+    OBU_TEST_COMPARE(enable_ref_frame_mvs);
+    OBU_TEST_COMPARE(choose_screen_content_tools);
+    OBU_TEST_COMPARE(force_screen_content_tools);
+    OBU_TEST_COMPARE(choose_integer_mv);
+    OBU_TEST_COMPARE(force_integer_mv);
+    OBU_TEST_COMPARE(order_hint_bits);
+    OBU_TEST_COMPARE(enable_superres);
+    OBU_TEST_COMPARE(enable_cdef);
+    OBU_TEST_COMPARE(enable_restoration);
+    OBU_TEST_COMPARE(color_config.bitdepth);
+    OBU_TEST_COMPARE(color_config.is_monochrome);
+    OBU_TEST_COMPARE(color_config.color_range);
+    OBU_TEST_COMPARE(color_config.subsampling_x);
+    OBU_TEST_COMPARE(color_config.subsampling_y);
+    OBU_TEST_COMPARE(color_config.chroma_sample_position);
+    OBU_TEST_COMPARE(timing_info_present_flag);
+    OBU_TEST_COMPARE(timing_info.num_units_in_tick);
+    OBU_TEST_COMPARE(timing_info.time_scale);
+    OBU_TEST_COMPARE(timing_info.equal_picture_interval);
+    OBU_TEST_COMPARE(timing_info.num_ticks_per_picture);
+    OBU_TEST_COMPARE(decoder_model_info_present_flag);
+    OBU_TEST_COMPARE(decoder_model_info.encoder_decoder_buffer_delay_length);
+    OBU_TEST_COMPARE(decoder_model_info.num_units_in_decoding_tick);
+    OBU_TEST_COMPARE(decoder_model_info.buffer_removal_time_length);
+    OBU_TEST_COMPARE(decoder_model_info.frame_presentation_time_length);
+    for (int i = 0; i < actual.operating_points; ++i) {
+      SCOPED_TRACE("i: " + std::to_string(i));
+      OBU_TEST_COMPARE(operating_parameters.decoder_buffer_delay[i]);
+      OBU_TEST_COMPARE(operating_parameters.encoder_buffer_delay[i]);
+      OBU_TEST_COMPARE(operating_parameters.low_delay_mode_flag[i]);
+      OBU_TEST_COMPARE(initial_display_delay[i]);
+    }
+    OBU_TEST_COMPARE(film_grain_params_present);
+  }
+
+  void VerifyMetadataHdrCll(const ObuMetadataHdrCll& expected) {
+    EXPECT_TRUE(obu_->current_frame_->hdr_cll_set());
+    const ObuMetadataHdrCll& actual = obu_->current_frame_->hdr_cll();
+    OBU_TEST_COMPARE(max_cll);
+    OBU_TEST_COMPARE(max_fall);
+  }
+
+  void VerifyMetadataHdrMdcv(const ObuMetadataHdrMdcv& expected) {
+    EXPECT_TRUE(obu_->current_frame_->hdr_mdcv_set());
+    const ObuMetadataHdrMdcv& actual = obu_->current_frame_->hdr_mdcv();
+    for (int i = 0; i < 3; ++i) {
+      OBU_TEST_COMPARE(primary_chromaticity_x[i]);
+      OBU_TEST_COMPARE(primary_chromaticity_y[i]);
+    }
+    OBU_TEST_COMPARE(white_point_chromaticity_x);
+    OBU_TEST_COMPARE(white_point_chromaticity_y);
+    OBU_TEST_COMPARE(luminance_max);
+    OBU_TEST_COMPARE(luminance_min);
+  }
+
+  void VerifyMetadataItutT35(const ObuMetadataItutT35& expected) {
+    EXPECT_TRUE(obu_->current_frame_->itut_t35_set());
+    const ObuMetadataItutT35& actual = obu_->current_frame_->itut_t35();
+    OBU_TEST_COMPARE(country_code);
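+    // Per ITU-T T.35, a country code of 0xFF signals that an extension byte
+    // follows.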
+    if (actual.country_code == 0xFF) {
+      OBU_TEST_COMPARE(country_code_extension_byte);
+    }
+    ASSERT_EQ(expected.payload_size, actual.payload_size);
+    if (actual.payload_size != 0) {
+      EXPECT_EQ(memcmp(expected.payload_bytes, actual.payload_bytes,
+                       actual.payload_size),
+                0);
+    }
+  }
+
+#undef OBU_TEST_COMPARE
+
+  // Accessors to private members of ObuParser. This avoids the need for a
+  // dependency on a googletest header in the main library for FRIEND_TEST()
+  // (or the need to duplicate the implementation).
+  bool ObuParseFrameParameters() { return obu_->ParseFrameParameters(); }
+  bool ObuParseLoopFilterParameters() {
+    return obu_->ParseLoopFilterParameters();
+  }
+  bool ObuParseLoopFilterDeltaParameters() {
+    return obu_->ParseLoopFilterDeltaParameters();
+  }
+  bool ObuParseQuantizerParameters() {
+    return obu_->ParseQuantizerParameters();
+  }
+  bool ObuParseQuantizerIndexDeltaParameters() {
+    return obu_->ParseQuantizerIndexDeltaParameters();
+  }
+  void ObuComputeSegmentLosslessAndQIndex() {
+    obu_->ComputeSegmentLosslessAndQIndex();
+  }
+  bool ObuParseCdefParameters() { return obu_->ParseCdefParameters(); }
+  bool ObuParseLoopRestorationParameters() {
+    return obu_->ParseLoopRestorationParameters();
+  }
+  bool ObuParseTxModeSyntax() { return obu_->ParseTxModeSyntax(); }
+  bool ObuIsSkipModeAllowed() { return obu_->IsSkipModeAllowed(); }
+  bool ObuParseSkipModeParameters() { return obu_->ParseSkipModeParameters(); }
+  bool ObuReadAllowWarpedMotion() { return obu_->ReadAllowWarpedMotion(); }
+  bool ObuSetFrameReferences(int8_t last_frame_idx, int8_t gold_frame_idx) {
+    return obu_->SetFrameReferences(last_frame_idx, gold_frame_idx);
+  }
+
+  std::unique_ptr<BufferPool> buffer_pool_;
+  DecoderState decoder_state_;
+  std::unique_ptr<ObuParser> obu_;
+  // The following members are reset with each Init().
+  Vector<ObuHeader>* obu_headers_;
+  ObuFrameHeader* obu_frame_header_;
+  ObuSequenceHeader* obu_sequence_header_;
+  RefCountedBufferPtr current_frame_;
+};
+
+TEST_F(ObuParserTest, InvalidInputs) {
+  obu_.reset(new (std::nothrow)
+                 ObuParser(nullptr, 0, 0, buffer_pool_.get(), &decoder_state_));
+  EXPECT_EQ(obu_->ParseOneFrame(&current_frame_), kStatusInvalidArgument);
+  obu_.reset(new (std::nothrow) ObuParser(nullptr, 10, 0, buffer_pool_.get(),
+                                          &decoder_state_));
+  EXPECT_EQ(obu_->ParseOneFrame(&current_frame_), kStatusInvalidArgument);
+  obu_.reset(new (std::nothrow)
+                 ObuParser(kDefaultTemporalDelimiter.data(), 0, 0,
+                           buffer_pool_.get(), &decoder_state_));
+  EXPECT_EQ(obu_->ParseOneFrame(&current_frame_), kStatusInvalidArgument);
+}
+
+TEST_F(ObuParserTest, TemporalDelimiter) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultTemporalDelimiter);
+
+  ASSERT_TRUE(Parse(data.GenerateData()));
+  EXPECT_EQ(obu_->obu_headers().size(), 1);
+  EXPECT_EQ(obu_->obu_headers().back().type, kObuTemporalDelimiter);
+  VerifyObuHeader(false);
+
+  // forbidden_bit is not zero.
+  data.SetBit(0, 1);
+  EXPECT_FALSE(Parse(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, HeaderExtensions) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultTemporalDelimiterWithExtension);
+
+  ASSERT_TRUE(Parse(data.GenerateData()));
+  EXPECT_EQ(obu_->obu_headers().size(), 1);
+  EXPECT_EQ(obu_->obu_headers().back().type, kObuTemporalDelimiter);
+  VerifyObuHeader(true);
+
+  // The extension flag is set, but no extension bytes follow.
+  data.Clear();
+  data.AppendByte(kDefaultTemporalDelimiterWithExtension[0]);
+  EXPECT_FALSE(Parse(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, HeaderHasSizeFieldNotSet) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultHeaderWithoutSizeField);
+
+  EXPECT_FALSE(Parse(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, SequenceHeader) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+  ObuSequenceHeader gold;
+  DefaultSequenceHeader(&gold);
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderLevel) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+  ObuSequenceHeader gold;
+  DefaultSequenceHeader(&gold);
+
+  // Set seq_level_idx to 1, which corresponds to level 2.1.
+  gold.level[0].major = 2;
+  gold.level[0].minor = 1;
+  data.SetLiteral(24, 5, 1);  // level.
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+
+  // Set operating_point_idc of operating point 1 to 0x101 (temporal layer 0
+  // and spatial layer 0 should be decoded). Set level of operating point 1 to
+  // 8 (4.0) and tier to 1.
+  gold.operating_points = 2;
+  gold.operating_point_idc[1] = (1 << 0) | (1 << (0 + 8));
+  gold.level[1].major = 4;
+  gold.level[1].minor = 0;
+  gold.tier[1] = 1;
+  data.SetLiteral(7, 5, gold.operating_points - 1);
+  data.InsertLiteral(29, 12, 0x101);  // operating_point_idc.
+  data.InsertLiteral(41, 5, 8);       // level.
+  data.InsertBit(46, gold.tier[1]);
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderProfile) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+  ObuSequenceHeader gold;
+  DefaultSequenceHeader(&gold);
+
+  gold.still_picture = true;
+  data.SetBit(3, static_cast<uint8_t>(gold.still_picture));
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+
+  // profile 2; bitdepth 8;
+  gold.profile = kProfile2;
+  gold.color_config.bitdepth = 8;
+  gold.color_config.subsampling_x = 1;
+  gold.color_config.subsampling_y = 0;
+  data.SetLiteral(0, 3, gold.profile);
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+
+  // profile 2; bitdepth 10;
+  gold.color_config.bitdepth = 10;
+  data.SetBit(73, 1);     // high_bitdepth.
+  data.InsertBit(74, 0);  // twelve_bit.
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+
+  // profile 2; bitdepth 12;
+  gold.color_config.bitdepth = 12;
+  gold.color_config.subsampling_y = 1;
+  data.SetBit(74, 1);     // twelve_bit.
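+  // In profile 2 at 12-bit, subsampling_x is coded explicitly, and
+  // subsampling_y is coded when subsampling_x is 1.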
+  data.InsertBit(78, 1);  // subsampling_x.
+  data.InsertBit(79, 1);  // subsampling_y.
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderIdLength) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+  ObuSequenceHeader gold;
+  DefaultSequenceHeader(&gold);
+
+  gold.frame_id_numbers_present = true;
+  gold.delta_frame_id_length_bits = kDeltaFrameIdLengthBits;
+  gold.frame_id_length_bits = kFrameIdLengthBits;
+  data.SetBit(54, 1);  // frame_id_numbers_present.
+  data.InsertLiteral(55, 4, kDeltaFrameIdLengthBits - 2);
+  data.InsertLiteral(59, 3, kFrameIdLengthBits - kDeltaFrameIdLengthBits - 1);
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+}
+
+// An idLen greater than 16 is invalid.
+TEST_F(ObuParserTest, SequenceHeaderIdLengthInvalid) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+
+  data.SetBit(54, 1);  // frame_id_numbers_present.
+  data.InsertLiteral(55, 4, kDeltaFrameIdLengthBits - 2);
+  data.InsertLiteral(59, 3, 17 - kDeltaFrameIdLengthBits - 1);  // idLen = 17.
+
+  ASSERT_FALSE(ParseSequenceHeader(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, SequenceHeaderFlags) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+  ObuSequenceHeader gold;
+  DefaultSequenceHeader(&gold);
+
+  gold.enable_warped_motion = true;
+  gold.enable_superres = true;
+  data.SetBit(60, 1);  // enable_warped_motion.
+  data.SetBit(70, 1);  // enable_superres.
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderForceScreenContentToolsEqualTo0) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+  ObuSequenceHeader gold;
+  DefaultSequenceHeader(&gold);
+
+  gold.choose_screen_content_tools = false;
+  gold.force_screen_content_tools = 0;
+  gold.choose_integer_mv = false;
+  gold.force_integer_mv = 2;
+  data.SetBit(65, 0);  // choose_screen_content_tools.
+  data.SetBit(66, 0);  // force_screen_content_tools.
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderMonochrome) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+  ObuSequenceHeader gold;
+  DefaultSequenceHeader(&gold);
+
+  gold.color_config.is_monochrome = true;
+  gold.color_config.color_range = kColorRangeFull;
+  data.SetBit(74, 1);     // monochrome.
+  data.InsertBit(76, 1);  // color_range.
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+}
+
+// This tests TimingInfo, DecoderModelInfo and OperatingParameters. The test
+// is long, but it is the simplest way to test all three since they depend on
+// one another.
+TEST_F(ObuParserTest, SequenceHeaderTimingInfo) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+  ObuSequenceHeader gold;
+  DefaultSequenceHeader(&gold);
+
+  gold.timing_info_present_flag = true;
+  gold.timing_info.num_units_in_tick = 100;
+  gold.timing_info.time_scale = 1000;
+  gold.timing_info.equal_picture_interval = false;
+  gold.decoder_model_info_present_flag = false;
+  data.SetBit(5, static_cast<uint8_t>(gold.timing_info_present_flag));
+  data.InsertLiteral(6, 32, gold.timing_info.num_units_in_tick);
+  data.InsertLiteral(38, 32, gold.timing_info.time_scale);
+  data.InsertBit(70,
+                 static_cast<uint8_t>(gold.timing_info.equal_picture_interval));
+  data.InsertBit(71,
+                 static_cast<uint8_t>(gold.decoder_model_info_present_flag));
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+
+  gold.timing_info.equal_picture_interval = true;
+  gold.timing_info.num_ticks_per_picture = 7;
+  data.SetBit(70,
+              static_cast<uint8_t>(gold.timing_info.equal_picture_interval));
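+  // uvlc(6) is coded as two leading zeros, a marker one and two value bits,
+  // i.e. 5 bits in total.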
+  EXPECT_EQ(data.InsertUvlc(71, gold.timing_info.num_ticks_per_picture - 1), 5);
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+
+  gold.decoder_model_info_present_flag = true;
+  gold.decoder_model_info.encoder_decoder_buffer_delay_length = 5;
+  gold.decoder_model_info.num_units_in_decoding_tick = 1000;
+  gold.decoder_model_info.buffer_removal_time_length = 18;
+  gold.decoder_model_info.frame_presentation_time_length = 20;
+
+  data.SetBit(76, static_cast<uint8_t>(gold.decoder_model_info_present_flag));
+  data.InsertLiteral(
+      77, 5, gold.decoder_model_info.encoder_decoder_buffer_delay_length - 1);
+  data.InsertLiteral(82, 32,
+                     gold.decoder_model_info.num_units_in_decoding_tick);
+  data.InsertLiteral(114, 5,
+                     gold.decoder_model_info.buffer_removal_time_length - 1);
+  data.InsertLiteral(
+      119, 5, gold.decoder_model_info.frame_presentation_time_length - 1);
+  data.InsertBit(147, 0);  // decoder_model_present_for_this_op.
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+
+  gold.operating_parameters.decoder_buffer_delay[0] = 10;
+  gold.operating_parameters.encoder_buffer_delay[0] = 20;
+  gold.operating_parameters.low_delay_mode_flag[0] = true;
+
+  data.SetBit(147, 1);  // decoder_model_present_for_this_op.
+  data.InsertLiteral(
+      148, gold.decoder_model_info.encoder_decoder_buffer_delay_length,
+      gold.operating_parameters.decoder_buffer_delay[0]);
+  data.InsertLiteral(
+      153, gold.decoder_model_info.encoder_decoder_buffer_delay_length,
+      gold.operating_parameters.encoder_buffer_delay[0]);
+  data.InsertBit(158, static_cast<uint8_t>(
+                          gold.operating_parameters.low_delay_mode_flag[0]));
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderInitialDisplayDelay) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+  ObuSequenceHeader gold;
+  DefaultSequenceHeader(&gold);
+
+  gold.initial_display_delay[0] = 8;
+
+  data.SetBit(6, 1);      // initial_display_delay_present_flag.
+  data.InsertBit(29, 1);  // initial_display_delay_present_for_this_op.
+  data.InsertLiteral(30, 4, gold.initial_display_delay[0] - 1);
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+}
+
+// Parsing of a frame header should fail if no sequence header has been
+// received.
+TEST_F(ObuParserTest, FrameHeaderWithoutSequenceHeader) {
+  // The aom-test-data test vector av1-1-b8-01-size-16x16.ivf has two temporal
+  // units. The first temporal unit has a presentation timestamp of 0 and
+  // consists of three OBUs: a temporal delimiter OBU, a sequence header OBU,
+  // and a frame OBU.
+  const std::vector<uint8_t> kTemporalDelimiter = {0x12, 0x00};
+  const std::vector<uint8_t> kSequenceHeader = {
+      0x0a, 0x0a, 0x00, 0x00, 0x00, 0x01, 0x9f, 0xfb, 0xff, 0xf3, 0x00, 0x80};
+  const std::vector<uint8_t> kFrame = {
+      0x32, 0xa6, 0x01, 0x10, 0x00, 0x87, 0x80, 0x00, 0x03, 0x00, 0x00, 0x00,
+      0x40, 0x00, 0x9e, 0x86, 0x5b, 0xb2, 0x22, 0xb5, 0x58, 0x4d, 0x68, 0xe6,
+      0x37, 0x54, 0x42, 0x7b, 0x84, 0xce, 0xdf, 0x9f, 0xec, 0xab, 0x07, 0x4d,
+      0xf6, 0xe1, 0x5e, 0x9e, 0x27, 0xbf, 0x93, 0x2f, 0x47, 0x0d, 0x7b, 0x7c,
+      0x45, 0x8d, 0xcf, 0x26, 0xf7, 0x6c, 0x06, 0xd7, 0x8c, 0x2e, 0xf5, 0x2c,
+      0xb0, 0x8a, 0x31, 0xac, 0x69, 0xf5, 0xcd, 0xd8, 0x71, 0x5d, 0xaf, 0xf8,
+      0x96, 0x43, 0x8c, 0x9c, 0x23, 0x6f, 0xab, 0xd0, 0x35, 0x43, 0xdf, 0x81,
+      0x12, 0xe3, 0x7d, 0xec, 0x22, 0xb0, 0x30, 0x54, 0x32, 0x9f, 0x90, 0xc0,
+      0x5d, 0x64, 0x9b, 0x0f, 0x75, 0x31, 0x84, 0x3a, 0x57, 0xd7, 0x5f, 0x03,
+      0x6e, 0x7f, 0x43, 0x17, 0x6d, 0x08, 0xc3, 0x81, 0x8a, 0xae, 0x73, 0x1c,
+      0xa8, 0xa7, 0xe4, 0x9c, 0xa9, 0x5b, 0x3f, 0xd1, 0xeb, 0x75, 0x3a, 0x7f,
+      0x22, 0x77, 0x38, 0x64, 0x1c, 0x77, 0xdb, 0xcd, 0xef, 0xb7, 0x08, 0x45,
+      0x8e, 0x7f, 0xea, 0xa3, 0xd0, 0x81, 0xc9, 0xc1, 0xbc, 0x93, 0x9b, 0x41,
+      0xb1, 0xa1, 0x42, 0x17, 0x98, 0x3f, 0x1e, 0x95, 0xdf, 0x68, 0x7c, 0xb7,
+      0x98};
+
+  BytesAndBits data;
+  data.AppendBytes(kTemporalDelimiter);
+  // Skip the sequence header OBU.
+  data.AppendBytes(kFrame);
+  ASSERT_FALSE(Parse(data.GenerateData()));
+
+  // Now verify that all three OBUs are correct by adding them to |data|
+  // successively.
+  data.Clear();
+  data.AppendBytes(kTemporalDelimiter);
+  ASSERT_TRUE(Parse(data.GenerateData()));
+  data.Clear();
+  data.AppendBytes(kTemporalDelimiter);
+  data.AppendBytes(kSequenceHeader);
+  ASSERT_TRUE(Parse(data.GenerateData()));
+  data.Clear();
+  data.AppendBytes(kTemporalDelimiter);
+  data.AppendBytes(kSequenceHeader);
+  data.AppendBytes(kFrame);
+  ASSERT_TRUE(Parse(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, FrameParameterShowExistingFrame) {
+  BytesAndBits data;
+  data.AppendBit(1);                    // show_existing_frame.
+  data.AppendLiteral(3, kFrameToShow);  // frame_to_show.
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+  gold.show_existing_frame = true;
+  gold.frame_to_show = kFrameToShow;
+
+  // kFrameToShow'th frame is not yet decoded.
+  ASSERT_FALSE(ParseFrameParameters(data.GenerateData()));
+
+  decoder_state_.reference_frame[kFrameToShow] = buffer_pool_->GetFreeBuffer();
+  // kFrameToShow'th frame is not a showable frame.
+  ASSERT_FALSE(ParseFrameParameters(data.GenerateData()));
+
+  decoder_state_.reference_frame[kFrameToShow]->set_showable_frame(true);
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParametersShowExistingFrameWithDisplayFrameId) {
+  BytesAndBits data;
+  data.AppendBit(1);                        // show_existing_frame.
+  data.AppendLiteral(3, kFrameToShow);      // frame_to_show.
+  data.AppendLiteral(15, kDisplayFrameId);  // display_frame_id.
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+  gold.show_existing_frame = true;
+  gold.frame_to_show = kFrameToShow;
+  gold.display_frame_id = kDisplayFrameId;
+
+  // kFrameToShow'th frame is not yet decoded.
+  ASSERT_FALSE(ParseFrameParameters(data.GenerateData(), true));
+
+  decoder_state_.reference_frame_id[kFrameToShow] = kDisplayFrameId;
+  decoder_state_.reference_frame[kFrameToShow] = buffer_pool_->GetFreeBuffer();
+  // kFrameToShow'th frame is not a showable frame.
+  ASSERT_FALSE(ParseFrameParameters(data.GenerateData(), true));
+
+  decoder_state_.reference_frame[kFrameToShow]->set_showable_frame(true);
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), true));
+  VerifyFrameParameters(gold, true);
+}
+
+TEST_F(ObuParserTest, FrameParameterShowExistingFrameTemporalPointInfo) {
+  BytesAndBits data;
+  data.AppendBit(1);                    // show_existing_frame.
+  data.AppendLiteral(3, kFrameToShow);  // frame_to_show.
+  data.AppendLiteral(20, 38);           // frame_presentation_time.
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+  gold.show_existing_frame = true;
+  gold.frame_to_show = kFrameToShow;
+  gold.frame_presentation_time = 38;
+
+  EXPECT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->frame_width_bits = kFrameWidthBits;
+  obu_sequence_header_->frame_height_bits = kFrameHeightBits;
+  obu_sequence_header_->max_frame_width = kWidth;
+  obu_sequence_header_->max_frame_height = kHeight;
+
+  obu_sequence_header_->decoder_model_info_present_flag = true;
+  obu_sequence_header_->decoder_model_info.frame_presentation_time_length = 20;
+
+  decoder_state_.reference_frame[kFrameToShow] = buffer_pool_->GetFreeBuffer();
+  decoder_state_.reference_frame[kFrameToShow]->set_showable_frame(true);
+
+  ASSERT_TRUE(ObuParseFrameParameters());
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterErrorResilientMode) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameIntraOnly);
+
+  gold.error_resilient_mode = true;
+  data.SetBit(4, static_cast<uint8_t>(gold.error_resilient_mode));
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrame) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameTemporalPointInfo) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+
+  data.InsertLiteral(4, 20, 38);  // frame_presentation_time.
+  gold.frame_presentation_time = 38;
+
+  EXPECT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->frame_width_bits = kFrameWidthBits;
+  obu_sequence_header_->frame_height_bits = kFrameHeightBits;
+  obu_sequence_header_->max_frame_width = kWidth;
+  obu_sequence_header_->max_frame_height = kHeight;
+
+  obu_sequence_header_->decoder_model_info_present_flag = true;
+  obu_sequence_header_->decoder_model_info.frame_presentation_time_length = 20;
+
+  ASSERT_TRUE(ObuParseFrameParameters());
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameOverrideSize) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+
+  OverrideFrameSize(&data, &gold, 5, 6);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+
+  OverrideRenderSize(&data, &gold, 23);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameSuperRes) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+  gold.use_superres = true;
+  gold.superres_scale_denominator = 15;
+  gold.width = kWidth * 8 / 15;
+  gold.columns4x4 = 58;
+
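+  // The 3-bit coded_denom field stores superres_scale_denominator - 9, where
+  // 9 is the minimum denominator.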
+  data.SetBit(6, static_cast<int>(gold.use_superres));
+  data.SetLiteral(7, 3, gold.superres_scale_denominator - 9);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 0, 0, true));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameAllowScreenContentTools) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+
+  data.InsertBit(5, 1);  // allow_screen_content_tools.
+  data.InsertBit(8, 1);  // allow_intrabc.
+  gold.allow_screen_content_tools = true;
+  gold.allow_intrabc = true;
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 2));
+  VerifyFrameParameters(gold);
+
+  data.InsertBit(6, 1);  // force_integer_mv.
+  gold.force_integer_mv = 1;
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 2, 2));
+  VerifyFrameParameters(gold);
+
+  data.SetBit(6, 0);  // force_integer_mv.
+
+  // |gold| need not be updated because force_integer_mv is always 1 for key
+  // frames.
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 2, 2));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterIntraOnlyFrame) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameIntraOnly);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterIntraOnlyFrameOverrideSize) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameIntraOnly);
+
+  OverrideFrameSize(&data, &gold, 6, 15);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+
+  OverrideRenderSize(&data, &gold, 32);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+// An INTRA_ONLY_FRAME cannot set refresh_frame_flags to 0xff.
+TEST_F(ObuParserTest, FrameParameterIntraOnlyFrameRefreshAllFrames) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+  data.SetLiteral(7, 8, 0xFF);  // refresh_frame_flags.
+
+  ASSERT_FALSE(ParseFrameParameters(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, FrameParameterInterFrame) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderInterFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameInter);
+  ObuFrameHeader reference_frame_header;
+  reference_frame_header.width = kWidth;
+  reference_frame_header.height = kHeight;
+  reference_frame_header.render_width = kWidth;
+  reference_frame_header.render_height = kHeight;
+  reference_frame_header.upscaled_width = kWidth;
+  reference_frame_header.rows4x4 = kRows4x4;
+  reference_frame_header.columns4x4 = kColumns4x4;
+  reference_frame_header.refresh_frame_flags = 0;
+  for (auto& reference_frame : decoder_state_.reference_frame) {
+    reference_frame = buffer_pool_->GetFreeBuffer();
+    EXPECT_TRUE(reference_frame->SetFrameDimensions(reference_frame_header));
+  }
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterInterFrameOverrideSize) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderInterFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameInter);
+  ObuFrameHeader reference_frame_header;
+  reference_frame_header.width = kWidth;
+  reference_frame_header.height = kHeight;
+  reference_frame_header.render_width = kWidth;
+  reference_frame_header.render_height = kHeight;
+  reference_frame_header.upscaled_width = kWidth;
+  reference_frame_header.rows4x4 = kRows4x4;
+  reference_frame_header.columns4x4 = kColumns4x4;
+  reference_frame_header.refresh_frame_flags = 0;
+  for (auto& reference_frame : decoder_state_.reference_frame) {
+    reference_frame = buffer_pool_->GetFreeBuffer();
+    EXPECT_TRUE(reference_frame->SetFrameDimensions(reference_frame_header));
+  }
+
+  data.InsertLiteral(39, kNumInterReferenceFrameTypes, 0);  // found_ref.
+  OverrideFrameSize(&data, &gold, 6, 46);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+
+  OverrideRenderSize(&data, &gold, 63);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+// This test verifies we check the following requirement at the end of Section
+// 6.8.4:
+//   If FrameIsIntra is equal to 0 (indicating that this frame may use inter
+//   prediction), the requirements described in the frame size with refs
+//   semantics of section 6.8.6 must also be satisfied.
+TEST_F(ObuParserTest, FrameParameterInterFrameInvalidSize) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderInterFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameInter);
+  ObuFrameHeader reference_frame_header;
+  reference_frame_header.width = kWidth;
+  reference_frame_header.height = 2 * kHeight + 8;
+  reference_frame_header.render_width = kWidth;
+  reference_frame_header.render_height = 2 * kHeight + 8;
+  reference_frame_header.upscaled_width = kWidth;
+  reference_frame_header.rows4x4 = 2 * kRows4x4 + 2;
+  reference_frame_header.columns4x4 = kColumns4x4;
+  reference_frame_header.refresh_frame_flags = 0;
+  for (auto& reference_frame : decoder_state_.reference_frame) {
+    reference_frame = buffer_pool_->GetFreeBuffer();
+    EXPECT_TRUE(reference_frame->SetFrameDimensions(reference_frame_header));
+  }
+
+  EXPECT_FALSE(ParseFrameParameters(data.GenerateData()));
+}
+
+// Tests the ObuParser::SetFrameReferences() method.
+//
+// This method uses the following data members as input:
+//   decoder_state_.reference_order_hint
+//   sequence_header_.enable_order_hint
+//   sequence_header_.order_hint_bits
+//   frame_header_.order_hint
+// So we need to set up these data members before calling
+// ObuParser::SetFrameReferences().
+//
+// The output is in frame_header_.reference_frame_index.
+TEST_F(ObuParserTest, SetFrameReferences) {
+  // All reference frames are forward references (because 9 < 17).
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    decoder_state_.reference_order_hint[i] = 9;
+  }
+
+  ASSERT_TRUE(Init());
+  obu_sequence_header_->enable_order_hint = true;
+  obu_sequence_header_->order_hint_bits = 5;
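+  // The shift places an order hint's sign bit at bit 31 so that relative
+  // distances between (wrapping) order hints sign-extend correctly.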
+  obu_sequence_header_->order_hint_shift_bits =
+      Mod32(32 - obu_sequence_header_->order_hint_bits);
+  obu_frame_header_->order_hint = 17;
+
+  const int8_t last_frame_idx = 0;
+  const int8_t gold_frame_idx = 1;
+
+  // Since all reference frames are forward references, we set the remaining
+  // five references in reverse chronological order. So Last2, Last3, Backward,
+  // Alternate2, and Alternate are set to 7, 6, 5, 4, and 3, respectively.
+
+  EXPECT_TRUE(ObuSetFrameReferences(last_frame_idx, gold_frame_idx));
+
+  EXPECT_EQ(
+      obu_frame_header_
+          ->reference_frame_index[kReferenceFrameLast - kReferenceFrameLast],
+      0);
+  EXPECT_EQ(
+      obu_frame_header_
+          ->reference_frame_index[kReferenceFrameLast2 - kReferenceFrameLast],
+      7);
+  EXPECT_EQ(
+      obu_frame_header_
+          ->reference_frame_index[kReferenceFrameLast3 - kReferenceFrameLast],
+      6);
+  EXPECT_EQ(
+      obu_frame_header_
+          ->reference_frame_index[kReferenceFrameGolden - kReferenceFrameLast],
+      1);
+  EXPECT_EQ(obu_frame_header_->reference_frame_index[kReferenceFrameBackward -
+                                                     kReferenceFrameLast],
+            5);
+  EXPECT_EQ(obu_frame_header_->reference_frame_index[kReferenceFrameAlternate2 -
+                                                     kReferenceFrameLast],
+            4);
+  EXPECT_EQ(obu_frame_header_->reference_frame_index[kReferenceFrameAlternate -
+                                                     kReferenceFrameLast],
+            3);
+}
+
+TEST_F(ObuParserTest, LoopFilterParameters) {
+  LoopFilter gold;
+  memset(&gold, 0, sizeof(gold));
+
+  BytesAndBits data;
+  data.AppendBit(0);  // dummy.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone;
+  obu_frame_header_->coded_lossless = true;
+  gold.ref_deltas[kReferenceFrameIntra] = 1;
+  gold.ref_deltas[kReferenceFrameGolden] = -1;
+  gold.ref_deltas[kReferenceFrameAlternate] = -1;
+  gold.ref_deltas[kReferenceFrameAlternate2] = -1;
+  ASSERT_TRUE(ObuParseLoopFilterParameters());
+  VerifyLoopFilterParameters(gold);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone;
+  obu_frame_header_->allow_intrabc = true;
+  ASSERT_TRUE(ObuParseLoopFilterParameters());
+  VerifyLoopFilterParameters(gold);
+
+  gold.level[0] = 32;
+  gold.level[3] = 48;
+  gold.sharpness = 4;
+  data.Clear();
+  for (const auto& level : gold.level) {
+    data.AppendLiteral(6, level);
+  }
+  data.AppendLiteral(3, gold.sharpness);
+  data.AppendBit(0);  // delta_enabled.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone;
+  ASSERT_TRUE(ObuParseLoopFilterParameters());
+  VerifyLoopFilterParameters(gold);
+
+  gold.delta_enabled = true;
+  gold.delta_update = true;
+  gold.ref_deltas[0] = 20;
+  gold.mode_deltas[0] = -20;
+  data.SetBit(27, 1);  // delta_enabled.
+  data.AppendBit(1);   // delta_update.
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    if (i == 0) {
+      data.AppendBit(1);  // update_ref_delta.
+      data.AppendInverseSignedLiteral(6, gold.ref_deltas[0]);  // ref_delta.
+    } else {
+      data.AppendBit(0);  // update_ref_delta.
+    }
+  }
+  for (int i = 0; i < kLoopFilterMaxModeDeltas; ++i) {
+    if (i == 0) {
+      data.AppendBit(1);  // update_mode_delta.
+      data.AppendInverseSignedLiteral(6, gold.mode_deltas[0]);  // mode_delta.
+    } else {
+      data.AppendBit(0);  // update_mode_delta.
+    }
+  }
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone;
+  ASSERT_TRUE(ObuParseLoopFilterParameters());
+  VerifyLoopFilterParameters(gold);
+}
+
+TEST_F(ObuParserTest, QuantizerParameters) {
+  QuantizerParameters gold = {};
+  gold.base_index = 48;
+
+  BytesAndBits data;
+  data.AppendLiteral(8, gold.base_index);
+  data.AppendLiteral(3, 0);  // delta_coded.
+  data.AppendBit(0);         // use_matrix.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+}
+
+TEST_F(ObuParserTest, QuantizerParametersMonochrome) {
+  QuantizerParameters gold = {};
+  gold.base_index = 48;
+
+  BytesAndBits data;
+  data.AppendLiteral(8, gold.base_index);
+  data.AppendBit(0);  // delta_coded.
+  data.AppendBit(0);  // use_matrix.
+  // The quantizer parameters end here. Add a 1 bit. It should not be parsed.
+  data.AppendBit(1);  // Would be segmentation_enabled in a bitstream.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->color_config.is_monochrome = true;
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+}
+
+TEST_F(ObuParserTest, QuantizerParametersDeltaCoded) {
+  QuantizerParameters gold = {};
+  gold.base_index = 48;
+  gold.delta_dc[kPlaneY] = -30;
+
+  BytesAndBits data;
+  data.AppendLiteral(8, gold.base_index);
+  data.AppendBit(1);  // delta_coded.
+  data.AppendInverseSignedLiteral(6, gold.delta_dc[kPlaneY]);
+  data.AppendLiteral(2, 0);  // delta_coded u dc/ac.
+  data.AppendBit(0);         // use_matrix.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+
+  gold.delta_dc[kPlaneU] = -40;
+  gold.delta_dc[kPlaneV] = gold.delta_dc[kPlaneU];
+  data.SetBit(16, 1);  // delta_coded.
+  data.InsertInverseSignedLiteral(17, 6, gold.delta_dc[kPlaneU]);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+
+  gold.delta_ac[kPlaneU] = 50;
+  gold.delta_ac[kPlaneV] = gold.delta_ac[kPlaneU];
+  data.SetBit(24, 1);  // delta_coded.
+  data.InsertInverseSignedLiteral(25, 6, gold.delta_ac[kPlaneU]);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+
+  gold.delta_dc[kPlaneV] = 60;
+  gold.delta_ac[kPlaneV] = 0;
+  data.InsertBit(16, 1);  // diff_uv_delta.
+  data.InsertBit(33, 1);  // delta_coded.
+  data.InsertInverseSignedLiteral(34, 6, gold.delta_dc[kPlaneV]);
+  data.InsertBit(41, 0);  // delta_coded.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->color_config.separate_uv_delta_q = true;
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+
+  gold.delta_ac[kPlaneV] = -20;
+  data.SetBit(41, 1);  // delta_coded.
+  data.InsertInverseSignedLiteral(42, 6, gold.delta_ac[kPlaneV]);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->color_config.separate_uv_delta_q = true;
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+}
+
+TEST_F(ObuParserTest, QuantizerParametersUseQmatrix) {
+  QuantizerParameters gold = {};
+  gold.base_index = 48;
+  gold.use_matrix = true;
+  gold.matrix_level[kPlaneY] = 3;
+  gold.matrix_level[kPlaneU] = 6;
+  gold.matrix_level[kPlaneV] = gold.matrix_level[kPlaneU];
+
+  // Test three cases.
+  // 1. separate_uv_delta_q = false (which implies diff_uv_delta = false).
+  BytesAndBits data;
+  data.AppendLiteral(8, gold.base_index);
+  data.AppendLiteral(3, 0);  // delta_coded.
+  data.AppendBit(static_cast<uint8_t>(gold.use_matrix));
+  data.AppendLiteral(4, gold.matrix_level[kPlaneY]);
+  data.AppendLiteral(4, gold.matrix_level[kPlaneU]);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+
+  // 2. separate_uv_delta_q = true and diff_uv_delta = false.
+  gold.matrix_level[kPlaneV] = 5;
+  data.InsertBit(9, 0);  // diff_uv_delta.
+  data.AppendLiteral(4, gold.matrix_level[kPlaneV]);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->color_config.separate_uv_delta_q = true;
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+
+  // 3. separate_uv_delta_q = true and diff_uv_delta = true.
+  data.SetBit(9, 1);             // diff_uv_delta.
+  data.InsertLiteral(12, 2, 0);  // delta_coded.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->color_config.separate_uv_delta_q = true;
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+}
+
+TEST_F(ObuParserTest, SegmentationParameters) {
+  const int kPrimaryReferenceNotNone = 1;
+  const int kPrevFrameIndexNotNone = 2;
+
+  // Set up decoder_state_ with a previous frame containing saved segmentation
+  // parameters.
+  decoder_state_.reference_frame[kPrevFrameIndexNotNone] =
+      buffer_pool_->GetFreeBuffer();
+  ASSERT_NE(decoder_state_.reference_frame[kPrevFrameIndexNotNone], nullptr);
+  Segmentation prev_segmentation = {};
+  prev_segmentation.feature_enabled[2][0] = true;
+  prev_segmentation.feature_enabled[5][0] = true;
+  prev_segmentation.last_active_segment_id = 5;
+  decoder_state_.reference_frame[kPrevFrameIndexNotNone]
+      ->SetSegmentationParameters(prev_segmentation);
+
+  Segmentation gold;
+  memset(&gold, 0, sizeof(gold));
+
+  BytesAndBits data;
+  data.AppendBit(0);  // segmentation_enabled.
+
+  // Since segmentation_enabled is false, we expect the parameters to be all
+  // zero/false.
+  ASSERT_TRUE(ParseSegmentationParameters(
+      data.GenerateData(), kPrimaryReferenceNotNone, kPrevFrameIndexNotNone));
+  VerifySegmentationParameters(gold);
+
+  gold.enabled = true;
+  gold.update_map = true;
+  gold.temporal_update = true;
+  data.SetBit(0, static_cast<uint8_t>(gold.enabled));
+  data.AppendBit(static_cast<uint8_t>(gold.update_map));
+  data.AppendBit(static_cast<uint8_t>(gold.temporal_update));
+  data.AppendBit(static_cast<uint8_t>(gold.update_data));
+
+  // Since update_data is false, we expect the parameters to be loaded from the
+  // previous frame in |decoder_state_|. So change |gold| accordingly.
+  gold.feature_enabled[2][0] = true;
+  gold.feature_enabled[5][0] = true;
+  gold.last_active_segment_id = 5;
+
+  ASSERT_TRUE(ParseSegmentationParameters(
+      data.GenerateData(), kPrimaryReferenceNotNone, kPrevFrameIndexNotNone));
+  VerifySegmentationParameters(gold);
+
+  OverrideSegmentation(&data, &gold, 3);
+
+  ASSERT_TRUE(ParseSegmentationParameters(
+      data.GenerateData(), kPrimaryReferenceNotNone, kPrevFrameIndexNotNone));
+  VerifySegmentationParameters(gold);
+
+  // If primary_ref_frame is kPrimaryReferenceNone, these three fields are
+  // implied.
+  data.RemoveBit(1);  // segmentation_update_map.
+  data.RemoveBit(1);  // segmentation_temporal_update.
+  data.RemoveBit(1);  // segmentation_update_data.
+  gold.update_map = true;
+  gold.temporal_update = false;
+  gold.update_data = true;
+
+  // Since update_data is true, we expect the parameters to be read from
+  // |data|.
+  ASSERT_TRUE(ParseSegmentationParameters(data.GenerateData(),
+                                          kPrimaryReferenceNone, 0));
+  VerifySegmentationParameters(gold);
+}
+
+TEST_F(ObuParserTest, QuantizerIndexDeltaParameters) {
+  BytesAndBits data;
+  data.AppendBit(1);         // delta_q_present.
+  data.AppendLiteral(2, 2);  // delta_q_res.
+
+  Delta gold;
+  memset(&gold, 0, sizeof(gold));
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  ASSERT_TRUE(ObuParseQuantizerIndexDeltaParameters());
+  VerifyDeltaParameters(gold, obu_->frame_header().delta_q);
+
+  gold.present = true;
+  gold.scale = 2;
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->quantizer.base_index = 40;
+  ASSERT_TRUE(ObuParseQuantizerIndexDeltaParameters());
+  VerifyDeltaParameters(gold, obu_->frame_header().delta_q);
+}
+
+TEST_F(ObuParserTest, LoopFilterDeltaParameters) {
+  BytesAndBits data;
+  data.AppendBit(1);         // delta_lf_present.
+  data.AppendLiteral(2, 2);  // delta_lf_res.
+  data.AppendBit(1);         // delta_lf_multi.
+
+  Delta gold;
+  memset(&gold, 0, sizeof(gold));
+
+  // delta_q_present is false, so loop filter delta will not be read.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  ASSERT_TRUE(ObuParseLoopFilterDeltaParameters());
+  VerifyDeltaParameters(gold, obu_->frame_header().delta_lf);
+
+  // allow_intrabc is true, so loop filter delta will not be read.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->delta_q.present = true;
+  obu_frame_header_->allow_intrabc = true;
+  ASSERT_TRUE(ObuParseLoopFilterDeltaParameters());
+  VerifyDeltaParameters(gold, obu_->frame_header().delta_lf);
+
+  gold.present = true;
+  gold.scale = 2;
+  gold.multi = true;
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->delta_q.present = true;
+  ASSERT_TRUE(ObuParseLoopFilterDeltaParameters());
+  VerifyDeltaParameters(gold, obu_->frame_header().delta_lf);
+}
+
+TEST_F(ObuParserTest, ComputeSegmentLosslessAndQIndex) {
+  BytesAndBits data;
+  data.AppendBit(0);  // dummy.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+
+  // Segmentation is disabled. All quantizers are 0.
+  ObuComputeSegmentLosslessAndQIndex();
+  EXPECT_TRUE(obu_->frame_header().coded_lossless);
+  EXPECT_TRUE(obu_->frame_header().upscaled_lossless);
+  for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+    EXPECT_EQ(qindex, 0);
+  }
+
+  // Segmentation is enabled. All quantizers are zero.
+  obu_frame_header_->segmentation.enabled = true;
+  ObuComputeSegmentLosslessAndQIndex();
+  EXPECT_TRUE(obu_->frame_header().coded_lossless);
+  EXPECT_TRUE(obu_->frame_header().upscaled_lossless);
+  for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+    EXPECT_EQ(qindex, 0);
+  }
+
+  // Segmentation is enabled. All quantizers are zero. upscaled_width != width.
+  obu_frame_header_->segmentation.enabled = true;
+  obu_frame_header_->upscaled_width = 100;
+  ObuComputeSegmentLosslessAndQIndex();
+  EXPECT_TRUE(obu_->frame_header().coded_lossless);
+  EXPECT_FALSE(obu_->frame_header().upscaled_lossless);
+  for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+    EXPECT_EQ(qindex, 0);
+  }
+
+  // Segmentation is disabled. Some quantizer deltas are nonzero.
+  obu_frame_header_->segmentation.enabled = false;
+  obu_frame_header_->quantizer.delta_dc[kPlaneY] = 40;
+  ObuComputeSegmentLosslessAndQIndex();
+  EXPECT_FALSE(obu_->frame_header().coded_lossless);
+  EXPECT_FALSE(obu_->frame_header().upscaled_lossless);
+  for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+    EXPECT_EQ(qindex, 0);
+  }
+
+  // Segmentation is enabled. Quantizer base index is nonzero.
+  obu_frame_header_->segmentation.enabled = true;
+  obu_frame_header_->quantizer.delta_dc[kPlaneY] = 0;
+  obu_frame_header_->quantizer.base_index = 40;
+  ObuComputeSegmentLosslessAndQIndex();
+  EXPECT_FALSE(obu_->frame_header().coded_lossless);
+  EXPECT_FALSE(obu_->frame_header().upscaled_lossless);
+  for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+    EXPECT_EQ(qindex, 40);
+  }
+}
+
+TEST_F(ObuParserTest, CdefParameters) {
+  Cdef gold;
+  memset(&gold, 0, sizeof(gold));
+  const int coeff_shift = 2;  // bitdepth - 8.
+  gold.damping = 3 + coeff_shift;
+
+  BytesAndBits data;
+  data.AppendBit(0);  // dummy.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->color_config.bitdepth = 10;
+  ASSERT_TRUE(ObuParseCdefParameters());
+  // Cdef will be {0} except for damping because enable_cdef is false.
+  VerifyCdefParameters(gold);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->enable_cdef = true;
+  obu_sequence_header_->color_config.bitdepth = 10;
+  obu_frame_header_->coded_lossless = true;
+  ASSERT_TRUE(ObuParseCdefParameters());
+  // Cdef will be {0} except for damping because coded_lossless is true.
+  VerifyCdefParameters(gold);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->enable_cdef = true;
+  obu_sequence_header_->color_config.bitdepth = 10;
+  obu_frame_header_->allow_intrabc = true;
+  ASSERT_TRUE(ObuParseCdefParameters());
+  // Cdef will be {0} except for damping because allow_intrabc is true.
+  VerifyCdefParameters(gold);
+
+  gold.damping = 5;
+  gold.bits = 1;
+  data.Clear();
+  data.AppendLiteral(2, gold.damping - 3);  // cdef_damping_minus3.
+  gold.damping += coeff_shift;
+  data.AppendLiteral(2, gold.bits);  // cdef_bits.
+  for (int i = 0; i < 2; ++i) {
+    gold.y_primary_strength[i] = 10;
+    gold.y_secondary_strength[i] = (i == 0) ? 2 : 3;
+    gold.uv_primary_strength[i] = 12;
+    gold.uv_secondary_strength[i] = (i == 1) ? 2 : 3;
+    data.AppendLiteral(4, gold.y_primary_strength[i]);
+    data.AppendLiteral(2, gold.y_secondary_strength[i]);
+    data.AppendLiteral(4, gold.uv_primary_strength[i]);
+    data.AppendLiteral(2, gold.uv_secondary_strength[i]);
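+    // A coded secondary strength of 3 is decoded as 4; the value 3 is never
+    // used directly.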
+    if (gold.y_secondary_strength[i] == 3) ++gold.y_secondary_strength[i];
+    if (gold.uv_secondary_strength[i] == 3) ++gold.uv_secondary_strength[i];
+    gold.y_primary_strength[i] <<= coeff_shift;
+    gold.uv_primary_strength[i] <<= coeff_shift;
+    gold.y_secondary_strength[i] <<= coeff_shift;
+    gold.uv_secondary_strength[i] <<= coeff_shift;
+  }
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->enable_cdef = true;
+  obu_sequence_header_->color_config.bitdepth = 10;
+  ASSERT_TRUE(ObuParseCdefParameters());
+  VerifyCdefParameters(gold);
+}
+
+TEST_F(ObuParserTest, LoopRestorationParameters) {
+  for (bool use_128x128_superblock : testing::Bool()) {
+    SCOPED_TRACE("use_128x128_superblock: " +
+                 std::to_string(use_128x128_superblock));
+    LoopRestoration gold;
+    memset(&gold, 0, sizeof(gold));
+
+    BytesAndBits data;
+    data.AppendBit(0);  // dummy.
+
+    // enable_restoration is false. Nothing will be read.
+    ASSERT_TRUE(Init(data.GenerateData()));
+    obu_frame_header_->allow_intrabc = true;
+    obu_frame_header_->coded_lossless = true;
+    ASSERT_TRUE(ObuParseLoopRestorationParameters());
+    VerifyLoopRestorationParameters(gold);
+
+    // allow_intrabc is true. Nothing will be read.
+    ASSERT_TRUE(Init(data.GenerateData()));
+    obu_frame_header_->allow_intrabc = true;
+    obu_sequence_header_->enable_restoration = true;
+    ASSERT_TRUE(ObuParseLoopRestorationParameters());
+    VerifyLoopRestorationParameters(gold);
+
+    // coded_lossless is true. Nothing will be read.
+    ASSERT_TRUE(Init(data.GenerateData()));
+    obu_frame_header_->coded_lossless = true;
+    obu_sequence_header_->enable_restoration = true;
+    ASSERT_TRUE(ObuParseLoopRestorationParameters());
+    VerifyLoopRestorationParameters(gold);
+
+    data.Clear();
+    for (int i = 0; i < kMaxPlanes; ++i) {
+      data.AppendLiteral(2, kLoopRestorationTypeNone);  // lr_type.
+    }
+
+    ASSERT_TRUE(Init(data.GenerateData()));
+    obu_sequence_header_->enable_restoration = true;
+    obu_sequence_header_->use_128x128_superblock = use_128x128_superblock;
+    ASSERT_TRUE(ObuParseLoopRestorationParameters());
+    VerifyLoopRestorationParameters(gold);
+
+    gold.type[0] = gold.type[1] = kLoopRestorationTypeWiener;
+    gold.unit_size_log2[0] = gold.unit_size_log2[1] = gold.unit_size_log2[2] =
+        use_128x128_superblock ? 8 : 7;
+    data.SetLiteral(0, 2, gold.type[0]);  // lr_type.
+    data.SetLiteral(2, 2, gold.type[0]);  // lr_type.
+    data.AppendBit(1);                    // lr_unit_shift.
+    if (!use_128x128_superblock) {
+      data.AppendBit(0);  // lr_unit_extra_shift.
+    }
+
+    ASSERT_TRUE(Init(data.GenerateData()));
+    obu_sequence_header_->enable_restoration = true;
+    obu_sequence_header_->use_128x128_superblock = use_128x128_superblock;
+    ASSERT_TRUE(ObuParseLoopRestorationParameters());
+    VerifyLoopRestorationParameters(gold);
+
+    if (!use_128x128_superblock) {
+      gold.unit_size_log2[0] = gold.unit_size_log2[1] = gold.unit_size_log2[2] =
+          8;
+      data.SetBit(7, 1);  // lr_unit_extra_shift.
+
+      ASSERT_TRUE(Init(data.GenerateData()));
+      obu_sequence_header_->enable_restoration = true;
+      obu_sequence_header_->use_128x128_superblock = use_128x128_superblock;
+      ASSERT_TRUE(ObuParseLoopRestorationParameters());
+      VerifyLoopRestorationParameters(gold);
+    }
+
+    gold.unit_size_log2[1] = gold.unit_size_log2[2] = 7;
+    data.AppendBit(1);  // lr_uv_shift.
+
+    ASSERT_TRUE(Init(data.GenerateData()));
+    obu_sequence_header_->enable_restoration = true;
+    obu_sequence_header_->use_128x128_superblock = use_128x128_superblock;
+    obu_sequence_header_->color_config.subsampling_x = 1;
+    obu_sequence_header_->color_config.subsampling_y = 1;
+    ASSERT_TRUE(ObuParseLoopRestorationParameters());
+    VerifyLoopRestorationParameters(gold);
+  }
+}
+
+TEST_F(ObuParserTest, TxModeSyntax) {
+  BytesAndBits data;
+  data.AppendBit(1);  // tx_mode_select.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  ASSERT_TRUE(ObuParseTxModeSyntax());
+  EXPECT_EQ(kTxModeSelect, obu_->frame_header().tx_mode);
+
+  data.SetBit(0, 0);  // tx_mode_select.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  ASSERT_TRUE(ObuParseTxModeSyntax());
+  EXPECT_EQ(kTxModeLargest, obu_->frame_header().tx_mode);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->coded_lossless = true;
+  ASSERT_TRUE(ObuParseTxModeSyntax());
+  EXPECT_EQ(kTxModeOnly4x4, obu_->frame_header().tx_mode);
+}
+
+TEST_F(ObuParserTest, FrameReferenceModeSyntax) {
+  BytesAndBits data;
+  data.AppendBit(0);  // dummy.
+
+  ASSERT_TRUE(ParseFrameReferenceModeSyntax(data.GenerateData(), kFrameKey));
+  EXPECT_FALSE(obu_->frame_header().reference_mode_select);
+
+  data.SetBit(0, 1);  // reference_mode_select.
+
+  ASSERT_TRUE(ParseFrameReferenceModeSyntax(data.GenerateData(), kFrameInter));
+  EXPECT_TRUE(obu_->frame_header().reference_mode_select);
+}
+
+TEST_F(ObuParserTest, SkipModeParameters) {
+  BytesAndBits data;
+  data.AppendBit(1);  // skip_mode_present.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameKey;
+  ASSERT_FALSE(ObuIsSkipModeAllowed());
+  ASSERT_TRUE(ObuParseSkipModeParameters());
+  EXPECT_FALSE(obu_->frame_header().skip_mode_present);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameInter;
+  obu_frame_header_->reference_mode_select = true;
+  ASSERT_FALSE(ObuIsSkipModeAllowed());
+  ASSERT_TRUE(ObuParseSkipModeParameters());
+  EXPECT_FALSE(obu_->frame_header().skip_mode_present);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameInter;
+  obu_frame_header_->reference_mode_select = true;
+  obu_sequence_header_->enable_order_hint = true;
+  obu_sequence_header_->order_hint_bits = 7;
+  obu_sequence_header_->order_hint_shift_bits =
+      Mod32(32 - obu_sequence_header_->order_hint_bits);
+  ASSERT_FALSE(ObuIsSkipModeAllowed());
+  ASSERT_TRUE(ObuParseSkipModeParameters());
+  EXPECT_FALSE(obu_->frame_header().skip_mode_present);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameInter;
+  obu_frame_header_->reference_mode_select = true;
+  obu_frame_header_->order_hint = 1;
+  decoder_state_.order_hint = 1;
+  obu_sequence_header_->enable_order_hint = true;
+  obu_sequence_header_->order_hint_bits = 7;
+  obu_sequence_header_->order_hint_shift_bits =
+      Mod32(32 - obu_sequence_header_->order_hint_bits);
+  ASSERT_FALSE(ObuIsSkipModeAllowed());
+  ASSERT_TRUE(ObuParseSkipModeParameters());
+  EXPECT_FALSE(obu_->frame_header().skip_mode_present);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+    obu_frame_header_->reference_frame_index[i] = i;
+    decoder_state_.reference_order_hint[i] = i;
+  }
+  obu_frame_header_->frame_type = kFrameInter;
+  obu_frame_header_->reference_mode_select = true;
+  obu_frame_header_->order_hint = 1;
+  decoder_state_.order_hint = 1;
+  obu_sequence_header_->enable_order_hint = true;
+  obu_sequence_header_->order_hint_bits = 7;
+  obu_sequence_header_->order_hint_shift_bits =
+      Mod32(32 - obu_sequence_header_->order_hint_bits);
+  ASSERT_TRUE(ObuIsSkipModeAllowed());
+  ASSERT_TRUE(ObuParseSkipModeParameters());
+  EXPECT_TRUE(obu_->frame_header().skip_mode_present);
+}
+
+TEST_F(ObuParserTest, AllowWarpedMotion) {
+  BytesAndBits data;
+  data.AppendBit(0xff);  // dummy.
+
+  // IsIntraFrame is true, so nothing will be read.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameKey;
+  obu_frame_header_->error_resilient_mode = false;
+  obu_sequence_header_->enable_warped_motion = true;
+  ASSERT_TRUE(ObuReadAllowWarpedMotion());
+  EXPECT_FALSE(obu_->frame_header().allow_warped_motion);
+
+  // error_resilient_mode is true, so nothing will be read.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameInter;
+  obu_frame_header_->error_resilient_mode = true;
+  obu_sequence_header_->enable_warped_motion = true;
+  ASSERT_TRUE(ObuReadAllowWarpedMotion());
+  EXPECT_FALSE(obu_->frame_header().allow_warped_motion);
+
+  // enable_warped_motion is false, so nothing will be read.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameInter;
+  obu_frame_header_->error_resilient_mode = false;
+  obu_sequence_header_->enable_warped_motion = false;
+  ASSERT_TRUE(ObuReadAllowWarpedMotion());
+  EXPECT_FALSE(obu_->frame_header().allow_warped_motion);
+
+  // allow_warped_motion will be read and equal to true.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameInter;
+  obu_frame_header_->error_resilient_mode = false;
+  obu_sequence_header_->enable_warped_motion = true;
+  ASSERT_TRUE(ObuReadAllowWarpedMotion());
+  EXPECT_TRUE(obu_->frame_header().allow_warped_motion);
+}
+
+TEST_F(ObuParserTest, GlobalMotionParameters) {
+  BytesAndBits data;
+  data.AppendBit(0);  // dummy.
+  std::array<GlobalMotion, kNumReferenceFrameTypes> gold;
+  for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) {
+    gold[i].type = kGlobalMotionTransformationTypeIdentity;
+    for (int j = 0; j < 6; ++j) {
+      gold[i].params[j] = (j % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+    }
+  }
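+  // Note on the loop above: j % 3 == 2 selects params[2] and params[5], which
+  // (per the warp model layout, as the identity type suggests) hold the
+  // diagonal of the model's matrix part, so |gold| starts as the identity
+  // transform with 1.0 encoded as 1 << kWarpedModelPrecisionBits and all other
+  // parameters zero.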
+
+  ASSERT_TRUE(ParseGlobalMotionParameters(data.GenerateData(), kFrameKey));
+  VerifyGlobalMotionParameters(gold);
+
+  data.Clear();
+  for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) {
+    // is_global=1; is_rot_zoom=1; parameter_values;
+    data.AppendBytes(kDefaultGlobalMotionParametersRotZoom);
+
+    // Magic numbers based on kDefaultGlobalMotionParametersRotZoom.
+    gold[i].type = kGlobalMotionTransformationTypeRotZoom;
+    gold[i].params[0] = -73728;
+    gold[i].params[1] = -23552;
+    gold[i].params[2] = 65952;
+    gold[i].params[3] = -62;
+    gold[i].params[4] = 62;
+    gold[i].params[5] = 65952;
+  }
+
+  ASSERT_TRUE(ParseGlobalMotionParameters(data.GenerateData(), kFrameInter));
+  VerifyGlobalMotionParameters(gold);
+
+  data.Clear();
+  for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) {
+    // This bit is not part of the hex string because it would make the whole
+    // string not align to 8 bits. It is appended separately so that the rest
+    // can remain a byte-aligned magic hex string.
+    data.AppendBit(1);  // is_global.
+    // is_rot_zoom=0; is_translation=0; parameter_values;
+    data.AppendBytes(kDefaultGlobalMotionParametersAffine);
+
+    // Magic numbers based on kDefaultGlobalMotionParametersAffine.
+    gold[i].type = kGlobalMotionTransformationTypeAffine;
+    gold[i].params[4] = -62;
+  }
+
+  ASSERT_TRUE(ParseGlobalMotionParameters(data.GenerateData(), kFrameInter));
+  VerifyGlobalMotionParameters(gold);
+}
+
+TEST_F(ObuParserTest, FilmGrainParameters) {
+  BytesAndBits data;
+  data.AppendBit(0);  // dummy.
+
+  // Test film grain not present.
+  FilmGrainParams gold = {};
+  ObuSequenceHeader sequence_header = {};
+  sequence_header.film_grain_params_present = false;
+  ObuFrameHeader frame_header = {};
+  ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+                                       frame_header));
+  VerifyFilmGrainParameters(gold);
+
+  // Test if show_frame = false and showable_frame = false.
+  data.Clear();
+  gold = {};
+  sequence_header.film_grain_params_present = true;
+  frame_header.show_frame = false;
+  frame_header.showable_frame = false;
+  ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+                                       frame_header));
+  VerifyFilmGrainParameters(gold);
+
+  // Test if apply_grain = false.
+  data.Clear();
+  gold = {};
+  sequence_header.film_grain_params_present = true;
+  frame_header.show_frame = true;
+  frame_header.showable_frame = true;
+  data.AppendBit(0);
+  ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+                                       frame_header));
+  VerifyFilmGrainParameters(gold);
+
+  // Test if update_grain = false.
+  data.Clear();
+  gold = {};
+  sequence_header.film_grain_params_present = true;
+  frame_header.show_frame = true;
+  frame_header.showable_frame = true;
+  frame_header.frame_type = kFrameInter;
+  for (auto& index : frame_header.reference_frame_index) {
+    index = 1;
+  }
+  data.AppendBit(1);
+  gold.apply_grain = true;
+  data.AppendLiteral(16, 8);
+  gold.grain_seed = 8;
+  data.AppendBit(0);
+  gold.update_grain = false;
+  data.AppendLiteral(3, 1);
+  gold.reference_index = 1;
+  // Set up decoder_state_ with a previous frame containing saved film grain
+  // parameters.
+  decoder_state_.reference_frame[1] = buffer_pool_->GetFreeBuffer();
+  EXPECT_NE(decoder_state_.reference_frame[1], nullptr);
+  FilmGrainParams prev_grain_params = {};
+  prev_grain_params.apply_grain = true;
+  prev_grain_params.grain_seed = 11;
+  prev_grain_params.update_grain = true;
+  decoder_state_.reference_frame[1]->set_film_grain_params(prev_grain_params);
+  ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+                                       frame_header));
+  VerifyFilmGrainParameters(gold);
+
+  // Test if update_grain = true, is_monochrome = true.
+  data.Clear();
+  gold = {};
+  frame_header.frame_type = kFrameKey;
+  for (auto& index : frame_header.reference_frame_index) {
+    index = 0;
+  }
+  data.AppendBit(1);
+  gold.apply_grain = true;
+  data.AppendLiteral(16, 8);
+  gold.grain_seed = 8;
+  gold.update_grain = true;
+  data.AppendLiteral(4, 10);
+  gold.num_y_points = 10;
+  for (int i = 0; i < gold.num_y_points; ++i) {
+    data.AppendLiteral(8, 2 * i);
+    gold.point_y_value[i] = 2 * i;
+    data.AppendLiteral(8, i);
+    gold.point_y_scaling[i] = i;
+  }
+  sequence_header.color_config.is_monochrome = true;
+  gold.chroma_scaling_from_luma = false;
+  gold.num_u_points = 0;
+  gold.num_v_points = 0;
+  data.AppendLiteral(2, 3);
+  gold.chroma_scaling = 11;
+  data.AppendLiteral(2, 1);
+  gold.auto_regression_coeff_lag = 1;
+  const int num_pos_luma =
+      2 * gold.auto_regression_coeff_lag * (gold.auto_regression_coeff_lag + 1);
+  for (int i = 0; i < num_pos_luma; ++i) {
+    data.AppendLiteral(8, i + 128);
+    gold.auto_regression_coeff_y[i] = i;
+  }
+  data.AppendLiteral(2, 0);
+  gold.auto_regression_shift = 6;
+  data.AppendLiteral(2, 1);
+  gold.grain_scale_shift = 1;
+  data.AppendBit(1);
+  gold.overlap_flag = true;
+  data.AppendBit(0);
+  gold.clip_to_restricted_range = false;
+  ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+                                       frame_header));
+  ASSERT_TRUE(
+      obu_->frame_header().frame_type == kFrameInter ||
+      obu_->frame_header().film_grain_params.update_grain);  // a implies b.
+  VerifyFilmGrainParameters(gold);
+
+  // Test if update_grain = true, is_monochrome = false.
+  data.Clear();
+  gold = {};
+  frame_header.frame_type = kFrameKey;
+  data.AppendBit(1);
+  gold.apply_grain = true;
+  data.AppendLiteral(16, 8);
+  gold.grain_seed = 8;
+  gold.update_grain = true;
+  data.AppendLiteral(4, 10);
+  gold.num_y_points = 10;
+  for (int i = 0; i < gold.num_y_points; ++i) {
+    data.AppendLiteral(8, 2 * i);
+    gold.point_y_value[i] = 2 * i;
+    data.AppendLiteral(8, i);
+    gold.point_y_scaling[i] = i;
+  }
+  sequence_header.color_config.is_monochrome = false;
+  data.AppendBit(0);
+  gold.chroma_scaling_from_luma = false;
+  data.AppendLiteral(4, 5);
+  gold.num_u_points = 5;
+  for (int i = 0; i < gold.num_u_points; ++i) {
+    data.AppendLiteral(8, 2 * i + 1);
+    gold.point_u_value[i] = 2 * i + 1;
+    data.AppendLiteral(8, i);
+    gold.point_u_scaling[i] = i;
+  }
+  data.AppendLiteral(4, 3);
+  gold.num_v_points = 3;
+  for (int i = 0; i < gold.num_v_points; ++i) {
+    data.AppendLiteral(8, i);
+    gold.point_v_value[i] = i;
+    data.AppendLiteral(8, i + 1);
+    gold.point_v_scaling[i] = i + 1;
+  }
+  data.AppendLiteral(2, 3);
+  gold.chroma_scaling = 11;
+  data.AppendLiteral(2, 1);
+  gold.auto_regression_coeff_lag = 1;
+  const int num_pos_luma2 =
+      2 * gold.auto_regression_coeff_lag * (gold.auto_regression_coeff_lag + 1);
+  for (int i = 0; i < num_pos_luma2; ++i) {
+    data.AppendLiteral(8, i + 128);
+    gold.auto_regression_coeff_y[i] = i;
+  }
+  for (int i = 0; i < num_pos_luma2 + 1; ++i) {
+    data.AppendLiteral(8, i);
+    gold.auto_regression_coeff_u[i] = i - 128;
+  }
+  for (int i = 0; i < num_pos_luma2 + 1; ++i) {
+    data.AppendLiteral(8, i);
+    gold.auto_regression_coeff_v[i] = i - 128;
+  }
+  data.AppendLiteral(2, 0);
+  gold.auto_regression_shift = 6;
+  data.AppendLiteral(2, 1);
+  gold.grain_scale_shift = 1;
+  data.AppendLiteral(8, 2);
+  gold.u_multiplier = -126;
+  data.AppendLiteral(8, 1);
+  gold.u_luma_multiplier = -127;
+  data.AppendLiteral(9, 3);
+  gold.u_offset = -253;
+  data.AppendLiteral(8, 3);
+  gold.v_multiplier = -125;
+  data.AppendLiteral(8, 2);
+  gold.v_luma_multiplier = -126;
+  data.AppendLiteral(9, 1);
+  gold.v_offset = -255;
+  data.AppendBit(1);
+  gold.overlap_flag = true;
+  data.AppendBit(0);
+  gold.clip_to_restricted_range = false;
+  ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+                                       frame_header));
+  ASSERT_TRUE(
+      obu_->frame_header().frame_type == kFrameInter ||
+      obu_->frame_header().film_grain_params.update_grain);  // a implies b.
+  VerifyFilmGrainParameters(gold);
+}
+
+TEST_F(ObuParserTest, TileInfoSyntax) {
+  BytesAndBits data;
+  TileInfo gold;
+  memset(&gold, 0, sizeof(gold));
+
+  gold.uniform_spacing = true;
+  gold.tile_columns_log2 = 1;
+  gold.tile_columns = 2;
+  gold.tile_rows_log2 = 1;
+  gold.tile_rows = 2;
+  gold.tile_count = 4;
+  gold.tile_column_start[1] = 64;
+  gold.tile_column_start[2] = 88;
+  gold.tile_row_start[1] = 64;
+  gold.tile_row_start[2] = 72;
+  gold.context_update_id = 3;
+  gold.tile_size_bytes = 4;
+  data.AppendBit(static_cast<uint8_t>(gold.uniform_spacing));
+  data.AppendBit(1);  // increment_tile_cols_log2.
+  data.AppendBit(0);  // increment_tile_cols_log2.
+  data.AppendBit(1);  // increment_tile_rows_log2.
+  data.AppendBit(0);  // increment_tile_rows_log2.
+  data.AppendBit(1);  // context update id, columns_log2+rows_log2 bits
+  data.AppendBit(1);
+  data.AppendLiteral(2, gold.tile_size_bytes - 1);
+
+  ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 88, 72, true));
+  VerifyTileInfoParameters(gold);
+
+  gold.uniform_spacing = false;
+  gold.tile_column_width_in_superblocks[0] = 2;
+  gold.tile_column_width_in_superblocks[1] = 1;
+  gold.tile_row_height_in_superblocks[0] = 2;
+  gold.tile_row_height_in_superblocks[1] = 1;
+
+  data.SetBit(0, static_cast<uint8_t>(gold.uniform_spacing));
+  // The next 4 bits remain the same except now they represent f(w - 1) and
+  // extra_bit in DecodeUniform. All the subsequent bits are unchanged and
+  // represent the same thing as above.
+
+  ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 88, 72, true));
+  VerifyTileInfoParameters(gold);
+
+  // No tiles.
+  memset(&gold, 0, sizeof(gold));
+  gold.uniform_spacing = true;
+  gold.tile_columns = 1;
+  gold.tile_rows = 1;
+  gold.tile_count = 1;
+  gold.tile_column_start[1] = 88;
+  gold.tile_row_start[1] = 72;
+  data.Clear();
+  data.AppendBit(static_cast<uint8_t>(gold.uniform_spacing));
+  data.AppendBit(0);  // tile_cols_log2.
+  data.AppendBit(0);  // tile_rows_log2.
+
+  ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 88, 72, true));
+  VerifyTileInfoParameters(gold);
+
+  // 64x64 superblocks. No tiles.
+  gold.tile_column_start[1] = 640;
+  gold.tile_row_start[1] = 360;
+
+  ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 640, 360, false));
+  VerifyTileInfoParameters(gold);
+}
+
+TEST_F(ObuParserTest, MetadataUnknownType) {
+  BytesAndBits data;
+  // The metadata_type 10 is a user private value (6-31).
+  data.AppendLiteral(8, 10);  // metadata_type.
+  // The Note in Section 5.8.1 says "Decoders should ignore the entire OBU if
+  // they do not understand the metadata_type."
+  ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, MetadataHdrCll) {
+  BytesAndBits data;
+  ObuMetadataHdrCll gold;
+  gold.max_cll = 25;
+  gold.max_fall = 100;
+
+  data.AppendLiteral(8, kMetadataTypeHdrContentLightLevel);
+  data.AppendLiteral(16, gold.max_cll);
+  data.AppendLiteral(16, gold.max_fall);
+
+  ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+  VerifyMetadataHdrCll(gold);
+}
+
+TEST_F(ObuParserTest, MetadataHdrMdcv) {
+  BytesAndBits data;
+  ObuMetadataHdrMdcv gold;
+  for (int i = 0; i < 3; ++i) {
+    gold.primary_chromaticity_x[i] = 0;
+    gold.primary_chromaticity_y[i] = 0;
+  }
+  gold.white_point_chromaticity_x = 250;
+  gold.white_point_chromaticity_y = 2500;
+  gold.luminance_max = 6000;
+  gold.luminance_min = 3000;
+
+  data.AppendLiteral(8, kMetadataTypeHdrMasteringDisplayColorVolume);
+  for (int i = 0; i < 3; ++i) {
+    data.AppendLiteral(16, gold.primary_chromaticity_x[i]);
+    data.AppendLiteral(16, gold.primary_chromaticity_y[i]);
+  }
+  data.AppendLiteral(16, gold.white_point_chromaticity_x);
+  data.AppendLiteral(16, gold.white_point_chromaticity_y);
+  data.AppendLiteral(32, gold.luminance_max);
+  data.AppendLiteral(32, gold.luminance_min);
+
+  ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+  VerifyMetadataHdrMdcv(gold);
+}
+
+TEST_F(ObuParserTest, MetadataScalability) {
+  BytesAndBits data;
+
+  data.AppendLiteral(8, kMetadataTypeScalability);
+  data.AppendLiteral(8, 0);  // scalability_mode_idc
+
+  ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, MetadataItutT35) {
+  BytesAndBits data;
+  ObuMetadataItutT35 gold;
+  gold.country_code = 0xA6;  // 1 0 1 0 0 1 1 0 Switzerland
+  DynamicBuffer<uint8_t> payload_bytes;
+  ASSERT_TRUE(payload_bytes.Resize(10));
+  gold.payload_bytes = payload_bytes.get();
+  for (int i = 0; i < 10; ++i) {
+    gold.payload_bytes[i] = 9 - i;
+  }
+  gold.payload_size = 10;
+
+  data.AppendLiteral(8, kMetadataTypeItutT35);
+  data.AppendLiteral(8, gold.country_code);
+  for (int i = 0; i < 10; ++i) {
+    data.AppendLiteral(8, 9 - i);
+  }
+  // For the kMetadataTypeItutT35 metadata type, we must include the trailing
+  // bit so that the end of the itu_t_t35_payload_bytes can be identified.
+  data.AppendLiteral(8, 0x80);
+  data.AppendLiteral(8, 0x00);
+  data.AppendLiteral(8, 0x00);
+
+  ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+  VerifyMetadataItutT35(gold);
+
+  gold.country_code = 0xFF;
+  gold.country_code_extension_byte = 10;
+
+  data.SetLiteral(8, 8, gold.country_code);
+  data.InsertLiteral(16, 8, gold.country_code_extension_byte);
+
+  ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+  VerifyMetadataItutT35(gold);
+}
+
+TEST_F(ObuParserTest, MetadataTimecode) {
+  BytesAndBits data;
+
+  data.AppendLiteral(8, kMetadataTypeTimecode);
+  data.AppendLiteral(5, 0);   // counting_type
+  data.AppendBit(1);          // full_timestamp_flag
+  data.AppendBit(0);          // discontinuity_flag
+  data.AppendBit(0);          // cnt_dropped_flag
+  data.AppendLiteral(9, 8);   // n_frames
+  data.AppendLiteral(6, 59);  // seconds_value
+  data.AppendLiteral(6, 59);  // minutes_value
+  data.AppendLiteral(5, 23);  // hours_value
+  data.AppendLiteral(5, 0);   // time_offset_length
+
+  ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, MetadataTimecodeInvalidSecondsValue) {
+  BytesAndBits data;
+
+  data.AppendLiteral(8, kMetadataTypeTimecode);
+  data.AppendLiteral(5, 0);   // counting_type
+  data.AppendBit(1);          // full_timestamp_flag
+  data.AppendBit(0);          // discontinuity_flag
+  data.AppendBit(0);          // cnt_dropped_flag
+  data.AppendLiteral(9, 8);   // n_frames
+  data.AppendLiteral(6, 60);  // seconds_value
+  data.AppendLiteral(6, 59);  // minutes_value
+  data.AppendLiteral(5, 23);  // hours_value
+  data.AppendLiteral(5, 0);   // time_offset_length
+
+  EXPECT_FALSE(ParseMetadata(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, MetadataTimecodeInvalidMinutesValue) {
+  BytesAndBits data;
+
+  data.AppendLiteral(8, kMetadataTypeTimecode);
+  data.AppendLiteral(5, 0);   // counting_type
+  data.AppendBit(1);          // full_timestamp_flag
+  data.AppendBit(0);          // discontinuity_flag
+  data.AppendBit(0);          // cnt_dropped_flag
+  data.AppendLiteral(9, 8);   // n_frames
+  data.AppendLiteral(6, 59);  // seconds_value
+  data.AppendLiteral(6, 60);  // minutes_value
+  data.AppendLiteral(5, 23);  // hours_value
+  data.AppendLiteral(5, 0);   // time_offset_length
+
+  EXPECT_FALSE(ParseMetadata(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, MetadataTimecodeInvalidHoursValue) {
+  BytesAndBits data;
+
+  data.AppendLiteral(8, kMetadataTypeTimecode);
+  data.AppendLiteral(5, 0);   // counting_type
+  data.AppendBit(1);          // full_timestamp_flag
+  data.AppendBit(0);          // discontinuity_flag
+  data.AppendBit(0);          // cnt_dropped_flag
+  data.AppendLiteral(9, 8);   // n_frames
+  data.AppendLiteral(6, 59);  // seconds_value
+  data.AppendLiteral(6, 59);  // minutes_value
+  data.AppendLiteral(5, 24);  // hours_value
+  data.AppendLiteral(5, 0);   // time_offset_length
+
+  EXPECT_FALSE(ParseMetadata(data.GenerateData()));
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter.h b/src/post_filter.h
new file mode 100644 (file)
index 0000000..a247075
--- /dev/null
@@ -0,0 +1,552 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_POST_FILTER_H_
+#define LIBGAV1_SRC_POST_FILTER_H_
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// This class applies in-loop filtering to each frame after it is
+// reconstructed. The in-loop filtering comprises all post processing filtering
+// for the reconstructed frame: the deblock filter, CDEF, SuperRes and loop
+// restoration.
+// Historically (for example, in libaom), "loop filter" refers to the deblock
+// filter. To avoid name conflicts, we call this class PostFilter (post
+// processing).
+// The in-loop post filtering order is:
+// deblock --> CDEF --> super resolution --> loop restoration.
+// When CDEF and super resolution are not used, deblock and restoration can be
+// combined so that the frame buffer is filtered only once.
+class PostFilter {
+ public:
+  // This class does not take ownership of the masks/restoration_info, but it
+  // may change their values.
+  //
+  // The overall flow of data in this class (for both single and multi-threaded
+  // cases) is as follows:
+  //   -> Input: |frame_buffer_|.
+  //   -> Initialize |source_buffer_|, |cdef_buffer_|, |superres_buffer_| and
+  //      |loop_restoration_buffer_|.
+  //   -> Deblocking:
+  //      * Input: |source_buffer_|
+  //      * Output: |source_buffer_|
+  //   -> CDEF:
+  //      * Input: |source_buffer_|
+  //      * Output: |cdef_buffer_|
+  //   -> SuperRes:
+  //      * Input: |cdef_buffer_|
+  //      * Output: |superres_buffer_|
+  //   -> Loop Restoration:
+  //      * Input: |superres_buffer_|
+  //      * Output: |loop_restoration_buffer_|.
+  //   -> Now |frame_buffer_| contains the filtered frame.
+  PostFilter(const ObuFrameHeader& frame_header,
+             const ObuSequenceHeader& sequence_header,
+             FrameScratchBuffer* frame_scratch_buffer, YuvBuffer* frame_buffer,
+             const dsp::Dsp* dsp, int do_post_filter_mask);
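+  // A note on |do_post_filter_mask|, inferred from the Do*() helpers below
+  // rather than from any normative definition: bit 0 enables deblocking,
+  // bit 1 CDEF, bit 2 SuperRes and bit 3 loop restoration. For example, a
+  // hypothetical caller that wants every filter applied would pass
+  //   0x01 | 0x02 | 0x04 | 0x08 == 0x0f.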
+
+  // Not copyable or movable.
+  PostFilter(const PostFilter&) = delete;
+  PostFilter& operator=(const PostFilter&) = delete;
+  PostFilter(PostFilter&&) = delete;
+  PostFilter& operator=(PostFilter&&) = delete;
+
+  // The overall function that applies all post processing filtering with
+  // multiple threads.
+  // * The filtering order is:
+  //   deblock --> CDEF --> super resolution --> loop restoration.
+  // * The output of each filter is the input for the following filter. A
+  //   special case is that loop restoration needs a few rows of the deblocked
+  //   frame and the entire cdef filtered frame:
+  //   deblock --> CDEF --> super resolution --> loop restoration.
+  //              |                                 ^
+  //              |                                 |
+  //              -----------> super resolution -----
+  // * Any of these filters could be present or absent.
+  // * |frame_buffer_| points to the decoded frame buffer. When
+  //   ApplyFilteringThreaded() is called, |frame_buffer_| is modified by each
+  //   of the filters as described below.
+  // Filter behavior (multi-threaded):
+  // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+  //            If cdef and loop restoration are both on, then 4 rows (as
+  //            specified by |kLoopRestorationBorderRows|) in every 64x64 block
+  //            are copied into |loop_restoration_border_|.
+  // * Cdef: In-place filtering. Uses the |source_buffer_| and |cdef_border_| as
+  //         the input and the output is written into |cdef_buffer_| (which is
+  //         the same as |source_buffer_|).
+  // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| and
+  //             |superres_line_buffer_| as the input and the output is written
+  //             into |superres_buffer_| (which is just |cdef_buffer_| with a
+  //             shift to the top).
+  // * Restoration: Near in-place filtering.
+  //                Uses the |superres_buffer_| and |loop_restoration_border_|
+  //                as the input and the output is written into
+  //                |loop_restoration_buffer_| (which is just |superres_buffer_|
+  //                with a shift to the left).
+  void ApplyFilteringThreaded();
+
+  // Does the overall post processing filter for one superblock row starting at
+  // |row4x4| with height 4*|sb4x4|. If |do_deblock| is false, deblocking filter
+  // will not be applied.
+  //
+  // Filter behavior (single-threaded):
+  // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+  //            If cdef and loop restoration are both on, then 4 rows (as
+  //            specified by |kLoopRestorationBorderRows|) in every 64x64 block
+  //            are copied into |loop_restoration_border_|.
+  // * Cdef: In-place filtering. The output is written into |cdef_buffer_|
+  //         (which is just |source_buffer_| with a shift to the top-left).
+  // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| as the input
+  //             and the output is written into |superres_buffer_| (which is
+  //             just |cdef_buffer_| with a shift to the top).
+  // * Restoration: Near in-place filtering.
+  //                Uses the |superres_buffer_| and |loop_restoration_border_|
+  //                as the input and the output is written into
+  //                |loop_restoration_buffer_| (which is just |superres_buffer_|
+  //                with a shift to the left or top-left).
+  // Returns the index of the last row whose post processing is complete and can
+  // be used for referencing.
+  int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row,
+                                        bool do_deblock);
+
+  // Apply deblocking filter in one direction (specified by |loop_filter_type|)
+  // for the superblock row starting at |row4x4_start| for columns starting from
+  // |column4x4_start| in increments of 16 (or 8 for chroma with subsampling)
+  // until the smallest multiple of 16 that is >= |column4x4_end| or until
+  // |frame_header_.columns4x4|, whichever is lower. This function must be
+  // called only if |DoDeblock()| returns true.
+  void ApplyDeblockFilter(LoopFilterType loop_filter_type, int row4x4_start,
+                          int column4x4_start, int column4x4_end, int sb4x4);
+
+  static bool DoCdef(const ObuFrameHeader& frame_header,
+                     int do_post_filter_mask) {
+    return (frame_header.cdef.bits > 0 ||
+            frame_header.cdef.y_primary_strength[0] > 0 ||
+            frame_header.cdef.y_secondary_strength[0] > 0 ||
+            frame_header.cdef.uv_primary_strength[0] > 0 ||
+            frame_header.cdef.uv_secondary_strength[0] > 0) &&
+           (do_post_filter_mask & 0x02) != 0;
+  }
+  bool DoCdef() const { return do_cdef_; }
+  // If the filter levels for the Y plane (0 for vertical, 1 for horizontal)
+  // are all zero, the deblock filter will not be applied.
+  static bool DoDeblock(const ObuFrameHeader& frame_header,
+                        uint8_t do_post_filter_mask) {
+    return (frame_header.loop_filter.level[0] > 0 ||
+            frame_header.loop_filter.level[1] > 0) &&
+           (do_post_filter_mask & 0x01) != 0;
+  }
+  bool DoDeblock() const { return do_deblock_; }
+
+  uint8_t GetZeroDeltaDeblockFilterLevel(int segment_id, int level_index,
+                                         ReferenceFrameType type,
+                                         int mode_id) const {
+    return deblock_filter_levels_[segment_id][level_index][type][mode_id];
+  }
+  // Computes the deblock filter levels using |delta_lf| and stores them in
+  // |deblock_filter_levels|.
+  void ComputeDeblockFilterLevels(
+      const int8_t delta_lf[kFrameLfCount],
+      uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+                                   [kNumReferenceFrameTypes][2]) const;
+  // Returns true if loop restoration will be performed for the given parameters
+  // and mask.
+  static bool DoRestoration(const LoopRestoration& loop_restoration,
+                            uint8_t do_post_filter_mask, int num_planes) {
+    if (num_planes == kMaxPlanesMonochrome) {
+      return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone &&
+             (do_post_filter_mask & 0x08) != 0;
+    }
+    return (loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone ||
+            loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone ||
+            loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone) &&
+           (do_post_filter_mask & 0x08) != 0;
+  }
+  bool DoRestoration() const { return do_restoration_; }
+
+  // Returns a pointer to the unfiltered buffer. This is used by the Tile class
+  // to determine where to write the output of the tile decoding process,
+  // taking in-place filtering offsets into consideration.
+  uint8_t* GetUnfilteredBuffer(int plane) { return source_buffer_[plane]; }
+  const YuvBuffer& frame_buffer() const { return frame_buffer_; }
+
+  // Returns true if SuperRes will be performed for the given frame header and
+  // mask.
+  static bool DoSuperRes(const ObuFrameHeader& frame_header,
+                         uint8_t do_post_filter_mask) {
+    return frame_header.width != frame_header.upscaled_width &&
+           (do_post_filter_mask & 0x04) != 0;
+  }
+  bool DoSuperRes() const { return do_superres_; }
+  LoopRestorationInfo* restoration_info() const { return restoration_info_; }
+  uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane,
+                           int row, int column) const {
+    return base_buffer + (row >> subsampling_y_[plane]) * stride +
+           ((column >> subsampling_x_[plane]) << pixel_size_log2_);
+  }
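+  // Worked example for GetBufferOffset(), with illustrative values only: for
+  // an 8-bit chroma plane with subsampling_x_ == subsampling_y_ == 1 and
+  // pixel_size_log2_ == 0, row == 8, column == 16 and stride == 640 yield
+  //   (8 >> 1) * 640 + ((16 >> 1) << 0) == 2560 + 8 == 2568
+  // bytes past |base_buffer|.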
+  uint8_t* GetSourceBuffer(Plane plane, int row4x4, int column4x4) const {
+    return GetBufferOffset(source_buffer_[plane], frame_buffer_.stride(plane),
+                           plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+  }
+  uint8_t* GetCdefBuffer(Plane plane, int row4x4, int column4x4) const {
+    return GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+                           plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+  }
+  uint8_t* GetSuperResBuffer(Plane plane, int row4x4, int column4x4) const {
+    return GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+                           plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+  }
+
+  template <typename Pixel>
+  static void ExtendFrame(Pixel* frame_start, int width, int height,
+                          ptrdiff_t stride, int left, int right, int top,
+                          int bottom);
+
+ private:
+  // The type of the HorizontalDeblockFilter and VerticalDeblockFilter member
+  // functions.
+  using DeblockFilter = void (PostFilter::*)(int row4x4_start, int row4x4_end,
+                                             int column4x4_start,
+                                             int column4x4_end);
+  // Functions common to all post filters.
+
+  // Extends the frame by setting each border pixel value to that of its
+  // closest pixel on the frame boundary.
+  void ExtendFrameBoundary(uint8_t* frame_start, int width, int height,
+                           ptrdiff_t stride, int left, int right, int top,
+                           int bottom) const;
+  // Extends the frame boundary for referencing if the frame will be saved as
+  // a reference frame.
+  void ExtendBordersForReferenceFrame();
+  // Copies the deblocked pixels needed for loop restoration.
+  void CopyDeblockedPixels(Plane plane, int row4x4);
+  // Copies the border for one superblock row. If |for_loop_restoration| is
+  // true, then it assumes that the border extension is being performed for the
+  // input of the loop restoration process. If |for_loop_restoration| is false,
+  // then it assumes that the border extension is being performed for using the
+  // current frame as a reference frame. In this case, |progress_row_| is also
+  // updated.
+  void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
+                                      bool for_loop_restoration);
+  // Sets up the |loop_restoration_border_| for loop restoration.
+  // This is called when there is no CDEF filter. We copy rows from
+  // |superres_buffer_| and do the line extension.
+  void SetupLoopRestorationBorder(int row4x4_start);
+  // This is called when there is CDEF filter. We copy rows from
+  // |source_buffer_|, apply superres and do the line extension.
+  void SetupLoopRestorationBorder(int row4x4_start, int sb4x4);
+  // Returns true if we can perform border extension in-loop (i.e., without
+  // waiting until the entire frame is decoded). If intra block copy is
+  // allowed, we do in-loop border extension only if the upscaled_width is the
+  // same as 4 * columns4x4. Otherwise, we cannot do in-loop border extension
+  // since those pixels may be used by intra block copy.
+  bool DoBorderExtensionInLoop() const {
+    return !frame_header_.allow_intrabc ||
+           frame_header_.upscaled_width ==
+               MultiplyBy4(frame_header_.columns4x4);
+  }
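+  // Illustrative example for DoBorderExtensionInLoop(): with
+  // columns4x4 == 160, in-loop extension is possible under intra block copy
+  // only if upscaled_width == 4 * 160 == 640, i.e. when SuperRes leaves the
+  // width unchanged.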
+  template <typename Pixel>
+  void CopyPlane(const Pixel* src, ptrdiff_t src_stride, int width, int height,
+                 Pixel* dst, ptrdiff_t dst_stride) {
+    assert(height > 0);
+    do {
+      memcpy(dst, src, width * sizeof(Pixel));
+      src += src_stride;
+      dst += dst_stride;
+    } while (--height != 0);
+  }
+
+  // Worker function used for multi-threaded implementation of Deblocking, CDEF
+  // and Loop Restoration.
+  using WorkerFunction = void (PostFilter::*)(std::atomic<int>* row4x4_atomic);
+  // Schedules |worker| jobs to the |thread_pool_|, runs them in the calling
+  // thread and returns once all the jobs are completed.
+  void RunJobs(WorkerFunction worker);
+
+  // Functions for the Deblocking filter.
+
+  bool GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+                                          uint8_t* level, int* step,
+                                          int* filter_length) const;
+  void GetHorizontalDeblockFilterEdgeInfoUV(int row4x4, int column4x4,
+                                            uint8_t* level_u, uint8_t* level_v,
+                                            int* step,
+                                            int* filter_length) const;
+  bool GetVerticalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+                                        BlockParameters* const* bp_ptr,
+                                        uint8_t* level, int* step,
+                                        int* filter_length) const;
+  void GetVerticalDeblockFilterEdgeInfoUV(int column4x4,
+                                          BlockParameters* const* bp_ptr,
+                                          uint8_t* level_u, uint8_t* level_v,
+                                          int* step, int* filter_length) const;
+  void HorizontalDeblockFilter(int row4x4_start, int row4x4_end,
+                               int column4x4_start, int column4x4_end);
+  void VerticalDeblockFilter(int row4x4_start, int row4x4_end,
+                             int column4x4_start, int column4x4_end);
+  // HorizontalDeblockFilter and VerticalDeblockFilter must have the correct
+  // signature.
+  static_assert(std::is_same<decltype(&PostFilter::HorizontalDeblockFilter),
+                             DeblockFilter>::value,
+                "");
+  static_assert(std::is_same<decltype(&PostFilter::VerticalDeblockFilter),
+                             DeblockFilter>::value,
+                "");
+  // Worker function used for multi-threaded deblocking.
+  template <LoopFilterType loop_filter_type>
+  void DeblockFilterWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(
+      std::is_same<
+          decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>),
+          WorkerFunction>::value,
+      "");
+  static_assert(
+      std::is_same<
+          decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>),
+          WorkerFunction>::value,
+      "");
+
+  // Functions for the cdef filter.
+
+  // Copies the deblocked pixels necessary for use by the multi-threaded cdef
+  // implementation into |cdef_border_|.
+  void SetupCdefBorder(int row4x4);
+  // This function prepares the input source block for cdef filtering. The
+  // input source block contains a 12x12 block, with the inner 8x8 as the
+  // desired filter region. It pads the block with a large value if the 12x12
+  // block includes out-of-frame pixels. This achieves the required behavior
+  // defined in section 5.11.52 of the spec.
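+  // (This implies kCdefBorder == 2: the 8x8 filter region plus a 2-pixel
+  // border on every side gives 8 + 2 * 2 == 12.)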
+  template <typename Pixel>
+  void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4,
+                        int column4x4, uint16_t* cdef_source,
+                        ptrdiff_t cdef_stride, bool y_plane,
+                        const uint8_t border_columns[kMaxPlanes][256],
+                        bool use_border_columns);
+  // Applies cdef for one 64x64 block.
+  template <typename Pixel>
+  void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4,
+                           int block_height4x4, int row4x4_start,
+                           int column4x4_start,
+                           uint8_t border_columns[2][kMaxPlanes][256],
+                           bool use_border_columns[2][2]);
+  // Helper function used by ApplyCdefForOneSuperBlockRow to avoid some code
+  // duplication.
+  void ApplyCdefForOneSuperBlockRowHelper(
+      uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+      int row4x4, int block_height4x4);
+  // Applies CDEF filtering for the superblock row starting at |row4x4| with a
+  // height of 4*|sb4x4|.
+  void ApplyCdefForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row);
+  // Worker function used for multi-threaded CDEF.
+  void ApplyCdefWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(std::is_same<decltype(&PostFilter::ApplyCdefWorker),
+                             WorkerFunction>::value,
+                "");
+
+  // Functions for the SuperRes filter.
+
+  // Applies super resolution to |src| for |rows[plane]| rows of each plane. If
+  // |line_buffer_row| is larger than or equal to 0, one more row will be
+  // processed, with the line buffer indicated by |line_buffer_row| used as its
+  // source. If |dst_is_loop_restoration_border| is true, the |dst| pointers
+  // come from |loop_restoration_border_| and the strides will be populated
+  // from that buffer.
+  void ApplySuperRes(
+      const std::array<uint8_t*, kMaxPlanes>& src,
+      const std::array<int, kMaxPlanes>& rows, int line_buffer_row,
+      const std::array<uint8_t*, kMaxPlanes>& dst,
+      bool dst_is_loop_restoration_border = false);  // Section 7.16.
+  // Applies SuperRes for the superblock row starting at |row4x4| with a height
+  // of 4*|sb4x4|.
+  void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4,
+                                        bool is_last_row);
+  void ApplySuperResThreaded();
+
+  // Functions for the Loop Restoration filter.
+
+  // Notes about Loop Restoration:
+  // (1). The loop restoration processing unit size defaults to 64x64. Only
+  // when the remaining filtering area is smaller than 64x64 is the processing
+  // unit size the actual area size.
+  // For the U/V planes, it is (64 >> subsampling_x) x (64 >> subsampling_y).
+  // (2). The loop restoration unit size can be 64x64, 128x128 or 256x256 for
+  // the Y plane. The unit size for chroma can be the same or half, depending
+  // on subsampling. If either subsampling_x or subsampling_y is one, the unit
+  // size is halved on both the x and y sides.
+  // All loop restoration units have the same size for one plane.
+  // One loop restoration unit can contain multiple processing units, but they
+  // share the same set of loop restoration parameters.
+  // (3). Loop restoration has a row offset, kRestorationUnitOffset = 8. The
+  // height of the first row of loop restoration units and processing units is
+  // shrunk by the offset.
+  // (4). Loop restoration units wrap around the bottom and the right of the
+  // frame if the remaining area is small. The criterion is whether the number
+  // of remaining rows/columns is smaller than half of the loop restoration
+  // unit size.
+  // For example, if the frame size is 140x140 and the loop restoration unit
+  // size is 128x128, the size of the first loop restoration unit is
+  // 128x(128-8) = 128 columns x 120 rows.
+  // Since 140 - 120 < 128/2, the remaining 20 rows are folded into the loop
+  // restoration unit. Similarly, the remaining 12 columns are also folded into
+  // the current loop restoration unit. So even though the frame size is
+  // 140x140, there is only one loop restoration unit. If the processing unit
+  // is 64x64, then the sizes of the first row of processing units are 64x56,
+  // 64x56 and 12x56, respectively. The second row is 64x64, 64x64, 12x64.
+  // The third row is 64x20, 64x20, 12x20.
+
+  // |stride| is shared by |src_buffer| and |dst_buffer|.
+  template <typename Pixel>
+  void ApplyLoopRestorationForOneRow(const Pixel* src_buffer, ptrdiff_t stride,
+                                     Plane plane, int plane_height,
+                                     int plane_width, int y, int unit_row,
+                                     int current_process_unit_height,
+                                     int plane_unit_size, Pixel* dst_buffer);
+  // Applies loop restoration for the superblock row starting at |row4x4_start|
+  // with a height of 4*|sb4x4|.
+  template <typename Pixel>
+  void ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, int sb4x4);
+  // Helper function that calls the right variant of
+  // ApplyLoopRestorationForOneSuperBlockRow based on the bitdepth.
+  void ApplyLoopRestoration(int row4x4_start, int sb4x4);
+  // Worker function used for multithreaded Loop Restoration.
+  void ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(std::is_same<decltype(&PostFilter::ApplyLoopRestorationWorker),
+                             WorkerFunction>::value,
+                "");
+
+  // The lookup table for picking the deblock filter, according to deblock
+  // filter type.
+  const DeblockFilter deblock_filter_func_[2] = {
+      &PostFilter::VerticalDeblockFilter, &PostFilter::HorizontalDeblockFilter};
+  const ObuFrameHeader& frame_header_;
+  const LoopRestoration& loop_restoration_;
+  const dsp::Dsp& dsp_;
+  const int8_t bitdepth_;
+  const int8_t subsampling_x_[kMaxPlanes];
+  const int8_t subsampling_y_[kMaxPlanes];
+  const int8_t planes_;
+  const int pixel_size_log2_;
+  const uint8_t* const inner_thresh_;
+  const uint8_t* const outer_thresh_;
+  const bool needs_chroma_deblock_;
+  const bool do_cdef_;
+  const bool do_deblock_;
+  const bool do_restoration_;
+  const bool do_superres_;
+  // This stores the deblocking filter levels assuming that the delta is zero.
+  // This will be used by all superblocks whose delta is zero (without having to
+  // recompute them). The dimensions (in order) are: segment_id, level_index
+  // (based on plane and direction), reference_frame and mode_id.
+  uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+                                [kNumReferenceFrameTypes][2];
+  // Stores the SuperRes info for the frame.
+  struct {
+    int upscaled_width;
+    int initial_subpixel_x;
+    int step;
+  } super_res_info_[kMaxPlanes];
+  const Array2D<int8_t>& cdef_index_;
+  const Array2D<uint8_t>& cdef_skip_;
+  const Array2D<TransformSize>& inter_transform_sizes_;
+  LoopRestorationInfo* const restoration_info_;
+  uint8_t* const superres_coefficients_[kNumPlaneTypes];
+  // Line buffer used by multi-threaded ApplySuperRes().
+  // In the multi-threaded case, this buffer will store the last downscaled row
+  // input of each thread to avoid overwrites by the first upscaled row output
+  // of the thread below it.
+  YuvBuffer& superres_line_buffer_;
+  const BlockParametersHolder& block_parameters_;
+  // Frame buffer to hold cdef filtered frame.
+  YuvBuffer cdef_filtered_buffer_;
+  // Input frame buffer.
+  YuvBuffer& frame_buffer_;
+  // A view into |frame_buffer_| that points to the input and output of the
+  // deblocking process.
+  uint8_t* source_buffer_[kMaxPlanes];
+  // A view into |frame_buffer_| that points to the output of the CDEF filtered
+  // planes (to facilitate in-place CDEF filtering).
+  uint8_t* cdef_buffer_[kMaxPlanes];
+  // A view into |frame_buffer_| that points to the planes after the SuperRes
+  // filter is applied (to facilitate in-place SuperRes).
+  uint8_t* superres_buffer_[kMaxPlanes];
+  // A view into |frame_buffer_| that points to the output of the Loop Restored
+  // planes (to facilitate in-place Loop Restoration).
+  uint8_t* loop_restoration_buffer_[kMaxPlanes];
+  YuvBuffer& cdef_border_;
+  // Buffer used to store the border pixels that are necessary for loop
+  // restoration. This buffer will store 4 rows for every 64x64 block (4 rows
+  // for every 32x32 for chroma with subsampling). The indices of the rows that
+  // are stored are specified in |kLoopRestorationBorderRows|. The first 4 rows
+  // of this buffer are never populated and never used.
+  // This buffer is used only when both of the following conditions are true:
+  //   (1). Loop Restoration is on.
+  //   (2). Cdef is on, or multi-threading is enabled for post filter.
+  YuvBuffer& loop_restoration_border_;
+  ThreadPool* const thread_pool_;
+
+  // Tracks the progress of the post filters.
+  int progress_row_ = -1;
+
+  // A block buffer to hold the input that is converted to uint16_t before
+  // cdef filtering. Only used in the single-threaded case. The Y plane is
+  // processed separately; the U and V planes are processed together. So it is
+  // sufficient for this buffer to accommodate 2 planes at a time.
+  uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+
+  template <int bitdepth, typename Pixel>
+  friend class PostFilterSuperResTest;
+
+  template <int bitdepth, typename Pixel>
+  friend class PostFilterHelperFuncTest;
+};
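+// A minimal usage sketch for this class (a hypothetical caller; all names
+// other than PostFilter's own members are assumptions):
+//   PostFilter post_filter(frame_header, sequence_header, &scratch_buffer,
+//                          &frame_buffer, dsp, /*do_post_filter_mask=*/0x0f);
+//   if (threaded) {
+//     post_filter.ApplyFilteringThreaded();
+//   } else {
+//     // Called once per superblock row during decoding.
+//     post_filter.ApplyFilteringForOneSuperBlockRow(row4x4, sb4x4,
+//                                                   is_last_row, do_deblock);
+//   }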
+
+extern template void PostFilter::ExtendFrame<uint8_t>(uint8_t* frame_start,
+                                                      int width, int height,
+                                                      ptrdiff_t stride,
+                                                      int left, int right,
+                                                      int top, int bottom);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void PostFilter::ExtendFrame<uint16_t>(uint16_t* frame_start,
+                                                       int width, int height,
+                                                       ptrdiff_t stride,
+                                                       int left, int right,
+                                                       int top, int bottom);
+#endif
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_POST_FILTER_H_
diff --git a/src/post_filter/cdef.cc b/src/post_filter/cdef.cc
new file mode 100644 (file)
index 0000000..ced4096
--- /dev/null
@@ -0,0 +1,693 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <cassert>
+
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kStep64x64 = 16;  // =64/4.
+constexpr int kCdefSkip = 8;
+
+constexpr uint8_t kCdefUvDirection[2][2][8] = {
+    {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
+    {{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};
+
+constexpr int kCdefBorderRows[2][4] = {{0, 1, 62, 63}, {0, 1, 30, 31}};
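+// kCdefBorderRows lists, per subsampling_y, the rows that SetupCdefBorder()
+// saves into |cdef_border_| for the multi-threaded cdef pass: rows 0, 1, 62
+// and 63 of each 64-row luma block, or 0, 1, 30 and 31 of each 32-row
+// subsampled chroma block.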
+
+template <typename Pixel>
+void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
+                    bool is_frame_left, bool is_frame_right,
+                    uint16_t* const dst, const Pixel* left_border = nullptr) {
+  if (sizeof(src[0]) == sizeof(dst[0])) {
+    if (is_frame_left) {
+      Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
+    } else if (left_border == nullptr) {
+      memcpy(dst - kCdefBorder, src - kCdefBorder,
+             kCdefBorder * sizeof(dst[0]));
+    } else {
+      memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
+    }
+    memcpy(dst, src, block_width * sizeof(dst[0]));
+    if (is_frame_right) {
+      Memset(dst + block_width, kCdefLargeValue,
+             unit_width + kCdefBorder - block_width);
+    } else {
+      memcpy(dst + block_width, src + block_width,
+             (unit_width + kCdefBorder - block_width) * sizeof(dst[0]));
+    }
+    return;
+  }
+  if (is_frame_left) {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = static_cast<uint16_t>(kCdefLargeValue);
+    }
+  } else if (left_border == nullptr) {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = src[x];
+    }
+  } else {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = left_border[x + kCdefBorder];
+    }
+  }
+  for (int x = 0; x < block_width; ++x) {
+    dst[x] = src[x];
+  }
+  for (int x = block_width; x < unit_width + kCdefBorder; ++x) {
+    dst[x] = is_frame_right ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
+  }
+}
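+// Note on CopyRowForCdef() above: when Pixel is uint16_t, src and dst share an
+// element size, so whole runs are copied with memcpy; when Pixel is uint8_t,
+// each pixel is widened to uint16_t individually, hence the element-wise
+// loops.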
+
+// GCC 13.x will report a false positive from the call to
+// ApplyCdefForOneSuperBlockRowHelper() with a nullptr in
+// ApplyCdefForOneSuperBlockRow(). The call to CopyPixels() in
+// ApplyCdefForOneUnit() is only made when thread_pool_ != nullptr and
+// border_columns[][] is a valid pointer.
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
+// For |height| rows, copy |width| pixels of size |pixel_size| from |src| to
+// |dst|.
+void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
+                int dst_stride, int width, int height, size_t pixel_size) {
+  assert(src != nullptr);
+  assert(dst != nullptr);
+  assert(height > 0);
+  int y = height;
+  do {
+    memcpy(dst, src, width * pixel_size);
+    src += src_stride;
+    dst += dst_stride;
+  } while (--y != 0);
+}
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
+}  // namespace
+
+void PostFilter::SetupCdefBorder(int row4x4) {
+  assert(row4x4 >= 0);
+  assert(DoCdef());
+  int plane = kPlaneY;
+  do {
+    const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+    const ptrdiff_t dst_stride = cdef_border_.stride(plane);
+    const int row_offset = DivideBy4(row4x4);
+    const int num_pixels = SubsampledValue(
+        MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
+    const int row_width = num_pixels << pixel_size_log2_;
+    const int plane_height = SubsampledValue(MultiplyBy4(frame_header_.rows4x4),
+                                             subsampling_y_[plane]);
+    for (int i = 0; i < 4; ++i) {
+      const int row = kCdefBorderRows[subsampling_y_[plane]][i];
+      const int absolute_row =
+          (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+      if (absolute_row >= plane_height) break;
+      const uint8_t* src =
+          GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+          row * src_stride;
+      uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
+      memcpy(dst, src, row_width);
+    }
+  } while (++plane < planes_);
+}
+
+template <typename Pixel>
+void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
+                                  int row4x4, int column4x4,
+                                  uint16_t* cdef_source, ptrdiff_t cdef_stride,
+                                  const bool y_plane,
+                                  const uint8_t border_columns[kMaxPlanes][256],
+                                  bool use_border_columns) {
+  assert(y_plane || planes_ == kMaxPlanes);
+  const int max_planes = y_plane ? 1 : kMaxPlanes;
+  const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
+  const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
+  const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
+  const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
+  const int plane_width = SubsampledValue(frame_header_.width, subsampling_x);
+  const int plane_height = SubsampledValue(frame_header_.height, subsampling_y);
+  const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
+  const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
+  // unit_width and unit_height are the same as block_width and block_height
+  // unless the block reaches the frame boundary, where block_width < 64 or
+  // block_height < 64. unit_width and unit_height guarantee that we build
+  // blocks on a multiple of 8.
+  const int unit_width = Align(block_width, 8 >> subsampling_x);
+  const int unit_height = Align(block_height, 8 >> subsampling_y);
+  const bool is_frame_left = column4x4 == 0;
+  const bool is_frame_right = start_x + block_width >= plane_width;
+  const bool is_frame_top = row4x4 == 0;
+  const bool is_frame_bottom = start_y + block_height >= plane_height;
+  const int y_offset = is_frame_top ? 0 : kCdefBorder;
+  const int cdef_border_row_offset = DivideBy4(row4x4) - (is_frame_top ? 0 : 2);
+
+  for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
+    uint16_t* cdef_src = cdef_source + static_cast<int>(plane == kPlaneV) *
+                                           kCdefUnitSizeWithBorders *
+                                           kCdefUnitSizeWithBorders;
+    const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+    const Pixel* src_buffer =
+        reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
+        (start_y - y_offset) * src_stride + start_x;
+    const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
+    const Pixel* cdef_border =
+        (thread_pool_ == nullptr)
+            ? nullptr
+            : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
+                  cdef_border_row_offset * cdef_border_stride + start_x;
+
+    // All the copying code will use negative indices for populating the left
+    // border. So the starting point is set to kCdefBorder.
+    cdef_src += kCdefBorder;
+
+    // Copy the top 2 rows as follows:
+    // If is_frame_top is true, both the rows are set to kCdefLargeValue.
+    // Otherwise:
+    //   If multi-threaded filtering is off, the rows are copied from
+    //   |src_buffer|.
+    //   Otherwise, the rows are copied from |cdef_border|.
+    if (is_frame_top) {
+      for (int y = 0; y < kCdefBorder; ++y) {
+        Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+               unit_width + 2 * kCdefBorder);
+        cdef_src += cdef_stride;
+      }
+    } else {
+      const Pixel* top_border =
+          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+      const int top_border_stride =
+          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+      for (int y = 0; y < kCdefBorder; ++y) {
+        CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
+                       is_frame_right, cdef_src);
+        top_border += top_border_stride;
+        cdef_src += cdef_stride;
+        // We need to increment |src_buffer| and |cdef_border| in this loop to
+        // set them up for the subsequent loops below.
+        src_buffer += src_stride;
+        cdef_border += cdef_border_stride;
+      }
+    }
+
+    // Copy the body as follows:
+    // If multi-threaded filtering is off or if is_frame_bottom is true, all the
+    // rows are copied from |src_buffer|.
+    // Otherwise, the first |block_height|-kCdefBorder rows are copied from
+    // |src_buffer| and the last kCdefBorder rows are copied from |cdef_border|.
+    int y = block_height;
+    const int y_threshold =
+        (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
+    const Pixel* left_border =
+        (thread_pool_ == nullptr || !use_border_columns)
+            ? nullptr
+            : reinterpret_cast<const Pixel*>(border_columns[plane]);
+    do {
+      CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
+                     is_frame_right, cdef_src, left_border);
+      cdef_src += cdef_stride;
+      src_buffer += src_stride;
+      if (left_border != nullptr) left_border += kCdefBorder;
+    } while (--y != y_threshold);
+
+    if (y > 0) {
+      assert(y == kCdefBorder);
+      // |cdef_border| now points to the top 2 rows of the current block. For
+      // the next loop, we need it to point to the bottom 2 rows of the
+      // current block. So increment it by 2 rows.
+      cdef_border += MultiplyBy2(cdef_border_stride);
+      for (int i = 0; i < kCdefBorder; ++i) {
+        CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
+                       is_frame_right, cdef_src);
+        cdef_src += cdef_stride;
+        cdef_border += cdef_border_stride;
+      }
+    }
+
+    // Copy the bottom 2 rows as follows:
+    // If is_frame_bottom is true, both rows are set to kCdefLargeValue.
+    // Otherwise:
+    //   If multi-threaded filtering is off, the rows are copied from
+    //   |src_buffer|.
+    //   Otherwise, the rows are copied from |cdef_border|.
+    y = 0;
+    if (is_frame_bottom) {
+      do {
+        Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+               unit_width + 2 * kCdefBorder);
+        cdef_src += cdef_stride;
+      } while (++y < kCdefBorder + unit_height - block_height);
+    } else {
+      const Pixel* bottom_border =
+          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+      const int bottom_border_stride =
+          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+      do {
+        CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
+                       is_frame_right, cdef_src);
+        bottom_border += bottom_border_stride;
+        cdef_src += cdef_stride;
+      } while (++y < kCdefBorder + unit_height - block_height);
+    }
+  }
+}
+
+template <typename Pixel>
+void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
+                                     const int block_width4x4,
+                                     const int block_height4x4,
+                                     const int row4x4_start,
+                                     const int column4x4_start,
+                                     uint8_t border_columns[2][kMaxPlanes][256],
+                                     bool use_border_columns[2][2]) {
+  // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
+  static constexpr int kStep = 8;
+  static constexpr int kStep4x4 = 2;
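+  // kStep4x4 is kStep expressed in 4x4 block units.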
+
+  int cdef_buffer_row_base_stride[kMaxPlanes];
+  uint8_t* cdef_buffer_row_base[kMaxPlanes];
+  int src_buffer_row_base_stride[kMaxPlanes];
+  const uint8_t* src_buffer_row_base[kMaxPlanes];
+  const uint16_t* cdef_src_row_base[kMaxPlanes];
+  int cdef_src_row_base_stride[kMaxPlanes];
+  int column_step[kMaxPlanes];
+  assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes);
+  int plane = kPlaneY;
+  do {
+    cdef_buffer_row_base[plane] =
+        GetCdefBuffer(static_cast<Plane>(plane), row4x4_start, column4x4_start);
+    cdef_buffer_row_base_stride[plane] =
+        frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+    src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
+                                                 row4x4_start, column4x4_start);
+    src_buffer_row_base_stride[plane] =
+        frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+    cdef_src_row_base[plane] =
+        cdef_block +
+        static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
+            kCdefUnitSizeWithBorders +
+        kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
+    cdef_src_row_base_stride[plane] =
+        kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
+    column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
+  } while (++plane < planes_);
+
+  // |border_columns| contains two buffers. In each call to this function, one
+  // of them is used as the "destination" for the current call and the other
+  // as the "source" (it was the "destination" of the previous call). The
+  // src_index selects the borders that were backed up in the previous call;
+  // the dst_index selects where to back up the borders needed by the next
+  // call.
+  const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
+  const int border_columns_dst_index = border_columns_src_index ^ 1;
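+  // column4x4_start advances by 16 (one 64-pixel unit) per call, so
+  // DivideBy16(column4x4_start) alternates parity and the src/dst indices
+  // swap roles on each successive call in a row.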
+
+  if (index == -1) {
+    if (thread_pool_ == nullptr) {
+      int plane = kPlaneY;
+      do {
+        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                   sizeof(Pixel));
+      } while (++plane < planes_);
+    }
+    use_border_columns[border_columns_dst_index][0] = false;
+    use_border_columns[border_columns_dst_index][1] = false;
+    return;
+  }
+
+  const bool is_frame_right =
+      MultiplyBy4(column4x4_start + block_width4x4) >= frame_header_.width;
+  if (!is_frame_right && thread_pool_ != nullptr) {
+    // Backup the last 2 columns for use in the next iteration.
+    use_border_columns[border_columns_dst_index][0] = true;
+    const uint8_t* src_line =
+        GetSourceBuffer(kPlaneY, row4x4_start,
+                        column4x4_start + block_width4x4) -
+        kCdefBorder * sizeof(Pixel);
+    assert(border_columns != nullptr);
+    CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
+               border_columns[border_columns_dst_index][kPlaneY],
+               kCdefBorder * sizeof(Pixel), kCdefBorder,
+               MultiplyBy4(block_height4x4), sizeof(Pixel));
+  }
+
+  PrepareCdefBlock<Pixel>(
+      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+      cdef_block, kCdefUnitSizeWithBorders, true,
+      (border_columns != nullptr) ? border_columns[border_columns_src_index]
+                                  : nullptr,
+      use_border_columns[border_columns_src_index][0]);
+
+  // Stores the direction of each 8x8 block for use during the u/v pass. If
+  // bit 3 is set, the block is a skip.
+  uint8_t direction_y[8 * 8];
+  int y_index = 0;
+
+  const uint8_t y_primary_strength =
+      frame_header_.cdef.y_primary_strength[index];
+  const uint8_t y_secondary_strength =
+      frame_header_.cdef.y_secondary_strength[index];
+  // y_strength_index is 0 when both the primary and secondary strengths are
+  // non-zero, 1 for primary only, 2 for secondary only. The primary bit is
+  // filled in after the variance-based strength adjustment below, since the
+  // adjusted primary strength may be zero.
+  int y_strength_index = static_cast<int>(y_secondary_strength == 0);
+
+  const bool compute_direction_and_variance =
+      (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0;
+  const uint8_t* skip_row =
+      &cdef_skip_[row4x4_start >> 1][column4x4_start >> 4];
+  const int skip_stride = cdef_skip_.columns();
+  int row4x4 = row4x4_start;
+  do {
+    uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
+    const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
+    const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
+    int column4x4 = column4x4_start;
+
+    if (*skip_row == 0) {
+      for (int i = 0; i < DivideBy2(block_width4x4); ++i, ++y_index) {
+        direction_y[y_index] = kCdefSkip;
+      }
+      if (thread_pool_ == nullptr) {
+        CopyPixels(src_buffer_base, frame_buffer_.stride(kPlaneY),
+                   cdef_buffer_base, frame_buffer_.stride(kPlaneY), 64, kStep,
+                   sizeof(Pixel));
+      }
+    } else {
+      do {
+        const int block_width = kStep;
+        const int block_height = kStep;
+        const int cdef_stride = frame_buffer_.stride(kPlaneY);
+        uint8_t* const cdef_buffer = cdef_buffer_base;
+        const uint16_t* const cdef_src = cdef_src_base;
+        const int src_stride = frame_buffer_.stride(kPlaneY);
+        const uint8_t* const src_buffer = src_buffer_base;
+
+        const uint8_t skip_shift = (column4x4 >> 1) & 0x7;
+        const bool skip = ((*skip_row >> skip_shift) & 1) == 0;
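+        // Each bit of |*skip_row| covers one 8x8 block (8 such blocks per
+        // byte). A zero bit means the 8x8 block needs no cdef filtering.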
+        if (skip) {  // No cdef filtering.
+          direction_y[y_index] = kCdefSkip;
+          if (thread_pool_ == nullptr) {
+            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                       block_width, block_height, sizeof(Pixel));
+          }
+        } else {
+          // Zero out residual skip flag.
+          direction_y[y_index] = 0;
+
+          int variance = 0;
+          if (compute_direction_and_variance) {
+            if (thread_pool_ == nullptr ||
+                row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
+              dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
+                                  &variance);
+            } else if (sizeof(Pixel) == 2) {
+              dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
+                                  &direction_y[y_index], &variance);
+            } else {
+              // If we are in the last row4x4 for this unit, then the last two
+              // input rows have to come from |cdef_border_|. Since we already
+              // have |cdef_src| populated correctly, use that as the input
+              // for the direction process.
+              uint8_t direction_src[8][8];
+              const uint16_t* cdef_src_line = cdef_src;
+              for (auto& direction_src_line : direction_src) {
+                for (int i = 0; i < 8; ++i) {
+                  direction_src_line[i] = cdef_src_line[i];
+                }
+                cdef_src_line += kCdefUnitSizeWithBorders;
+              }
+              dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
+                                  &variance);
+            }
+          }
+          const int direction =
+              (y_primary_strength == 0) ? 0 : direction_y[y_index];
+          const int variance_strength =
+              ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12)
+                                     : 0;
+          const uint8_t primary_strength =
+              (variance != 0)
+                  ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
+                  : 0;
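+          // The (4 + variance_strength) multiplier above ranges from 4 to 16,
+          // so the adjusted primary strength ranges from a quarter of
+          // y_primary_strength (low variance) up to the full value (high
+          // variance), with rounding.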
+          if ((primary_strength | y_secondary_strength) == 0) {
+            if (thread_pool_ == nullptr) {
+              CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                         block_width, block_height, sizeof(Pixel));
+            }
+          } else {
+            const int strength_index =
+                y_strength_index |
+                (static_cast<int>(primary_strength == 0) << 1);
+            dsp_.cdef_filters[1][strength_index](
+                cdef_src, kCdefUnitSizeWithBorders, block_height,
+                primary_strength, y_secondary_strength,
+                frame_header_.cdef.damping, direction, cdef_buffer,
+                cdef_stride);
+          }
+        }
+        cdef_buffer_base += column_step[kPlaneY];
+        src_buffer_base += column_step[kPlaneY];
+        cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);
+
+        column4x4 += kStep4x4;
+        y_index++;
+      } while (column4x4 < column4x4_start + block_width4x4);
+    }
+
+    cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
+    src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
+    cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
+    skip_row += skip_stride;
+    row4x4 += kStep4x4;
+  } while (row4x4 < row4x4_start + block_height4x4);
+
+  if (planes_ == kMaxPlanesMonochrome) {
+    return;
+  }
+
+  const uint8_t uv_primary_strength =
+      frame_header_.cdef.uv_primary_strength[index];
+  const uint8_t uv_secondary_strength =
+      frame_header_.cdef.uv_secondary_strength[index];
+
+  if ((uv_primary_strength | uv_secondary_strength) == 0) {
+    if (thread_pool_ == nullptr) {
+      for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                   sizeof(Pixel));
+      }
+    }
+    use_border_columns[border_columns_dst_index][1] = false;
+    return;
+  }
+
+  if (!is_frame_right && thread_pool_ != nullptr) {
+    use_border_columns[border_columns_dst_index][1] = true;
+    for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+      // Backup the last 2 columns for use in the next iteration.
+      const uint8_t* src_line =
+          GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
+                          column4x4_start + block_width4x4) -
+          kCdefBorder * sizeof(Pixel);
+      CopyPixels(src_line, frame_buffer_.stride(plane),
+                 border_columns[border_columns_dst_index][plane],
+                 kCdefBorder * sizeof(Pixel), kCdefBorder,
+                 MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                 sizeof(Pixel));
+    }
+  }
+
+  PrepareCdefBlock<Pixel>(
+      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+      cdef_block, kCdefUnitSizeWithBorders, false,
+      (border_columns != nullptr) ? border_columns[border_columns_src_index]
+                                  : nullptr,
+      use_border_columns[border_columns_src_index][1]);
+
+  // uv_strength_index is 0 when both the primary and secondary strengths are
+  // non-zero, 1 for primary only, 2 for secondary only.
+  const int uv_strength_index =
+      (static_cast<int>(uv_primary_strength == 0) << 1) |
+      static_cast<int>(uv_secondary_strength == 0);
+  for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+    const int8_t subsampling_x = subsampling_x_[plane];
+    const int8_t subsampling_y = subsampling_y_[plane];
+    const int block_width = kStep >> subsampling_x;
+    const int block_height = kStep >> subsampling_y;
+    int row4x4 = row4x4_start;
+
+    y_index = 0;
+    do {
+      uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
+      const uint8_t* src_buffer_base = src_buffer_row_base[plane];
+      const uint16_t* cdef_src_base = cdef_src_row_base[plane];
+      int column4x4 = column4x4_start;
+      do {
+        const int cdef_stride = frame_buffer_.stride(plane);
+        uint8_t* const cdef_buffer = cdef_buffer_base;
+        const int src_stride = frame_buffer_.stride(plane);
+        const uint8_t* const src_buffer = src_buffer_base;
+        const uint16_t* const cdef_src = cdef_src_base;
+        const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
+        int dual_cdef = 0;
+
+        if (skip) {  // No cdef filtering.
+          if (thread_pool_ == nullptr) {
+            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                       block_width, block_height, sizeof(Pixel));
+          }
+        } else {
+          // Make sure block pair is not out of bounds.
+          if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
+            // Enable dual processing if subsampling_x is 1.
+            dual_cdef = subsampling_x;
+          }
+
+          int direction = (uv_primary_strength == 0)
+                              ? 0
+                              : kCdefUvDirection[subsampling_x][subsampling_y]
+                                                [direction_y[y_index]];
+
+          if (dual_cdef != 0) {
+            if (uv_primary_strength &&
+                direction_y[y_index] != direction_y[y_index + 1]) {
+              // Disable dual processing if the second block of the pair does
+              // not have the same direction.
+              dual_cdef = 0;
+            }
+
+            // Disable dual processing if the second block of the pair is a
+            // skip.
+            if (direction_y[y_index + 1] == kCdefSkip) {
+              dual_cdef = 0;
+            }
+          }
+
+          // Block width is 8 if either dual_cdef is true or subsampling_x == 0.
+          const int width_index = dual_cdef | (subsampling_x ^ 1);
+          dsp_.cdef_filters[width_index][uv_strength_index](
+              cdef_src, kCdefUnitSizeWithBorders, block_height,
+              uv_primary_strength, uv_secondary_strength,
+              frame_header_.cdef.damping - 1, direction, cdef_buffer,
+              cdef_stride);
+        }
+        // When dual_cdef is set, the above cdef_filter() will process 2 blocks,
+        // so adjust the pointers and indexes for 2 blocks.
+        cdef_buffer_base += column_step[plane] << dual_cdef;
+        src_buffer_base += column_step[plane] << dual_cdef;
+        cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
+        column4x4 += kStep4x4 << dual_cdef;
+        y_index += 1 << dual_cdef;
+      } while (column4x4 < column4x4_start + block_width4x4);
+
+      cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
+      src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
+      cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
+      row4x4 += kStep4x4;
+    } while (row4x4 < row4x4_start + block_height4x4);
+  }
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
+    uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+    int row4x4, int block_height4x4) {
+  bool use_border_columns[2][2] = {};
+  const bool non_zero_index = frame_header_.cdef.bits > 0;
+  const int8_t* cdef_index =
+      non_zero_index ? cdef_index_[DivideBy16(row4x4)] : nullptr;
+  int column4x4 = 0;
+  do {
+    const int index = non_zero_index ? *cdef_index++ : 0;
+    const int block_width4x4 =
+        std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth_ >= 10) {
+      ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
+                                    block_height4x4, row4x4, column4x4,
+                                    border_columns, use_border_columns);
+    } else  // NOLINT
+#endif      // LIBGAV1_MAX_BITDEPTH >= 10
+    {
+      ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
+                                   block_height4x4, row4x4, column4x4,
+                                   border_columns, use_border_columns);
+    }
+    column4x4 += kStep64x64;
+  } while (column4x4 < frame_header_.columns4x4);
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
+                                              bool is_last_row) {
+  assert(row4x4_start >= 0);
+  assert(DoCdef());
+  int row4x4 = row4x4_start;
+  const int row4x4_limit = row4x4_start + sb4x4;
+  do {
+    if (row4x4 >= frame_header_.rows4x4) return;
+
+    // Apply cdef for the last 8 rows of the previous superblock row.
+    // One exception: If the superblock size is 128x128 and is_last_row is true,
+    // then we simply apply cdef for the entire superblock row without any lag.
+    // In that case, apply cdef for the previous superblock row only during the
+    // first iteration (row4x4 == row4x4_start).
+    if (row4x4 > 0 && (!is_last_row || row4x4 == row4x4_start)) {
+      assert(row4x4 >= 16);
+      ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
+    }
+
+    // Apply cdef for the current superblock row. If this is the last superblock
+    // row we apply cdef for all the rows, otherwise we leave out the last 8
+    // rows.
+    const int block_height4x4 =
+        std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+    const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
+    if (height4x4 > 0) {
+      ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
+                                         height4x4);
+    }
+    row4x4 += kStep64x64;
+  } while (row4x4 < row4x4_limit);
+}
+
+void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
+  int row4x4;
+  uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+  // Each border_column buffer has to store 64 rows and 2 columns for each
+  // plane. For 10bit, that is 64*2*2 = 256 bytes.
+  alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
+  while ((row4x4 = row4x4_atomic->fetch_add(
+              kStep64x64, std::memory_order_relaxed)) < frame_header_.rows4x4) {
+    const int block_height4x4 =
+        std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+    ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
+                                       block_height4x4);
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter/deblock.cc b/src/post_filter/deblock.cc
new file mode 100644 (file)
index 0000000..daee01c
--- /dev/null
@@ -0,0 +1,506 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <atomic>
+
+#include "src/post_filter.h"
+
+namespace libgav1 {
+namespace {
+
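+// High edge variance threshold for the deblocking filter (level >> 4).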
+constexpr uint8_t HevThresh(int level) { return DivideBy16(level); }
+
+// GetLoopFilterSize* functions depend on this exact ordering of the
+// LoopFilterSize enums.
+static_assert(dsp::kLoopFilterSize4 == 0, "");
+static_assert(dsp::kLoopFilterSize6 == 1, "");
+static_assert(dsp::kLoopFilterSize8 == 2, "");
+static_assert(dsp::kLoopFilterSize14 == 3, "");
+
+dsp::LoopFilterSize GetLoopFilterSizeY(int filter_length) {
+  // |filter_length| must be a power of 2.
+  assert((filter_length & (filter_length - 1)) == 0);
+  // This code is the branch-free equivalent of:
+  //   if (filter_length == 4) return kLoopFilterSize4;
+  //   if (filter_length == 8) return kLoopFilterSize8;
+  //   return kLoopFilterSize14;
+  return static_cast<dsp::LoopFilterSize>(
+      MultiplyBy2(static_cast<int>(filter_length > 4)) +
+      static_cast<int>(filter_length > 8));
+}
+
+constexpr dsp::LoopFilterSize GetLoopFilterSizeUV(int filter_length) {
+  // For U & V planes, size is kLoopFilterSize4 if |filter_length| is 4,
+  // otherwise size is kLoopFilterSize6.
+  return static_cast<dsp::LoopFilterSize>(filter_length != 4);
+}
+
+bool NonBlockBorderNeedsFilter(const BlockParameters& bp, int filter_id,
+                               uint8_t* const level) {
+  if (bp.deblock_filter_level[filter_id] == 0 || (bp.skip && bp.is_inter)) {
+    return false;
+  }
+  *level = bp.deblock_filter_level[filter_id];
+  return true;
+}
+
+// 7.14.5.
+void ComputeDeblockFilterLevelsHelper(
+    const ObuFrameHeader& frame_header, int segment_id, int level_index,
+    const int8_t delta_lf[kFrameLfCount],
+    uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) {
+  const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0];
+  uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta, 0,
+                        kMaxLoopFilterValue);
+  const auto feature = static_cast<SegmentFeature>(
+      kSegmentFeatureLoopFilterYVertical + level_index);
+  level =
+      Clip3(level + frame_header.segmentation.feature_data[segment_id][feature],
+            0, kMaxLoopFilterValue);
+  if (!frame_header.loop_filter.delta_enabled) {
+    static_assert(sizeof(deblock_filter_levels[0][0]) == 1, "");
+    memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2);
+    return;
+  }
+  assert(frame_header.loop_filter.delta_enabled);
+  const int shift = level >> 5;
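+  // |level| is at most kMaxLoopFilterValue (63), so |shift| is 0 or 1: the
+  // reference frame and mode deltas are doubled when the base level is 32 or
+  // more.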
+  deblock_filter_levels[kReferenceFrameIntra][0] = Clip3(
+      level +
+          LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra],
+                    shift),
+      0, kMaxLoopFilterValue);
+  // deblock_filter_levels[kReferenceFrameIntra][1] is never used, so it does
+  // not have to be populated.
+  for (int reference_frame = kReferenceFrameIntra + 1;
+       reference_frame < kNumReferenceFrameTypes; ++reference_frame) {
+    for (int mode_id = 0; mode_id < 2; ++mode_id) {
+      deblock_filter_levels[reference_frame][mode_id] = Clip3(
+          level +
+              LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] +
+                            frame_header.loop_filter.mode_deltas[mode_id],
+                        shift),
+          0, kMaxLoopFilterValue);
+    }
+  }
+}
+
+}  // namespace
+
+void PostFilter::ComputeDeblockFilterLevels(
+    const int8_t delta_lf[kFrameLfCount],
+    uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+                                 [kNumReferenceFrameTypes][2]) const {
+  if (!DoDeblock()) return;
+  const int num_segments =
+      frame_header_.segmentation.enabled ? kMaxSegments : 1;
+  for (int segment_id = 0; segment_id < num_segments; ++segment_id) {
+    int level_index = 0;
+    for (; level_index < 2; ++level_index) {
+      ComputeDeblockFilterLevelsHelper(
+          frame_header_, segment_id, level_index, delta_lf,
+          deblock_filter_levels[segment_id][level_index]);
+    }
+    for (; level_index < kFrameLfCount; ++level_index) {
+      if (frame_header_.loop_filter.level[level_index] != 0) {
+        ComputeDeblockFilterLevelsHelper(
+            frame_header_, segment_id, level_index, delta_lf,
+            deblock_filter_levels[segment_id][level_index]);
+      }
+    }
+  }
+}
+
+bool PostFilter::GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+                                                    uint8_t* level, int* step,
+                                                    int* filter_length) const {
+  *step = kTransformHeight[inter_transform_sizes_[row4x4][column4x4]];
+  if (row4x4 == 0) return false;
+
+  const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
+  const int row4x4_prev = row4x4 - 1;
+  assert(row4x4_prev >= 0);
+  const BlockParameters* bp_prev =
+      block_parameters_.Find(row4x4_prev, column4x4);
+
+  if (bp == bp_prev) {
+    // Not a border.
+    if (!NonBlockBorderNeedsFilter(*bp, 1, level)) return false;
+  } else {
+    const uint8_t level_this = bp->deblock_filter_level[1];
+    *level = level_this;
+    if (level_this == 0) {
+      const uint8_t level_prev = bp_prev->deblock_filter_level[1];
+      if (level_prev == 0) return false;
+      *level = level_prev;
+    }
+  }
+  const int step_prev =
+      kTransformHeight[inter_transform_sizes_[row4x4_prev][column4x4]];
+  *filter_length = std::min(*step, step_prev);
+  return true;
+}
+
+void PostFilter::GetHorizontalDeblockFilterEdgeInfoUV(
+    int row4x4, int column4x4, uint8_t* level_u, uint8_t* level_v, int* step,
+    int* filter_length) const {
+  const int subsampling_x = subsampling_x_[kPlaneU];
+  const int subsampling_y = subsampling_y_[kPlaneU];
+  row4x4 = GetDeblockPosition(row4x4, subsampling_y);
+  column4x4 = GetDeblockPosition(column4x4, subsampling_x);
+  const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
+  *level_u = 0;
+  *level_v = 0;
+  *step = kTransformHeight[bp->uv_transform_size];
+  if (row4x4 == subsampling_y) {
+    return;
+  }
+
+  bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0;
+  bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0;
+  assert(need_filter_u || need_filter_v);
+  const int filter_id_u =
+      kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeHorizontal];
+  const int filter_id_v =
+      kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeHorizontal];
+  const int row4x4_prev = row4x4 - (1 << subsampling_y);
+  assert(row4x4_prev >= 0);
+  const BlockParameters* bp_prev =
+      block_parameters_.Find(row4x4_prev, column4x4);
+
+  if (bp == bp_prev) {
+    // Not a border.
+    const bool skip = bp->skip && bp->is_inter;
+    need_filter_u =
+        need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip;
+    need_filter_v =
+        need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip;
+    if (!need_filter_u && !need_filter_v) return;
+    if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u];
+    if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v];
+    *filter_length = *step;
+    return;
+  }
+
+  // It is a border.
+  if (need_filter_u) {
+    const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u];
+    *level_u = level_u_this;
+    if (level_u_this == 0) {
+      *level_u = bp_prev->deblock_filter_level[filter_id_u];
+    }
+  }
+  if (need_filter_v) {
+    const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v];
+    *level_v = level_v_this;
+    if (level_v_this == 0) {
+      *level_v = bp_prev->deblock_filter_level[filter_id_v];
+    }
+  }
+  const int step_prev = kTransformHeight[bp_prev->uv_transform_size];
+  *filter_length = std::min(*step, step_prev);
+}
+
+bool PostFilter::GetVerticalDeblockFilterEdgeInfo(
+    int row4x4, int column4x4, BlockParameters* const* bp_ptr, uint8_t* level,
+    int* step, int* filter_length) const {
+  const BlockParameters* bp = *bp_ptr;
+  *step = kTransformWidth[inter_transform_sizes_[row4x4][column4x4]];
+  if (column4x4 == 0) return false;
+
+  const int filter_id = 0;
+  const int column4x4_prev = column4x4 - 1;
+  assert(column4x4_prev >= 0);
+  const BlockParameters* bp_prev = *(bp_ptr - 1);
+  if (bp == bp_prev) {
+    // Not a border.
+    if (!NonBlockBorderNeedsFilter(*bp, filter_id, level)) return false;
+  } else {
+    // It is a border.
+    const uint8_t level_this = bp->deblock_filter_level[filter_id];
+    *level = level_this;
+    if (level_this == 0) {
+      const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id];
+      if (level_prev == 0) return false;
+      *level = level_prev;
+    }
+  }
+  const int step_prev =
+      kTransformWidth[inter_transform_sizes_[row4x4][column4x4_prev]];
+  *filter_length = std::min(*step, step_prev);
+  return true;
+}
+
+void PostFilter::GetVerticalDeblockFilterEdgeInfoUV(
+    int column4x4, BlockParameters* const* bp_ptr, uint8_t* level_u,
+    uint8_t* level_v, int* step, int* filter_length) const {
+  const int subsampling_x = subsampling_x_[kPlaneU];
+  column4x4 = GetDeblockPosition(column4x4, subsampling_x);
+  const BlockParameters* bp = *bp_ptr;
+  *level_u = 0;
+  *level_v = 0;
+  *step = kTransformWidth[bp->uv_transform_size];
+  if (column4x4 == subsampling_x) {
+    return;
+  }
+
+  bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0;
+  bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0;
+  assert(need_filter_u || need_filter_v);
+  const int filter_id_u =
+      kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeVertical];
+  const int filter_id_v =
+      kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeVertical];
+  const BlockParameters* bp_prev = *(bp_ptr - (ptrdiff_t{1} << subsampling_x));
+
+  if (bp == bp_prev) {
+    // Not a border.
+    const bool skip = bp->skip && bp->is_inter;
+    need_filter_u =
+        need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip;
+    need_filter_v =
+        need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip;
+    if (!need_filter_u && !need_filter_v) return;
+    if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u];
+    if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v];
+    *filter_length = *step;
+    return;
+  }
+
+  // It is a border.
+  if (need_filter_u) {
+    const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u];
+    *level_u = level_u_this;
+    if (level_u_this == 0) {
+      *level_u = bp_prev->deblock_filter_level[filter_id_u];
+    }
+  }
+  if (need_filter_v) {
+    const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v];
+    *level_v = level_v_this;
+    if (level_v_this == 0) {
+      *level_v = bp_prev->deblock_filter_level[filter_id_v];
+    }
+  }
+  const int step_prev = kTransformWidth[bp_prev->uv_transform_size];
+  *filter_length = std::min(*step, step_prev);
+}
+
+void PostFilter::HorizontalDeblockFilter(int row4x4_start, int row4x4_end,
+                                         int column4x4_start,
+                                         int column4x4_end) {
+  const int height4x4 = row4x4_end - row4x4_start;
+  const int width4x4 = column4x4_end - column4x4_start;
+  if (height4x4 <= 0 || width4x4 <= 0) return;
+
+  const int column_step = 1;
+  const int src_step = 4 << pixel_size_log2_;
+  const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY);
+  uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start);
+  int row_step;
+  uint8_t level;
+  int filter_length;
+
+  const int width = frame_header_.width;
+  const int height = frame_header_.height;
+  for (int column4x4 = 0;
+       column4x4 < width4x4 && MultiplyBy4(column4x4_start + column4x4) < width;
+       column4x4 += column_step, src += src_step) {
+    uint8_t* src_row = src;
+    for (int row4x4 = 0;
+         row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height;
+         row4x4 += row_step) {
+      const bool need_filter = GetHorizontalDeblockFilterEdgeInfo(
+          row4x4_start + row4x4, column4x4_start + column4x4, &level, &row_step,
+          &filter_length);
+      if (need_filter) {
+        assert(level > 0 && level <= kMaxLoopFilterValue);
+        const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length);
+        dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+            src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
+            HevThresh(level));
+      }
+      src_row += row_step * src_stride;
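+      // |row_step| was returned in pixels; convert it to 4x4 block units for
+      // the row4x4 loop counter.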
+      row_step = DivideBy4(row_step);
+    }
+  }
+
+  if (needs_chroma_deblock_) {
+    const int8_t subsampling_x = subsampling_x_[kPlaneU];
+    const int8_t subsampling_y = subsampling_y_[kPlaneU];
+    const int column_step = 1 << subsampling_x;
+    const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU);
+    const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV);
+    uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start);
+    uint8_t* src_v = GetSourceBuffer(kPlaneV, row4x4_start, column4x4_start);
+    int row_step;
+    uint8_t level_u;
+    uint8_t level_v;
+    int filter_length;
+
+    for (int column4x4 = 0; column4x4 < width4x4 &&
+                            MultiplyBy4(column4x4_start + column4x4) < width;
+         column4x4 += column_step, src_u += src_step, src_v += src_step) {
+      uint8_t* src_row_u = src_u;
+      uint8_t* src_row_v = src_v;
+      for (int row4x4 = 0;
+           row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height;
+           row4x4 += row_step) {
+        GetHorizontalDeblockFilterEdgeInfoUV(
+            row4x4_start + row4x4, column4x4_start + column4x4, &level_u,
+            &level_v, &row_step, &filter_length);
+        if (level_u != 0) {
+          const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+          dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+              src_row_u, src_stride_u, outer_thresh_[level_u],
+              inner_thresh_[level_u], HevThresh(level_u));
+        }
+        if (level_v != 0) {
+          const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+          dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+              src_row_v, src_stride_v, outer_thresh_[level_v],
+              inner_thresh_[level_v], HevThresh(level_v));
+        }
+        src_row_u += row_step * src_stride_u;
+        src_row_v += row_step * src_stride_v;
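+        // |row_step| is in chroma pixels; scale it back to luma 4x4 block
+        // units for the row4x4 loop counter.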
+        row_step = DivideBy4(row_step << subsampling_y);
+      }
+    }
+  }
+}
+
+void PostFilter::VerticalDeblockFilter(int row4x4_start, int row4x4_end,
+                                       int column4x4_start, int column4x4_end) {
+  const int height4x4 = row4x4_end - row4x4_start;
+  const int width4x4 = column4x4_end - column4x4_start;
+  if (height4x4 <= 0 || width4x4 <= 0) return;
+
+  const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(kPlaneY));
+  const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY);
+  uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start);
+  int column_step;
+  uint8_t level;
+  int filter_length;
+
+  BlockParameters* const* bp_row_base =
+      block_parameters_.Address(row4x4_start, column4x4_start);
+  const int bp_stride = block_parameters_.columns4x4();
+  const int column_step_shift = pixel_size_log2_;
+  const int width = frame_header_.width;
+  const int height = frame_header_.height;
+  for (int row4x4 = 0;
+       row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height;
+       ++row4x4, src += row_stride, bp_row_base += bp_stride) {
+    uint8_t* src_row = src;
+    BlockParameters* const* bp = bp_row_base;
+    for (int column4x4 = 0; column4x4 < width4x4 &&
+                            MultiplyBy4(column4x4_start + column4x4) < width;
+         column4x4 += column_step, bp += column_step) {
+      const bool need_filter = GetVerticalDeblockFilterEdgeInfo(
+          row4x4_start + row4x4, column4x4_start + column4x4, bp, &level,
+          &column_step, &filter_length);
+      if (need_filter) {
+        assert(level > 0 && level <= kMaxLoopFilterValue);
+        const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length);
+        dsp_.loop_filters[size][kLoopFilterTypeVertical](
+            src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
+            HevThresh(level));
+      }
+      src_row += column_step << column_step_shift;
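+      // |column_step| was returned in pixels; convert it to 4x4 block units
+      // for the column4x4 loop counter.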
+      column_step = DivideBy4(column_step);
+    }
+  }
+
+  if (needs_chroma_deblock_) {
+    const int8_t subsampling_x = subsampling_x_[kPlaneU];
+    const int8_t subsampling_y = subsampling_y_[kPlaneU];
+    const int row_step = 1 << subsampling_y;
+    uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start);
+    uint8_t* src_v = GetSourceBuffer(kPlaneV, row4x4_start, column4x4_start);
+    const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU);
+    const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV);
+    const ptrdiff_t row_stride_u = MultiplyBy4(frame_buffer_.stride(kPlaneU));
+    const ptrdiff_t row_stride_v = MultiplyBy4(frame_buffer_.stride(kPlaneV));
+    const LoopFilterType type = kLoopFilterTypeVertical;
+    int column_step;
+    uint8_t level_u;
+    uint8_t level_v;
+    int filter_length;
+
+    BlockParameters* const* bp_row_base = block_parameters_.Address(
+        GetDeblockPosition(row4x4_start, subsampling_y),
+        GetDeblockPosition(column4x4_start, subsampling_x));
+    const int bp_stride = block_parameters_.columns4x4() << subsampling_y;
+    for (int row4x4 = 0;
+         row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height;
+         row4x4 += row_step, src_u += row_stride_u, src_v += row_stride_v,
+             bp_row_base += bp_stride) {
+      uint8_t* src_row_u = src_u;
+      uint8_t* src_row_v = src_v;
+      BlockParameters* const* bp = bp_row_base;
+      for (int column4x4 = 0; column4x4 < width4x4 &&
+                              MultiplyBy4(column4x4_start + column4x4) < width;
+           column4x4 += column_step, bp += column_step) {
+        GetVerticalDeblockFilterEdgeInfoUV(column4x4_start + column4x4, bp,
+                                           &level_u, &level_v, &column_step,
+                                           &filter_length);
+        if (level_u != 0) {
+          const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+          dsp_.loop_filters[size][type](
+              src_row_u, src_stride_u, outer_thresh_[level_u],
+              inner_thresh_[level_u], HevThresh(level_u));
+        }
+        if (level_v != 0) {
+          const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+          dsp_.loop_filters[size][type](
+              src_row_v, src_stride_v, outer_thresh_[level_v],
+              inner_thresh_[level_v], HevThresh(level_v));
+        }
+        src_row_u += column_step << column_step_shift;
+        src_row_v += column_step << column_step_shift;
+        column_step = DivideBy4(column_step << subsampling_x);
+      }
+    }
+  }
+}
+
+template <LoopFilterType loop_filter_type>
+void PostFilter::DeblockFilterWorker(std::atomic<int>* row4x4_atomic) {
+  const int rows4x4 = frame_header_.rows4x4;
+  const int columns4x4 = frame_header_.columns4x4;
+  int row4x4;
+  while ((row4x4 = row4x4_atomic->fetch_add(
+              kNum4x4InLoopFilterUnit, std::memory_order_relaxed)) < rows4x4) {
+    (this->*deblock_filter_func_[loop_filter_type])(
+        row4x4, row4x4 + kNum4x4InLoopFilterUnit, 0, columns4x4);
+  }
+}
+
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>(
+    std::atomic<int>* row4x4_atomic);
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>(
+    std::atomic<int>* row4x4_atomic);
+
+void PostFilter::ApplyDeblockFilter(LoopFilterType loop_filter_type,
+                                    int row4x4_start, int column4x4_start,
+                                    int column4x4_end, int sb4x4) {
+  assert(row4x4_start >= 0);
+  assert(DoDeblock());
+  column4x4_end =
+      std::min(Align(column4x4_end, static_cast<int>(kNum4x4InLoopFilterUnit)),
+               frame_header_.columns4x4);
+  if (column4x4_start >= column4x4_end) return;
+  (this->*deblock_filter_func_[loop_filter_type])(
+      row4x4_start, row4x4_start + sb4x4, column4x4_start, column4x4_end);
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter/deblock_thresholds.inc b/src/post_filter/deblock_thresholds.inc
new file mode 100644 (file)
index 0000000..ca12aaa
--- /dev/null
@@ -0,0 +1,85 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Thresholds for the deblocking filter. These are precomputed values of part
+// of the computation in Section 7.14.4, for all possible values of sharpness.
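+//
+// A sketch of the generating formulas (with
+// shift = (sharpness > 4) ? 2 : ((sharpness > 0) ? 1 : 0)):
+//   kInnerThresh[sharpness][level] =
+//       (sharpness > 0) ? Clip3(level >> shift, 1, 9 - sharpness)
+//                       : std::max(1, level);
+//   kOuterThresh[sharpness][level] =
+//       2 * (level + 2) + kInnerThresh[sharpness][level];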
+
+constexpr uint8_t kInnerThresh[8][kMaxLoopFilterValue + 1] = {
+    {1,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+     32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+     48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63},
+    {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8,
+     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8},
+    {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7},
+    {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+     6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+     6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6},
+    {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5},
+    {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4},
+    {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
+    {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
+
+constexpr uint8_t kOuterThresh[8][kMaxLoopFilterValue + 1] = {
+    {5,   7,   10,  13,  16,  19,  22,  25,  28,  31,  34,  37,  40,
+     43,  46,  49,  52,  55,  58,  61,  64,  67,  70,  73,  76,  79,
+     82,  85,  88,  91,  94,  97,  100, 103, 106, 109, 112, 115, 118,
+     121, 124, 127, 130, 133, 136, 139, 142, 145, 148, 151, 154, 157,
+     160, 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193},
+    {5,   7,   9,   11,  14,  16,  19,  21,  24,  26,  29,  31,  34,
+     36,  39,  41,  44,  46,  48,  50,  52,  54,  56,  58,  60,  62,
+     64,  66,  68,  70,  72,  74,  76,  78,  80,  82,  84,  86,  88,
+     90,  92,  94,  96,  98,  100, 102, 104, 106, 108, 110, 112, 114,
+     116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138},
+    {5,   7,   9,   11,  14,  16,  19,  21,  24,  26,  29,  31,  34,
+     36,  39,  41,  43,  45,  47,  49,  51,  53,  55,  57,  59,  61,
+     63,  65,  67,  69,  71,  73,  75,  77,  79,  81,  83,  85,  87,
+     89,  91,  93,  95,  97,  99,  101, 103, 105, 107, 109, 111, 113,
+     115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137},
+    {5,   7,   9,   11,  14,  16,  19,  21,  24,  26,  29,  31,  34,
+     36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,  58,  60,
+     62,  64,  66,  68,  70,  72,  74,  76,  78,  80,  82,  84,  86,
+     88,  90,  92,  94,  96,  98,  100, 102, 104, 106, 108, 110, 112,
+     114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136},
+    {5,   7,   9,   11,  14,  16,  19,  21,  24,  26,  29,  31,  33,
+     35,  37,  39,  41,  43,  45,  47,  49,  51,  53,  55,  57,  59,
+     61,  63,  65,  67,  69,  71,  73,  75,  77,  79,  81,  83,  85,
+     87,  89,  91,  93,  95,  97,  99,  101, 103, 105, 107, 109, 111,
+     113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135},
+    {5,   7,   9,   11,  13,  15,  17,  19,  22,  24,  26,  28,  31,
+     33,  35,  37,  40,  42,  44,  46,  48,  50,  52,  54,  56,  58,
+     60,  62,  64,  66,  68,  70,  72,  74,  76,  78,  80,  82,  84,
+     86,  88,  90,  92,  94,  96,  98,  100, 102, 104, 106, 108, 110,
+     112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134},
+    {5,   7,   9,   11,  13,  15,  17,  19,  22,  24,  26,  28,  31,
+     33,  35,  37,  39,  41,  43,  45,  47,  49,  51,  53,  55,  57,
+     59,  61,  63,  65,  67,  69,  71,  73,  75,  77,  79,  81,  83,
+     85,  87,  89,  91,  93,  95,  97,  99,  101, 103, 105, 107, 109,
+     111, 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133},
+    {5,   7,   9,   11,  13,  15,  17,  19,  22,  24,  26,  28,  30,
+     32,  34,  36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,
+     58,  60,  62,  64,  66,  68,  70,  72,  74,  76,  78,  80,  82,
+     84,  86,  88,  90,  92,  94,  96,  98,  100, 102, 104, 106, 108,
+     110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132}};
diff --git a/src/post_filter/loop_restoration.cc b/src/post_filter/loop_restoration.cc
new file mode 100644 (file)
index 0000000..b5e1432
--- /dev/null
@@ -0,0 +1,184 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+
+namespace libgav1 {
+
+template <typename Pixel>
+void PostFilter::ApplyLoopRestorationForOneRow(
+    const Pixel* src_buffer, const ptrdiff_t stride, const Plane plane,
+    const int plane_height, const int plane_width, const int unit_y,
+    const int unit_row, const int current_process_unit_height,
+    const int plane_unit_size, Pixel* dst_buffer) {
+  const int num_horizontal_units =
+      restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
+  const RestorationUnitInfo* const restoration_info =
+      restoration_info_->loop_restoration_info(static_cast<Plane>(plane),
+                                               unit_row * num_horizontal_units);
+  const bool in_place = DoCdef() || thread_pool_ != nullptr;
+  const Pixel* border = nullptr;
+  ptrdiff_t border_stride = 0;
+  src_buffer += unit_y * stride;
+  if (in_place) {
+    const int border_unit_y = std::max(
+        RightShiftWithCeiling(unit_y, 4 - subsampling_y_[plane]) - 4, 0);
+    border_stride = loop_restoration_border_.stride(plane) / sizeof(Pixel);
+    border =
+        reinterpret_cast<const Pixel*>(loop_restoration_border_.data(plane)) +
+        border_unit_y * border_stride;
+  }
+  int unit_column = 0;
+  int column = 0;
+  do {
+    const int current_process_unit_width =
+        std::min(plane_unit_size, plane_width - column);
+    const Pixel* src = src_buffer + column;
+    unit_column = std::min(unit_column, num_horizontal_units - 1);
+    if (restoration_info[unit_column].type == kLoopRestorationTypeNone) {
+      Pixel* dst = dst_buffer + column;
+      if (in_place) {
+        int k = current_process_unit_height;
+        do {
+          memmove(dst, src, current_process_unit_width * sizeof(Pixel));
+          src += stride;
+          dst += stride;
+        } while (--k != 0);
+      } else {
+        CopyPlane(src, stride, current_process_unit_width,
+                  current_process_unit_height, dst, stride);
+      }
+    } else {
+      const Pixel* top_border = src - kRestorationVerticalBorder * stride;
+      ptrdiff_t top_border_stride = stride;
+      const Pixel* bottom_border = src + current_process_unit_height * stride;
+      ptrdiff_t bottom_border_stride = stride;
+      const bool frame_bottom_border =
+          (unit_y + current_process_unit_height >= plane_height);
+      if (in_place && (unit_y != 0 || !frame_bottom_border)) {
+        const Pixel* loop_restoration_border = border + column;
+        if (unit_y != 0) {
+          top_border = loop_restoration_border;
+          top_border_stride = border_stride;
+          loop_restoration_border += 4 * border_stride;
+        }
+        if (!frame_bottom_border) {
+          bottom_border = loop_restoration_border +
+                          kRestorationVerticalBorder * border_stride;
+          bottom_border_stride = border_stride;
+        }
+      }
+#if LIBGAV1_MSAN
+      // The optimized loop restoration implementations may read beyond the
+      // initialized portion of the buffer.
+      RestorationBuffer restoration_buffer = {};
+#else
+      RestorationBuffer restoration_buffer;
+#endif
+      const LoopRestorationType type = restoration_info[unit_column].type;
+      assert(type == kLoopRestorationTypeSgrProj ||
+             type == kLoopRestorationTypeWiener);
+      const dsp::LoopRestorationFunc restoration_func =
+          dsp_.loop_restorations[type - 2];
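+      // The assert above guarantees that |type| is kLoopRestorationTypeWiener
+      // or kLoopRestorationTypeSgrProj; |type| - 2 maps these two types to
+      // indices 0 and 1 of the loop_restorations array.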
+      restoration_func(restoration_info[unit_column], src, stride, top_border,
+                       top_border_stride, bottom_border, bottom_border_stride,
+                       current_process_unit_width, current_process_unit_height,
+                       &restoration_buffer, dst_buffer + column);
+    }
+    ++unit_column;
+    column += plane_unit_size;
+  } while (column < plane_width);
+}
+
+template <typename Pixel>
+void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(const int row4x4_start,
+                                                         const int sb4x4) {
+  assert(row4x4_start >= 0);
+  assert(DoRestoration());
+  int plane = kPlaneY;
+  const int upscaled_width = frame_header_.upscaled_width;
+  const int height = frame_header_.height;
+  do {
+    if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+      continue;
+    }
+    const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+    const int unit_height_offset =
+        kRestorationUnitOffset >> subsampling_y_[plane];
+    const int plane_height = SubsampledValue(height, subsampling_y_[plane]);
+    const int plane_width =
+        SubsampledValue(upscaled_width, subsampling_x_[plane]);
+    const int plane_unit_size = 1 << loop_restoration_.unit_size_log2[plane];
+    const int plane_process_unit_height =
+        kRestorationUnitHeight >> subsampling_y_[plane];
+    int y = (row4x4_start == 0)
+                ? 0
+                : (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) -
+                      unit_height_offset;
+    int expected_height = plane_process_unit_height -
+                          ((row4x4_start == 0) ? unit_height_offset : 0);
+    int current_process_unit_height;
+    for (int sb_y = 0; sb_y < sb4x4;
+         sb_y += 16, y += current_process_unit_height) {
+      if (y >= plane_height) break;
+      const int unit_row = std::min(
+          (y + unit_height_offset) >> loop_restoration_.unit_size_log2[plane],
+          restoration_info_->num_vertical_units(static_cast<Plane>(plane)) - 1);
+      current_process_unit_height = std::min(expected_height, plane_height - y);
+      expected_height = plane_process_unit_height;
+      ApplyLoopRestorationForOneRow<Pixel>(
+          reinterpret_cast<Pixel*>(superres_buffer_[plane]), stride,
+          static_cast<Plane>(plane), plane_height, plane_width, y, unit_row,
+          current_process_unit_height, plane_unit_size,
+          reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
+              y * stride);
+    }
+  } while (++plane < planes_);
+}
+
+void PostFilter::ApplyLoopRestoration(const int row4x4_start, const int sb4x4) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth_ >= 10) {
+    ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(row4x4_start, sb4x4);
+    return;
+  }
+#endif
+  ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(row4x4_start, sb4x4);
+}
+
+void PostFilter::ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic) {
+  int row4x4;
+  // Loop restoration operates with a lag of 8 rows (4 for chroma with
+  // subsampling), so the last 8 rows of the last superblock row must also be
+  // covered. To accomplish that, this loop runs for one extra iteration.
+  const int row4x4_end = frame_header_.rows4x4 + kNum4x4InLoopRestorationUnit;
+  while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopRestorationUnit,
+                                            std::memory_order_relaxed)) <
+         row4x4_end) {
+    CopyBordersForOneSuperBlockRow(row4x4, kNum4x4InLoopRestorationUnit,
+                                   /*for_loop_restoration=*/true);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth_ >= 10) {
+      ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(
+          row4x4, kNum4x4InLoopRestorationUnit);
+      continue;
+    }
+#endif
+    ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(
+        row4x4, kNum4x4InLoopRestorationUnit);
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter/post_filter.cc b/src/post_filter/post_filter.cc
new file mode 100644 (file)
index 0000000..9745a01
--- /dev/null
@@ -0,0 +1,647 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/post_filter.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/post_filter/deblock_thresholds.inc"
+
+// Row indices of loop restoration border. This is used to populate the
+// |loop_restoration_border_| when either cdef is on or multithreading is
+// enabled. The dimension is subsampling_y.
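+// The row values appear to follow from the filter pipeline: 64 - 8 - 2 = 54
+// (superblock height, minus the 8-row post-filter lag, minus
+// kRestorationVerticalBorder) for luma, and 32 - 4 - 2 = 26 for chroma with
+// vertical subsampling. This derivation is an inference from the surrounding
+// code, not a documented identity.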
+constexpr int kLoopRestorationBorderRows[2] = {54, 26};
+
+}  // namespace
+
+PostFilter::PostFilter(const ObuFrameHeader& frame_header,
+                       const ObuSequenceHeader& sequence_header,
+                       FrameScratchBuffer* const frame_scratch_buffer,
+                       YuvBuffer* const frame_buffer, const dsp::Dsp* dsp,
+                       int do_post_filter_mask)
+    : frame_header_(frame_header),
+      loop_restoration_(frame_header.loop_restoration),
+      dsp_(*dsp),
+      bitdepth_(sequence_header.color_config.bitdepth),
+      subsampling_x_{0, sequence_header.color_config.subsampling_x,
+                     sequence_header.color_config.subsampling_x},
+      subsampling_y_{0, sequence_header.color_config.subsampling_y,
+                     sequence_header.color_config.subsampling_y},
+      planes_(sequence_header.color_config.is_monochrome ? kMaxPlanesMonochrome
+                                                         : kMaxPlanes),
+      pixel_size_log2_(static_cast<int>((bitdepth_ == 8) ? sizeof(uint8_t)
+                                                         : sizeof(uint16_t)) -
+                       1),
+      inner_thresh_(kInnerThresh[frame_header.loop_filter.sharpness]),
+      outer_thresh_(kOuterThresh[frame_header.loop_filter.sharpness]),
+      needs_chroma_deblock_(frame_header.loop_filter.level[kPlaneU + 1] != 0 ||
+                            frame_header.loop_filter.level[kPlaneV + 1] != 0),
+      do_cdef_(DoCdef(frame_header, do_post_filter_mask)),
+      do_deblock_(DoDeblock(frame_header, do_post_filter_mask)),
+      do_restoration_(
+          DoRestoration(loop_restoration_, do_post_filter_mask, planes_)),
+      do_superres_(DoSuperRes(frame_header, do_post_filter_mask)),
+      cdef_index_(frame_scratch_buffer->cdef_index),
+      cdef_skip_(frame_scratch_buffer->cdef_skip),
+      inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+      restoration_info_(&frame_scratch_buffer->loop_restoration_info),
+      superres_coefficients_{
+          frame_scratch_buffer->superres_coefficients[kPlaneTypeY].get(),
+          frame_scratch_buffer
+              ->superres_coefficients
+                  [(sequence_header.color_config.is_monochrome ||
+                    sequence_header.color_config.subsampling_x == 0)
+                       ? kPlaneTypeY
+                       : kPlaneTypeUV]
+              .get()},
+      superres_line_buffer_(frame_scratch_buffer->superres_line_buffer),
+      block_parameters_(frame_scratch_buffer->block_parameters_holder),
+      frame_buffer_(*frame_buffer),
+      cdef_border_(frame_scratch_buffer->cdef_border),
+      loop_restoration_border_(frame_scratch_buffer->loop_restoration_border),
+      thread_pool_(
+          frame_scratch_buffer->threading_strategy.post_filter_thread_pool()) {
+  const int8_t zero_delta_lf[kFrameLfCount] = {};
+  ComputeDeblockFilterLevels(zero_delta_lf, deblock_filter_levels_);
+  if (DoSuperRes()) {
+    int plane = kPlaneY;
+    const int width = frame_header_.width;
+    const int upscaled_width_fh = frame_header_.upscaled_width;
+    do {
+      const int downscaled_width =
+          SubsampledValue(width, subsampling_x_[plane]);
+      const int upscaled_width =
+          SubsampledValue(upscaled_width_fh, subsampling_x_[plane]);
+      const int superres_width = downscaled_width << kSuperResScaleBits;
+      super_res_info_[plane].step =
+          (superres_width + upscaled_width / 2) / upscaled_width;
+      const int error =
+          super_res_info_[plane].step * upscaled_width - superres_width;
+      super_res_info_[plane].initial_subpixel_x =
+          ((-((upscaled_width - downscaled_width) << (kSuperResScaleBits - 1)) +
+            DivideBy2(upscaled_width)) /
+               upscaled_width +
+           (1 << (kSuperResExtraBits - 1)) - error / 2) &
+          kSuperResScaleMask;
+      super_res_info_[plane].upscaled_width = upscaled_width;
+    } while (++plane < planes_);
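+    // Worked example with illustrative numbers (assuming
+    // kSuperResScaleBits == 14 and kSuperResExtraBits == 8): upscaling
+    // 1280 -> 1920 gives superres_width = 1280 << 14 = 20971520,
+    // step = (20971520 + 960) / 1920 = 10923,
+    // error = 10923 * 1920 - 20971520 = 640, and
+    // initial_subpixel_x = ((-(640 << 13) + 960) / 1920 + 128 - 320) & 0x3FFF
+    //                    = 13462.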
+    if (dsp->super_res_coefficients != nullptr) {
+      int plane = kPlaneY;
+      const int number_loops = (superres_coefficients_[kPlaneTypeY] ==
+                                superres_coefficients_[kPlaneTypeUV])
+                                   ? kMaxPlanesMonochrome
+                                   : static_cast<int>(kNumPlaneTypes);
+      do {
+        dsp->super_res_coefficients(super_res_info_[plane].upscaled_width,
+                                    super_res_info_[plane].initial_subpixel_x,
+                                    super_res_info_[plane].step,
+                                    superres_coefficients_[plane]);
+      } while (++plane < number_loops);
+    }
+  }
+  int plane = kPlaneY;
+  do {
+    loop_restoration_buffer_[plane] = frame_buffer_.data(plane);
+    cdef_buffer_[plane] = frame_buffer_.data(plane);
+    superres_buffer_[plane] = frame_buffer_.data(plane);
+    source_buffer_[plane] = frame_buffer_.data(plane);
+  } while (++plane < planes_);
+  if (DoCdef() || DoRestoration() || DoSuperRes()) {
+    plane = kPlaneY;
+    const int pixel_size_log2 = pixel_size_log2_;
+    do {
+      int horizontal_shift = 0;
+      int vertical_shift = 0;
+      if (DoRestoration() &&
+          loop_restoration_.type[plane] != kLoopRestorationTypeNone) {
+        horizontal_shift += frame_buffer_.alignment();
+        if (!DoCdef() && thread_pool_ == nullptr) {
+          vertical_shift += kRestorationVerticalBorder;
+        }
+        superres_buffer_[plane] +=
+            vertical_shift * frame_buffer_.stride(plane) +
+            (horizontal_shift << pixel_size_log2);
+      }
+      if (DoSuperRes()) {
+        vertical_shift += kSuperResVerticalBorder;
+      }
+      cdef_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
+                             (horizontal_shift << pixel_size_log2);
+      if (DoCdef() && thread_pool_ == nullptr) {
+        horizontal_shift += frame_buffer_.alignment();
+        vertical_shift += kCdefBorder;
+      }
+      assert(horizontal_shift <= frame_buffer_.right_border(plane));
+      assert(vertical_shift <= frame_buffer_.bottom_border(plane));
+      source_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
+                               (horizontal_shift << pixel_size_log2);
+    } while (++plane < planes_);
+  }
+}
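+
+// A note on the pointers set up in the constructor: the post filters are
+// applied in place, so |source_buffer_|, |cdef_buffer_|, |superres_buffer_|
+// and |loop_restoration_buffer_| all point into |frame_buffer_|, offset into
+// its borders by the shifts computed above. Each stage reads from the
+// more-offset buffer and writes to the less-offset one, so the final output
+// lands at frame_buffer_.data(plane).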
+
+// The following example illustrates how ExtendFrame() extends a frame.
+// Suppose the frame width is 8 and height is 4, and left, right, top, and
+// bottom are all equal to 3.
+//
+// Before:
+//
+//       ABCDEFGH
+//       IJKLMNOP
+//       QRSTUVWX
+//       YZabcdef
+//
+// After:
+//
+//   AAA|ABCDEFGH|HHH  [3]
+//   AAA|ABCDEFGH|HHH
+//   AAA|ABCDEFGH|HHH
+//   ---+--------+---
+//   AAA|ABCDEFGH|HHH  [1]
+//   III|IJKLMNOP|PPP
+//   QQQ|QRSTUVWX|XXX
+//   YYY|YZabcdef|fff
+//   ---+--------+---
+//   YYY|YZabcdef|fff  [2]
+//   YYY|YZabcdef|fff
+//   YYY|YZabcdef|fff
+//
+// ExtendFrame() first extends the rows to the left and to the right[1]. Then
+// it copies the extended last row to the bottom borders[2]. Finally it copies
+// the extended first row to the top borders[3].
+// static
+template <typename Pixel>
+void PostFilter::ExtendFrame(Pixel* const frame_start, const int width,
+                             const int height, const ptrdiff_t stride,
+                             const int left, const int right, const int top,
+                             const int bottom) {
+  Pixel* src = frame_start;
+  // Copy to left and right borders.
+  int y = height;
+  do {
+    ExtendLine<Pixel>(src, width, left, right);
+    src += stride;
+  } while (--y != 0);
+  // Copy to bottom borders. For performance we copy |stride| pixels
+  // (potentially including some padding pixels) in each row, ending at the
+  // bottom right border pixel. In the diagram the asterisks indicate padding
+  // pixels.
+  //
+  // |<--- stride --->|
+  // **YYY|YZabcdef|fff <-- Copy from the extended last row.
+  // -----+--------+---
+  // **YYY|YZabcdef|fff
+  // **YYY|YZabcdef|fff
+  // **YYY|YZabcdef|fff <-- bottom right border pixel
+  assert(src == frame_start + height * stride);
+  Pixel* dst = src - left;
+  src = dst - stride;
+  for (int y = 0; y < bottom; ++y) {
+    memcpy(dst, src, sizeof(Pixel) * stride);
+    dst += stride;
+  }
+  // Copy to top borders. For performance we copy |stride| pixels (potentially
+  // including some padding pixels) in each row, starting from the top left
+  // border pixel. In the diagram the asterisks indicate padding pixels.
+  //
+  // +-- top left border pixel
+  // |
+  // v
+  // AAA|ABCDEFGH|HHH**
+  // AAA|ABCDEFGH|HHH**
+  // AAA|ABCDEFGH|HHH**
+  // ---+--------+-----
+  // AAA|ABCDEFGH|HHH** <-- Copy from the extended first row.
+  // |<--- stride --->|
+  src = frame_start - left;
+  dst = frame_start - left - top * stride;
+  for (int y = 0; y < top; ++y) {
+    memcpy(dst, src, sizeof(Pixel) * stride);
+    dst += stride;
+  }
+}
+
+template void PostFilter::ExtendFrame<uint8_t>(uint8_t* const frame_start,
+                                               const int width,
+                                               const int height,
+                                               const ptrdiff_t stride,
+                                               const int left, const int right,
+                                               const int top, const int bottom);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void PostFilter::ExtendFrame<uint16_t>(
+    uint16_t* const frame_start, const int width, const int height,
+    const ptrdiff_t stride, const int left, const int right, const int top,
+    const int bottom);
+#endif
+
+void PostFilter::ExtendFrameBoundary(uint8_t* const frame_start,
+                                     const int width, const int height,
+                                     const ptrdiff_t stride, const int left,
+                                     const int right, const int top,
+                                     const int bottom) const {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth_ >= 10) {
+    ExtendFrame<uint16_t>(reinterpret_cast<uint16_t*>(frame_start), width,
+                          height, stride >> 1, left, right, top, bottom);
+    return;
+  }
+#endif
+  ExtendFrame<uint8_t>(frame_start, width, height, stride, left, right, top,
+                       bottom);
+}
+
+void PostFilter::ExtendBordersForReferenceFrame() {
+  if (frame_header_.refresh_frame_flags == 0) return;
+  const int upscaled_width = frame_header_.upscaled_width;
+  const int height = frame_header_.height;
+  int plane = kPlaneY;
+  do {
+    const int plane_width =
+        SubsampledValue(upscaled_width, subsampling_x_[plane]);
+    const int plane_height = SubsampledValue(height, subsampling_y_[plane]);
+    assert(frame_buffer_.left_border(plane) >= kMinLeftBorderPixels &&
+           frame_buffer_.right_border(plane) >= kMinRightBorderPixels &&
+           frame_buffer_.top_border(plane) >= kMinTopBorderPixels &&
+           frame_buffer_.bottom_border(plane) >= kMinBottomBorderPixels);
+    // plane subsampling_x_ left_border
+    //   Y        N/A         64, 48
+    //  U,V        0          64, 48
+    //  U,V        1          32, 16
+    assert(frame_buffer_.left_border(plane) >= 16);
+    // The |left| argument to ExtendFrameBoundary() must be at least
+    // kMinLeftBorderPixels (13) for warp.
+    static_assert(16 >= kMinLeftBorderPixels, "");
+    ExtendFrameBoundary(
+        frame_buffer_.data(plane), plane_width, plane_height,
+        frame_buffer_.stride(plane), frame_buffer_.left_border(plane),
+        frame_buffer_.right_border(plane), frame_buffer_.top_border(plane),
+        frame_buffer_.bottom_border(plane));
+  } while (++plane < planes_);
+}
+
+void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) {
+  const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+  const uint8_t* const src = GetSourceBuffer(plane, row4x4, 0);
+  const int row_offset = DivideBy4(row4x4);
+  const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane);
+  uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * dst_stride;
+  const int num_pixels = SubsampledValue(MultiplyBy4(frame_header_.columns4x4),
+                                         subsampling_x_[plane]);
+  const int row_width = num_pixels << pixel_size_log2_;
+  int last_valid_row = -1;
+  const int plane_height =
+      SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+  int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+  const int absolute_row = (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+  for (int i = 0; i < 4; ++i, ++row) {
+    if (absolute_row + i >= plane_height) {
+      if (last_valid_row == -1) break;
+      // If we run out of rows, copy the last valid row (mimics the bottom
+      // border extension).
+      row = last_valid_row;
+    }
+    memcpy(dst, src + row * src_stride, row_width);
+    last_valid_row = row;
+    dst += dst_stride;
+  }
+}
+
+void PostFilter::CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
+                                                bool for_loop_restoration) {
+  // Number of rows to be subtracted from the start position described by
+  // row4x4. We always lag by 8 rows (to account for in-loop post filters).
+  const int row_offset = (row4x4 == 0) ? 0 : 8;
+  // Number of rows to be subtracted from the height described by sb4x4.
+  const int height_offset = (row4x4 == 0) ? 8 : 0;
+  // If cdef is off and post filter multithreading is off, then loop restoration
+  // needs 2 extra rows for the bottom border in each plane.
+  const int extra_rows =
+      (for_loop_restoration && thread_pool_ == nullptr && !DoCdef()) ? 2 : 0;
+  const int upscaled_width = frame_header_.upscaled_width;
+  const int height = frame_header_.height;
+  int plane = kPlaneY;
+  do {
+    const int plane_width =
+        SubsampledValue(upscaled_width, subsampling_x_[plane]);
+    const int plane_height = SubsampledValue(height, subsampling_y_[plane]);
+    const int row = (MultiplyBy4(row4x4) - row_offset) >> subsampling_y_[plane];
+    assert(row >= 0);
+    if (row >= plane_height) break;
+    const int num_rows =
+        std::min(SubsampledValue(MultiplyBy4(sb4x4) - height_offset,
+                                 subsampling_y_[plane]) +
+                     extra_rows,
+                 plane_height - row);
+    // We only need to track the progress of the Y plane; the progress of the
+    // U and V planes is inferred from it.
+    if (!for_loop_restoration && plane == kPlaneY) {
+      progress_row_ = row + num_rows;
+    }
+    const bool copy_bottom = row + num_rows == plane_height;
+    const ptrdiff_t stride = frame_buffer_.stride(plane);
+    uint8_t* const start = (for_loop_restoration ? superres_buffer_[plane]
+                                                 : frame_buffer_.data(plane)) +
+                           row * stride;
+#if LIBGAV1_MSAN
+    const int right_padding =
+        (frame_buffer_.stride(plane) >> static_cast<int>(bitdepth_ > 8)) -
+        ((frame_buffer_.left_border(plane) + frame_buffer_.width(plane) +
+          frame_buffer_.right_border(plane)));
+    const int padded_right_border_size =
+        frame_buffer_.right_border(plane) + right_padding;
+    // The optimized loop restoration code may read into the next row's left
+    // border depending on the start of the last superblock and the size of the
+    // right border. This is safe as the post filter is applied after
+    // reconstruction is complete and the threaded implementations do not read
+    // from the left border.
+    const int left_border_overread =
+        (for_loop_restoration && padded_right_border_size < 64)
+            ? 63 - padded_right_border_size
+            : 0;
+    assert(!for_loop_restoration || left_border_overread == 0 ||
+           (frame_buffer_.bottom_border(plane) > 0 &&
+            left_border_overread <= frame_buffer_.left_border(plane)));
+    const int left_border = (for_loop_restoration && left_border_overread == 0)
+                                ? kRestorationHorizontalBorder
+                                : frame_buffer_.left_border(plane);
+    // The optimized loop restoration code will overread the visible frame
+    // buffer into the right border. Extend the right boundary further to
+    // prevent msan warnings.
+    const int right_border = for_loop_restoration
+                                 ? std::min(padded_right_border_size, 63)
+                                 : frame_buffer_.right_border(plane);
+#else
+    const int left_border = for_loop_restoration
+                                ? kRestorationHorizontalBorder
+                                : frame_buffer_.left_border(plane);
+    const int right_border = for_loop_restoration
+                                 ? kRestorationHorizontalBorder
+                                 : frame_buffer_.right_border(plane);
+#endif
+    const int top_border =
+        (row == 0) ? (for_loop_restoration ? kRestorationVerticalBorder
+                                           : frame_buffer_.top_border(plane))
+                   : 0;
+    const int bottom_border =
+        copy_bottom
+            ? (for_loop_restoration ? kRestorationVerticalBorder
+                                    : frame_buffer_.bottom_border(plane))
+            : 0;
+    ExtendFrameBoundary(start, plane_width, num_rows, stride, left_border,
+                        right_border, top_border, bottom_border);
+  } while (++plane < planes_);
+}
+
+void PostFilter::SetupLoopRestorationBorder(const int row4x4) {
+  assert(row4x4 >= 0);
+  assert(!DoCdef());
+  assert(DoRestoration());
+  const int upscaled_width = frame_header_.upscaled_width;
+  const int height = frame_header_.height;
+  int plane = kPlaneY;
+  do {
+    if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+      continue;
+    }
+    const int row_offset = DivideBy4(row4x4);
+    const int num_pixels =
+        SubsampledValue(upscaled_width, subsampling_x_[plane]);
+    const int row_width = num_pixels << pixel_size_log2_;
+    const int plane_height = SubsampledValue(height, subsampling_y_[plane]);
+    const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+    const int absolute_row =
+        (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+    const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+    const uint8_t* src =
+        GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) +
+        row * src_stride;
+    const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane);
+    uint8_t* dst =
+        loop_restoration_border_.data(plane) + row_offset * dst_stride;
+    for (int i = 0; i < 4; ++i) {
+      memcpy(dst, src, row_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      if (bitdepth_ >= 10) {
+        ExtendLine<uint16_t>(dst, num_pixels, kRestorationHorizontalBorder,
+                             kRestorationHorizontalBorder);
+      } else  // NOLINT.
+#endif
+        ExtendLine<uint8_t>(dst, num_pixels, kRestorationHorizontalBorder,
+                            kRestorationHorizontalBorder);
+      // If we run out of rows, copy the last valid row (mimics the bottom
+      // border extension).
+      if (absolute_row + i < plane_height - 1) src += src_stride;
+      dst += dst_stride;
+    }
+  } while (++plane < planes_);
+}
+
+void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) {
+  assert(row4x4_start >= 0);
+  assert(DoCdef());
+  assert(DoRestoration());
+  for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
+    const int row4x4 = row4x4_start + sb_y;
+    const int row_offset_start = DivideBy4(row4x4);
+    const std::array<uint8_t*, kMaxPlanes> dst = {
+        loop_restoration_border_.data(kPlaneY) +
+            row_offset_start * static_cast<ptrdiff_t>(
+                                   loop_restoration_border_.stride(kPlaneY)),
+        loop_restoration_border_.data(kPlaneU) +
+            row_offset_start * static_cast<ptrdiff_t>(
+                                   loop_restoration_border_.stride(kPlaneU)),
+        loop_restoration_border_.data(kPlaneV) +
+            row_offset_start * static_cast<ptrdiff_t>(
+                                   loop_restoration_border_.stride(kPlaneV))};
+    // If SuperRes is enabled, apply SuperRes to the rows to be copied,
+    // writing directly into |loop_restoration_border_|. Otherwise, simply
+    // copy the rows.
+    if (DoSuperRes()) {
+      std::array<uint8_t*, kMaxPlanes> src;
+      std::array<int, kMaxPlanes> rows;
+      const int height = frame_header_.height;
+      int plane = kPlaneY;
+      do {
+        if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+          rows[plane] = 0;
+          continue;
+        }
+        const int plane_height = SubsampledValue(height, subsampling_y_[plane]);
+        const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+        const int absolute_row =
+            (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+        src[plane] = GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+                     row * static_cast<ptrdiff_t>(frame_buffer_.stride(plane));
+        rows[plane] = Clip3(plane_height - absolute_row, 0, 4);
+      } while (++plane < planes_);
+      ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst,
+                    /*dst_is_loop_restoration_border=*/true);
+      // If we run out of rows, copy the last valid row (mimics the bottom
+      // border extension).
+      plane = kPlaneY;
+      do {
+        if (rows[plane] == 0 || rows[plane] >= 4) continue;
+        const ptrdiff_t stride = loop_restoration_border_.stride(plane);
+        uint8_t* dst_line = dst[plane] + rows[plane] * stride;
+        const uint8_t* const src_line = dst_line - stride;
+        const int upscaled_width = super_res_info_[plane].upscaled_width
+                                   << pixel_size_log2_;
+        for (int i = rows[plane]; i < 4; ++i) {
+          memcpy(dst_line, src_line, upscaled_width);
+          dst_line += stride;
+        }
+      } while (++plane < planes_);
+    } else {
+      int plane = kPlaneY;
+      do {
+        CopyDeblockedPixels(static_cast<Plane>(plane), row4x4);
+      } while (++plane < planes_);
+    }
+    // Extend the left and right boundaries needed for loop restoration.
+    const int upscaled_width = frame_header_.upscaled_width;
+    int plane = kPlaneY;
+    do {
+      if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+        continue;
+      }
+      uint8_t* dst_line = dst[plane];
+      const int plane_width =
+          SubsampledValue(upscaled_width, subsampling_x_[plane]);
+      for (int i = 0; i < 4; ++i) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+        if (bitdepth_ >= 10) {
+          ExtendLine<uint16_t>(dst_line, plane_width,
+                               kRestorationHorizontalBorder,
+                               kRestorationHorizontalBorder);
+        } else  // NOLINT.
+#endif
+        {
+          ExtendLine<uint8_t>(dst_line, plane_width,
+                              kRestorationHorizontalBorder,
+                              kRestorationHorizontalBorder);
+        }
+        dst_line += loop_restoration_border_.stride(plane);
+      }
+    } while (++plane < planes_);
+  }
+}
+
+void PostFilter::RunJobs(WorkerFunction worker) {
+  std::atomic<int> row4x4(0);
+  const int num_workers = thread_pool_->num_threads();
+  BlockingCounter pending_workers(num_workers);
+  for (int i = 0; i < num_workers; ++i) {
+    thread_pool_->Schedule([this, &row4x4, &pending_workers, worker]() {
+      (this->*worker)(&row4x4);
+      pending_workers.Decrement();
+    });
+  }
+  // Run the jobs on the current thread.
+  (this->*worker)(&row4x4);
+  // Wait for the threadpool jobs to finish.
+  pending_workers.Wait();
+}
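+
+// Note on RunJobs(): work is handed out dynamically. Every worker (the pool
+// threads plus the calling thread) claims the next superblock row from the
+// shared atomic counter (see, e.g., the fetch_add() in
+// ApplyLoopRestorationWorker), so rows are load-balanced rather than
+// statically partitioned.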
+
+void PostFilter::ApplyFilteringThreaded() {
+  if (DoDeblock()) {
+    RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>);
+    RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>);
+  }
+  if (DoCdef() && DoRestoration()) {
+    for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+         row4x4 += kNum4x4InLoopFilterUnit) {
+      SetupLoopRestorationBorder(row4x4, kNum4x4InLoopFilterUnit);
+    }
+  }
+  if (DoCdef()) {
+    for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+         row4x4 += kNum4x4InLoopFilterUnit) {
+      SetupCdefBorder(row4x4);
+    }
+    RunJobs(&PostFilter::ApplyCdefWorker);
+  }
+  if (DoSuperRes()) ApplySuperResThreaded();
+  if (DoRestoration()) {
+    if (!DoCdef()) {
+      int row4x4 = 0;
+      do {
+        SetupLoopRestorationBorder(row4x4);
+        row4x4 += kNum4x4InLoopFilterUnit;
+      } while (row4x4 < frame_header_.rows4x4);
+    }
+    RunJobs(&PostFilter::ApplyLoopRestorationWorker);
+  }
+  ExtendBordersForReferenceFrame();
+}
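+
+// The stage ordering above mirrors the in-loop filter pipeline: deblocking
+// (a vertical pass, then a horizontal pass), saving of loop restoration
+// borders, cdef, SuperRes, loop restoration, and finally border extension for
+// use as a reference frame. Each stage consumes the previous stage's output
+// through the shifted buffer pointers set up in the constructor.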
+
+int PostFilter::ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4,
+                                                  bool is_last_row,
+                                                  bool do_deblock) {
+  if (row4x4 < 0) return -1;
+  if (DoDeblock() && do_deblock) {
+    VerticalDeblockFilter(row4x4, row4x4 + sb4x4, 0, frame_header_.columns4x4);
+    HorizontalDeblockFilter(row4x4, row4x4 + sb4x4, 0,
+                            frame_header_.columns4x4);
+  }
+  if (DoRestoration() && DoCdef()) {
+    SetupLoopRestorationBorder(row4x4, sb4x4);
+  }
+  if (DoCdef()) {
+    ApplyCdefForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+  }
+  if (DoSuperRes()) {
+    ApplySuperResForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+  }
+  if (DoRestoration()) {
+    CopyBordersForOneSuperBlockRow(row4x4, sb4x4, true);
+    ApplyLoopRestoration(row4x4, sb4x4);
+    if (is_last_row) {
+      // Loop restoration operates with a lag of 8 rows. So make sure to cover
+      // all the rows of the last superblock row.
+      CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, true);
+      ApplyLoopRestoration(row4x4 + sb4x4, 16);
+    }
+  }
+  if (frame_header_.refresh_frame_flags != 0 && DoBorderExtensionInLoop()) {
+    CopyBordersForOneSuperBlockRow(row4x4, sb4x4, false);
+    if (is_last_row) {
+      CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, false);
+    }
+  }
+  if (is_last_row && !DoBorderExtensionInLoop()) {
+    ExtendBordersForReferenceFrame();
+  }
+  return is_last_row ? frame_header_.height : progress_row_;
+}
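+
+// The value returned above is the number of frame rows for which post
+// filtering is complete: the full frame height once the last superblock row
+// is done, and otherwise |progress_row_| as updated in
+// CopyBordersForOneSuperBlockRow(). A negative |row4x4| returns -1.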
+
+}  // namespace libgav1
diff --git a/src/post_filter/super_res.cc b/src/post_filter/super_res.cc
new file mode 100644 (file)
index 0000000..2133a8a
--- /dev/null
@@ -0,0 +1,212 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+
+namespace libgav1 {
+
+void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
+                               const std::array<int, kMaxPlanes>& rows,
+                               const int line_buffer_row,
+                               const std::array<uint8_t*, kMaxPlanes>& dst,
+                               bool dst_is_loop_restoration_border /*=false*/) {
+  int plane = kPlaneY;
+  do {
+    const int plane_width =
+        MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth_ >= 10) {
+      auto* input = reinterpret_cast<uint16_t*>(src[plane]);
+      auto* output = reinterpret_cast<uint16_t*>(dst[plane]);
+      const ptrdiff_t input_stride =
+          frame_buffer_.stride(plane) / sizeof(uint16_t);
+      const ptrdiff_t output_stride =
+          (dst_is_loop_restoration_border
+               ? loop_restoration_border_.stride(plane)
+               : frame_buffer_.stride(plane)) /
+          sizeof(uint16_t);
+      if (rows[plane] > 0) {
+        dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+                       input, input_stride, rows[plane], plane_width,
+                       super_res_info_[plane].upscaled_width,
+                       super_res_info_[plane].initial_subpixel_x,
+                       super_res_info_[plane].step, output, output_stride);
+      }
+      // In the multi-threaded case, the |superres_line_buffer_| holds the last
+      // input row. Apply SuperRes for that row.
+      if (line_buffer_row >= 0) {
+        auto* const line_buffer_start =
+            reinterpret_cast<uint16_t*>(superres_line_buffer_.data(plane)) +
+            line_buffer_row * superres_line_buffer_.stride(plane) /
+                sizeof(uint16_t) +
+            kSuperResHorizontalBorder;
+        dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+                       line_buffer_start, /*source_stride=*/0,
+                       /*height=*/1, plane_width,
+                       super_res_info_[plane].upscaled_width,
+                       super_res_info_[plane].initial_subpixel_x,
+                       super_res_info_[plane].step,
+                       output + rows[plane] * output_stride, /*dest_stride=*/0);
+      }
+      continue;
+    }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    uint8_t* input = src[plane];
+    uint8_t* output = dst[plane];
+    const ptrdiff_t input_stride = frame_buffer_.stride(plane);
+    const ptrdiff_t output_stride = dst_is_loop_restoration_border
+                                        ? loop_restoration_border_.stride(plane)
+                                        : frame_buffer_.stride(plane);
+    if (rows[plane] > 0) {
+      dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+                     input, input_stride, rows[plane], plane_width,
+                     super_res_info_[plane].upscaled_width,
+                     super_res_info_[plane].initial_subpixel_x,
+                     super_res_info_[plane].step, output, output_stride);
+    }
+    // In the multi-threaded case, the |superres_line_buffer_| holds the last
+    // input row. Apply SuperRes for that row.
+    if (line_buffer_row >= 0) {
+      uint8_t* const line_buffer_start =
+          superres_line_buffer_.data(plane) +
+          line_buffer_row * superres_line_buffer_.stride(plane) +
+          kSuperResHorizontalBorder;
+      dsp_.super_res(
+          superres_coefficients_[static_cast<int>(plane != 0)],
+          line_buffer_start, /*source_stride=*/0,
+          /*height=*/1, plane_width, super_res_info_[plane].upscaled_width,
+          super_res_info_[plane].initial_subpixel_x,
+          super_res_info_[plane].step, output + rows[plane] * output_stride,
+          /*dest_stride=*/0);
+    }
+  } while (++plane < planes_);
+}
+
+void PostFilter::ApplySuperResForOneSuperBlockRow(int row4x4_start, int sb4x4,
+                                                  bool is_last_row) {
+  assert(row4x4_start >= 0);
+  assert(DoSuperRes());
+  // If not doing cdef, then LR needs two rows of border with superres applied.
+  const int num_rows_extra = (DoCdef() || !DoRestoration()) ? 0 : 2;
+  std::array<uint8_t*, kMaxPlanes> src;
+  std::array<uint8_t*, kMaxPlanes> dst;
+  std::array<int, kMaxPlanes> rows;
+  const int num_rows4x4 =
+      std::min(sb4x4, frame_header_.rows4x4 - row4x4_start) -
+      (is_last_row ? 0 : 2);
+  if (row4x4_start > 0) {
+    const int row4x4 = row4x4_start - 2;
+    int plane = kPlaneY;
+    do {
+      const int row =
+          (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + num_rows_extra;
+      const ptrdiff_t row_offset = row * frame_buffer_.stride(plane);
+      src[plane] = cdef_buffer_[plane] + row_offset;
+      dst[plane] = superres_buffer_[plane] + row_offset;
+      // Note that the |num_rows_extra| subtraction is done after the value is
+      // subsampled since we always need to work on |num_rows_extra| extra rows
+      // irrespective of the plane subsampling.
+      // Apply superres for the last 8-|num_rows_extra| rows of the previous
+      // superblock.
+      rows[plane] = (8 >> subsampling_y_[plane]) - num_rows_extra;
+      // Apply superres for the current superblock row (except for the last
+      // 8-|num_rows_extra| rows).
+      rows[plane] += (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+                     (is_last_row ? 0 : num_rows_extra);
+    } while (++plane < planes_);
+  } else {
+    // Apply superres for the current superblock row (except for the last
+    // 8-|num_rows_extra| rows).
+    int plane = kPlaneY;
+    do {
+      const ptrdiff_t row_offset =
+          (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) *
+          frame_buffer_.stride(plane);
+      src[plane] = cdef_buffer_[plane] + row_offset;
+      dst[plane] = superres_buffer_[plane] + row_offset;
+      // Note that the |num_rows_extra| addition is done after the value is
+      // subsampled since we always need to work on |num_rows_extra| extra rows
+      // irrespective of the plane subsampling.
+      rows[plane] = (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+                    (is_last_row ? 0 : num_rows_extra);
+    } while (++plane < planes_);
+  }
+  ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
+}
+
+void PostFilter::ApplySuperResThreaded() {
+  int num_threads = thread_pool_->num_threads() + 1;
+  // The number of rows that will be processed by each thread in the thread pool
+  // (other than the current thread).
+  int thread_pool_rows = frame_header_.height / num_threads;
+  thread_pool_rows = std::max(thread_pool_rows, 1);
+  // Make rows of Y plane even when there is subsampling for the other planes.
+  if ((thread_pool_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+    ++thread_pool_rows;
+  }
+  // Adjust the number of threads to what we really need.
+  num_threads = Clip3(frame_header_.height / thread_pool_rows, 1, num_threads);
+  // For the current thread, we round up to process all the remaining rows.
+  int current_thread_rows =
+      frame_header_.height - thread_pool_rows * (num_threads - 1);
+  // Make rows of Y plane even when there is subsampling for the other planes.
+  if ((current_thread_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+    ++current_thread_rows;
+  }
+  assert(current_thread_rows > 0);
+  BlockingCounter pending_workers(num_threads - 1);
+  for (int line_buffer_row = 0, row_start = 0; line_buffer_row < num_threads;
+       ++line_buffer_row, row_start += thread_pool_rows) {
+    std::array<uint8_t*, kMaxPlanes> src;
+    std::array<uint8_t*, kMaxPlanes> dst;
+    std::array<int, kMaxPlanes> rows;
+    int plane = kPlaneY;
+    const int pixel_size_log2 = pixel_size_log2_;
+    do {
+      src[plane] =
+          GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+                          static_cast<Plane>(plane), row_start, 0);
+      dst[plane] =
+          GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+                          static_cast<Plane>(plane), row_start, 0);
+      rows[plane] =
+          (((line_buffer_row < num_threads - 1) ? thread_pool_rows
+                                                : current_thread_rows) >>
+           subsampling_y_[plane]) -
+          1;
+      const int plane_width =
+          MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
+      uint8_t* const input =
+          src[plane] + rows[plane] * frame_buffer_.stride(plane);
+      uint8_t* const line_buffer_start =
+          superres_line_buffer_.data(plane) +
+          line_buffer_row * superres_line_buffer_.stride(plane) +
+          (kSuperResHorizontalBorder << pixel_size_log2);
+      memcpy(line_buffer_start, input, plane_width << pixel_size_log2);
+    } while (++plane < planes_);
+    if (line_buffer_row < num_threads - 1) {
+      thread_pool_->Schedule(
+          [this, src, rows, line_buffer_row, dst, &pending_workers]() {
+            ApplySuperRes(src, rows, line_buffer_row, dst);
+            pending_workers.Decrement();
+          });
+    } else {
+      ApplySuperRes(src, rows, line_buffer_row, dst);
+    }
+  }
+  // Wait for the threadpool jobs to finish.
+  pending_workers.Wait();
+}
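+
+// Why the line buffer above is needed (an inference from the buffer setup in
+// the PostFilter constructor): SuperRes is applied in place, with
+// |superres_buffer_| offset |kSuperResVerticalBorder| rows above
+// |cdef_buffer_|, so a thread writing the first output rows of its chunk
+// clobbers the last input row of the chunk above it. Snapshotting each
+// chunk's last input row into |superres_line_buffer_| before scheduling lets
+// all chunks be upscaled concurrently.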
+
+}  // namespace libgav1
diff --git a/src/post_filter_test.cc b/src/post_filter_test.cc
new file mode 100644 (file)
index 0000000..034d31f
--- /dev/null
@@ -0,0 +1,1080 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/post_filter.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/cdef.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/super_res.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/obu_parser.h"
+#include "src/threading_strategy.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr char kCdef[] = "Cdef";
+constexpr char kApplyCdefName[] = "ApplyCdef";
+constexpr int kMaxBlockWidth4x4 = 32;
+constexpr int kMaxBlockHeight4x4 = 32;
+constexpr int kMaxTestFrameSize = 1920 * 1080;
+
+int GetIdFromInputParam(int subsampling_x, int subsampling_y, int height) {
+  int id = subsampling_x * 8 + subsampling_y * 4;
+  if (height == 288) {
+    id += 0;
+  } else if (height == 480) {
+    id += 1;
+  } else if (height == 1080) {
+    id += 2;
+  } else {
+    id += 3;
+  }
+  return id;
+}
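+
+// The id computed above packs the test configuration as
+// 8 * subsampling_x + 4 * subsampling_y + a height bucket (288 -> 0,
+// 480 -> 1, 1080 -> 2, anything else -> 3); it indexes the digest tables
+// below.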
+
+const char* GetSuperResDigest8bpp(int id, int plane) {
+  static const char* const kDigestSuperRes[][kMaxPlanes] = {
+      {
+          // all input is 0.
+          "ff5f7a63d3b1f9176e216eb01a0387ad",  // kPlaneY.
+          "38b6551d7ac3e86c8af407d5a1aa36dc",  // kPlaneU.
+          "38b6551d7ac3e86c8af407d5a1aa36dc",  // kPlaneV.
+      },
+      {
+          // all input is 1.
+          "819f21dcce0e779180bbd613a9e3543c",  // kPlaneY.
+          "e784bfa8f517d83b014c3dcd45b780a5",  // kPlaneU.
+          "e784bfa8f517d83b014c3dcd45b780a5",  // kPlaneV.
+      },
+      {
+          // all input is 128.
+          "2d6ea5b39f9168d56c2e2b8846d208ec",  // kPlaneY.
+          "8030b6e70f1544efbc37b902d3f88bd3",  // kPlaneU.
+          "8030b6e70f1544efbc37b902d3f88bd3",  // kPlaneV.
+      },
+      {
+          // all input is 255.
+          "5c0b4bc50e0980dc6ba7c042d3b50a5e",  // kPlaneY.
+          "3c566ef847c45be09ddac297123a3bad",  // kPlaneU.
+          "3c566ef847c45be09ddac297123a3bad",  // kPlaneV.
+      },
+      {
+          // random input.
+          "50514467dd6a5c3a8268eddaa542c41f",  // kPlaneY.
+          "3ce720c2b5b44928e1477b11040e5c00",  // kPlaneU.
+          "3ce720c2b5b44928e1477b11040e5c00",  // kPlaneV.
+      },
+  };
+  return kDigestSuperRes[id][plane];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetSuperResDigest10bpp(int id, int plane) {
+  // Digests are in Y/U/V order.
+  static const char* const kDigestSuperRes[][kMaxPlanes] = {
+      {
+          // all input is 0.
+          "fccb1f57b252b1a86d335aea929d1d58",
+          "2f244a56091c9705794e92e6bcc38058",
+          "2f244a56091c9705794e92e6bcc38058",
+      },
+      {
+          // all input is 1.
+          "de8556204999d6e4bf74cfdde61a095b",
+          "e7d0f4ce6df81c46de95da7790a67384",
+          "e7d0f4ce6df81c46de95da7790a67384",
+      },
+      {
+          // all input is 512.
+          "d3b6980363eb9b808885537b3485af87",
+          "bcffddb26210da6861e7b31414e58b77",
+          "bcffddb26210da6861e7b31414e58b77",
+      },
+      {
+          // all input is 1023.
+          "ce0762aeee1cdef1db101e4ca39bcbd6",
+          "33aeaa7f5d7c032e3dfda43925c3dcb2",
+          "33aeaa7f5d7c032e3dfda43925c3dcb2",
+      },
+      {
+          // random input.
+          "63c701bceb187ffa535be15ae58f8171",
+          "f570e30e9ea8d2a1e6d99202cd2f8994",
+          "f570e30e9ea8d2a1e6d99202cd2f8994",
+      },
+  };
+  return kDigestSuperRes[id][plane];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetSuperResDigest12bpp(int id, int plane) {
+  // Digests are in Y/U/V order.
+  static const char* const kDigestSuperRes[][kMaxPlanes] = {
+      {
+          // all input is 0.
+          "fccb1f57b252b1a86d335aea929d1d58",
+          "2f244a56091c9705794e92e6bcc38058",
+          "2f244a56091c9705794e92e6bcc38058",
+      },
+      {
+          // all input is 1.
+          "de8556204999d6e4bf74cfdde61a095b",
+          "e7d0f4ce6df81c46de95da7790a67384",
+          "e7d0f4ce6df81c46de95da7790a67384",
+      },
+      {
+          // all input is 2048.
+          "83d600a7b3dc9bc3f710668ee2244e6b",
+          "468eec1453edc1befeb8a346f61950a7",
+          "468eec1453edc1befeb8a346f61950a7",
+      },
+      {
+          // all input is 4095.
+          "30bdb1dfee2b02b12b38e6b9f6287e27",
+          "34d673f075d2caa93a2f648ee3569e20",
+          "34d673f075d2caa93a2f648ee3569e20",
+      },
+      {
+          // random input.
+          "f10f21f5322231d991550fce7ef9787d",
+          "a2d8b6140bd5002e86644ef433b8eb42",
+          "a2d8b6140bd5002e86644ef433b8eb42",
+      },
+  };
+  return kDigestSuperRes[id][plane];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+// This type is used to parameterize the tests, so it is defined outside the
+// anonymous namespace to avoid the GCC -Wsubobject-linkage warning.
+struct FrameSizeParam {
+  FrameSizeParam(uint32_t width, uint32_t upscaled_width, uint32_t height,
+                 int8_t ss_x, int8_t ss_y)
+      : width(width),
+        upscaled_width(upscaled_width),
+        height(height),
+        subsampling_x(ss_x),
+        subsampling_y(ss_y) {}
+  uint32_t width;
+  uint32_t upscaled_width;
+  uint32_t height;
+  int8_t subsampling_x;
+  int8_t subsampling_y;
+};
+
+// Print operators must be defined in the same namespace as the type for the
+// lookup to work correctly.
+static std::ostream& operator<<(std::ostream& os, const FrameSizeParam& param) {
+  return os << param.width << "x" << param.height
+            << ", upscaled_width: " << param.upscaled_width
+            << ", subsampling(x/y): " << static_cast<int>(param.subsampling_x)
+            << "/" << static_cast<int>(param.subsampling_y);
+}
+
+// Note that the following test classes access private functions/members of
+// PostFilter. To be declared friends of PostFilter, they must not have
+// internal linkage (i.e., they must be outside the anonymous namespace).
+template <int bitdepth, typename Pixel>
+class PostFilterTestBase : public testing::TestWithParam<FrameSizeParam> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  PostFilterTestBase() = default;
+  PostFilterTestBase(const PostFilterTestBase&) = delete;
+  PostFilterTestBase& operator=(const PostFilterTestBase&) = delete;
+  ~PostFilterTestBase() override = default;
+
+  void SetUp() override {
+    // Allocate buffer_ with a border size of kBorderPixels (which is
+    // subsampled for chroma planes). Some tests (for loop restoration) only use
+    // the nearest 2 or 3 pixels (for both luma and chroma planes) in the
+    // border.
+    ASSERT_TRUE(buffer_.Realloc(
+        bitdepth, /*is_monochrome=*/false, frame_size_.upscaled_width,
+        frame_size_.height, frame_size_.subsampling_x,
+        frame_size_.subsampling_y, kBorderPixels, kBorderPixels, kBorderPixels,
+        kBorderPixels, nullptr, nullptr, nullptr));
+
+    ASSERT_TRUE(loop_restoration_border_.Realloc(
+        bitdepth, /*is_monochrome=*/false, frame_size_.upscaled_width,
+        frame_size_.height, frame_size_.subsampling_x,
+        frame_size_.subsampling_y, kBorderPixels, kBorderPixels, kBorderPixels,
+        kBorderPixels, nullptr, nullptr, nullptr));
+
+    for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+      const int8_t subsampling_x =
+          (plane == kPlaneY) ? 0 : frame_size_.subsampling_x;
+      const int8_t subsampling_y =
+          (plane == kPlaneY) ? 0 : frame_size_.subsampling_y;
+      width_[plane] = frame_size_.width >> subsampling_x;
+      upscaled_width_[plane] = frame_size_.upscaled_width >> subsampling_x;
+      stride_[plane] =
+          (frame_size_.upscaled_width + 2 * kBorderPixels) >> subsampling_x;
+      height_[plane] =
+          (frame_size_.height + 2 * kBorderPixels) >> subsampling_y;
+
+      reference_buffer_[plane].reserve(stride_[plane] * height_[plane]);
+      reference_buffer_[plane].resize(stride_[plane] * height_[plane]);
+      std::fill(reference_buffer_[plane].begin(),
+                reference_buffer_[plane].end(), 0);
+    }
+  }
+
+ protected:
+  YuvBuffer buffer_;
+  YuvBuffer cdef_border_;
+  YuvBuffer loop_restoration_border_;
+  uint32_t width_[kMaxPlanes];
+  uint32_t upscaled_width_[kMaxPlanes];
+  uint32_t stride_[kMaxPlanes];
+  uint32_t height_[kMaxPlanes];
+  std::vector<Pixel> reference_buffer_[kMaxPlanes];
+  const FrameSizeParam frame_size_ = GetParam();
+};
+
+template <int bitdepth, typename Pixel>
+class PostFilterHelperFuncTest : public PostFilterTestBase<bitdepth, Pixel> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  PostFilterHelperFuncTest() = default;
+  PostFilterHelperFuncTest(const PostFilterHelperFuncTest&) = delete;
+  PostFilterHelperFuncTest& operator=(const PostFilterHelperFuncTest&) = delete;
+  ~PostFilterHelperFuncTest() override = default;
+
+ protected:
+  using PostFilterTestBase<bitdepth, Pixel>::buffer_;
+  using PostFilterTestBase<bitdepth, Pixel>::cdef_border_;
+  using PostFilterTestBase<bitdepth, Pixel>::loop_restoration_border_;
+  using PostFilterTestBase<bitdepth, Pixel>::width_;
+  using PostFilterTestBase<bitdepth, Pixel>::upscaled_width_;
+  using PostFilterTestBase<bitdepth, Pixel>::stride_;
+  using PostFilterTestBase<bitdepth, Pixel>::height_;
+  using PostFilterTestBase<bitdepth, Pixel>::reference_buffer_;
+  using PostFilterTestBase<bitdepth, Pixel>::frame_size_;
+
+  void SetUp() override {
+    PostFilterTestBase<bitdepth, Pixel>::SetUp();
+
+    for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+      const int8_t subsampling_x =
+          (plane == kPlaneY) ? 0 : frame_size_.subsampling_x;
+      const int8_t subsampling_y =
+          (plane == kPlaneY) ? 0 : frame_size_.subsampling_y;
+      width_[plane] = frame_size_.width >> subsampling_x;
+      upscaled_width_[plane] = frame_size_.upscaled_width >> subsampling_x;
+      stride_[plane] = (frame_size_.upscaled_width >> subsampling_x) +
+                       2 * kRestorationHorizontalBorder;
+      height_[plane] = (frame_size_.height >> subsampling_y) +
+                       2 * kRestorationVerticalBorder;
+      reference_buffer_[plane].reserve(stride_[plane] * height_[plane]);
+      reference_buffer_[plane].resize(stride_[plane] * height_[plane]);
+      std::fill(reference_buffer_[plane].begin(),
+                reference_buffer_[plane].end(), 0);
+      buffer_border_corner_[plane] =
+          reinterpret_cast<Pixel*>(buffer_.data(plane)) -
+          buffer_.stride(plane) / sizeof(Pixel) * kRestorationVerticalBorder -
+          kRestorationHorizontalBorder;
+      loop_restoration_border_corner_[plane] =
+          reinterpret_cast<Pixel*>(loop_restoration_border_.data(plane)) -
+          loop_restoration_border_.stride(plane) / sizeof(Pixel) *
+              kRestorationVerticalBorder -
+          kRestorationHorizontalBorder;
+    }
+  }
+
+  void TestExtendFrame(bool use_fixed_values, Pixel value);
+  void TestAdjustFrameBufferPointer();
+  void TestPrepareLoopRestorationBlock();
+
+  // Fill the frame buffer with either a fixed value or random values.
+  // When filling with random values, the buffer boundaries need special
+  // handling: the outermost 3-pixel-wide borders take the same value as
+  // their immediate inner neighbor. For example:
+  // 4 4 4   4 5 6   6 6 6
+  // 4 4 4   4 5 6   6 6 6
+  // 4 4 4   4 5 6   6 6 6
+  //       ---------
+  // 4 4 4 | 4 5 6 | 6 6 6
+  // 1 1 1 | 1 0 1 | 1 1 1
+  // 0 0 0 | 0 1 0 | 0 0 0
+  // 1 1 1 | 1 0 1 | 1 1 1
+  // 0 0 0 | 0 1 0 | 0 0 0
+  // 6 6 6 | 6 5 4 | 4 4 4
+  //        -------
+  // 6 6 6   6 5 4   4 4 4
+  // 6 6 6   6 5 4   4 4 4
+  // 6 6 6   6 5 4   4 4 4
+  // Pixels within the box are the current block; outside is the area extended
+  // from it.
+  void FillBuffer(bool use_fixed_values, Pixel value);
+
+  // Points to the upper left corner of the restoration border in buffer_.
+  Pixel* buffer_border_corner_[kMaxPlanes];
+  // Points to the upper left corner of the restoration border in
+  // loop_restoration_border_.
+  Pixel* loop_restoration_border_corner_[kMaxPlanes];
+};
+
+template <int bitdepth, typename Pixel>
+void PostFilterHelperFuncTest<bitdepth, Pixel>::FillBuffer(
+    bool use_fixed_values, Pixel value) {
+  if (use_fixed_values) {
+    for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+      // Fill buffer with a fixed value.
+      std::fill(reference_buffer_[plane].begin(),
+                reference_buffer_[plane].end(), value);
+      // Fill frame buffer. Note that the border is not filled.
+      auto* row = reinterpret_cast<Pixel*>(buffer_.data(plane));
+      for (int i = 0; i < buffer_.height(plane); ++i) {
+        std::fill(row, row + width_[plane], value);
+        row += buffer_.stride(plane) / sizeof(Pixel);
+      }
+    }
+  } else {  // Random value.
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    const int mask = (1 << bitdepth) - 1;
+    for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+      // Fill buffer with random values.
+      std::vector<Pixel> line_buffer(stride_[plane]);
+      std::fill(line_buffer.begin(), line_buffer.end(), 0);
+      for (int i = kRestorationHorizontalBorder;
+           i < stride_[plane] - kRestorationHorizontalBorder; ++i) {
+        line_buffer[i] = rnd.Rand16() & mask;
+      }
+      // Copy boundary values to extended border.
+      for (int i = 0; i < kRestorationHorizontalBorder; ++i) {
+        line_buffer[i] = line_buffer[kRestorationHorizontalBorder];
+        line_buffer[stride_[plane] - i - 1] =
+            line_buffer[stride_[plane] - 1 - kRestorationHorizontalBorder];
+      }
+      // The first three rows are the same as the line_buffer.
+      for (int i = 0; i < kRestorationVerticalBorder + 1; ++i) {
+        std::copy(line_buffer.begin(), line_buffer.end(),
+                  reference_buffer_[plane].begin() + i * stride_[plane]);
+      }
+      for (int i = kRestorationVerticalBorder + 1;
+           i < height_[plane] - kRestorationVerticalBorder; ++i) {
+        for (int j = kRestorationHorizontalBorder;
+             j < stride_[plane] - kRestorationHorizontalBorder; ++j) {
+          line_buffer[j] = rnd.Rand16() & mask;
+        }
+        for (int j = 0; j < kRestorationHorizontalBorder; ++j) {
+          line_buffer[j] = line_buffer[kRestorationHorizontalBorder];
+          line_buffer[stride_[plane] - j - 1] =
+              line_buffer[stride_[plane] - 1 - kRestorationHorizontalBorder];
+        }
+        std::copy(line_buffer.begin(), line_buffer.end(),
+                  reference_buffer_[plane].begin() + i * stride_[plane]);
+      }
+      // The extended bottom border rows are the same as the line_buffer.
+      for (int i = 0; i < kRestorationVerticalBorder; ++i) {
+        std::copy(line_buffer.begin(), line_buffer.end(),
+                  reference_buffer_[plane].begin() +
+                      (height_[plane] - kRestorationVerticalBorder + i) *
+                          stride_[plane]);
+      }
+
+      // Fill frame buffer. Note that the border is not filled.
+      for (int i = 0; i < buffer_.height(plane); ++i) {
+        memcpy(buffer_.data(plane) + i * buffer_.stride(plane),
+               reference_buffer_[plane].data() + kRestorationHorizontalBorder +
+                   (i + kRestorationVerticalBorder) * stride_[plane],
+               sizeof(Pixel) * width_[plane]);
+      }
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void PostFilterHelperFuncTest<bitdepth, Pixel>::TestExtendFrame(
+    bool use_fixed_values, Pixel value) {
+  ObuFrameHeader frame_header = {};
+  frame_header.upscaled_width = frame_size_.upscaled_width;
+  frame_header.width = frame_size_.width;
+  frame_header.height = frame_size_.height;
+  ObuSequenceHeader sequence_header;
+  sequence_header.color_config.bitdepth = bitdepth;
+  sequence_header.color_config.is_monochrome = false;
+  sequence_header.color_config.subsampling_x = frame_size_.subsampling_x;
+  sequence_header.color_config.subsampling_y = frame_size_.subsampling_y;
+
+  const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+  ASSERT_NE(dsp, nullptr);
+  FrameScratchBuffer frame_scratch_buffer;
+
+  PostFilter post_filter(frame_header, sequence_header, &frame_scratch_buffer,
+                         &buffer_, dsp,
+                         /*do_post_filter_mask=*/0x00);
+  FillBuffer(use_fixed_values, value);
+  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+    const int plane_width =
+        plane == kPlaneY ? frame_header.upscaled_width
+                         : frame_header.upscaled_width >>
+                               sequence_header.color_config.subsampling_x;
+    const int plane_height =
+        plane == kPlaneY
+            ? frame_header.height
+            : frame_header.height >> sequence_header.color_config.subsampling_y;
+    PostFilter::ExtendFrame<Pixel>(
+        reinterpret_cast<Pixel*>(buffer_.data(plane)), plane_width,
+        plane_height, buffer_.stride(plane) / sizeof(Pixel),
+        kRestorationHorizontalBorder, kRestorationHorizontalBorder,
+        kRestorationVerticalBorder, kRestorationVerticalBorder);
+    const bool success = test_utils::CompareBlocks<Pixel>(
+        buffer_border_corner_[plane], reference_buffer_[plane].data(),
+        stride_[plane], height_[plane], buffer_.stride(plane) / sizeof(Pixel),
+        stride_[plane], /*check_padding=*/false, /*print_diff=*/false);
+    ASSERT_TRUE(success) << "Failure of extend frame at plane: " << plane;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+class PostFilterSuperResTest : public PostFilterTestBase<bitdepth, Pixel> {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  PostFilterSuperResTest() {
+    test_utils::ResetDspTable(bitdepth);
+    dsp::SuperResInit_C();
+    dsp::SuperResInit_SSE4_1();
+    dsp::SuperResInit_NEON();
+  }
+  PostFilterSuperResTest(const PostFilterSuperResTest&) = delete;
+  PostFilterSuperResTest& operator=(const PostFilterSuperResTest&) = delete;
+  ~PostFilterSuperResTest() override = default;
+
+ protected:
+  using PostFilterTestBase<bitdepth, Pixel>::buffer_;
+  using PostFilterTestBase<bitdepth, Pixel>::width_;
+  using PostFilterTestBase<bitdepth, Pixel>::upscaled_width_;
+  using PostFilterTestBase<bitdepth, Pixel>::stride_;
+  using PostFilterTestBase<bitdepth, Pixel>::height_;
+  using PostFilterTestBase<bitdepth, Pixel>::reference_buffer_;
+  using PostFilterTestBase<bitdepth, Pixel>::frame_size_;
+
+  void TestApplySuperRes(bool use_fixed_values, Pixel value, int id,
+                         bool multi_threaded);
+};
+
+// This class must be in namespace libgav1 to access private member
+// functions of class PostFilter in src/post_filter.h.
+template <int bitdepth, typename Pixel>
+void PostFilterSuperResTest<bitdepth, Pixel>::TestApplySuperRes(
+    bool use_fixed_values, Pixel value, int id, bool multi_threaded) {
+  ObuFrameHeader frame_header = {};
+  frame_header.width = frame_size_.width;
+  frame_header.upscaled_width = frame_size_.upscaled_width;
+  frame_header.height = frame_size_.height;
+  frame_header.rows4x4 = DivideBy4(frame_size_.height);
+  frame_header.columns4x4 = DivideBy4(frame_size_.width);
+  frame_header.tile_info.tile_count = 1;
+  ObuSequenceHeader sequence_header;
+  sequence_header.color_config.bitdepth = bitdepth;
+  sequence_header.color_config.is_monochrome = false;
+  sequence_header.color_config.subsampling_x = frame_size_.subsampling_x;
+  sequence_header.color_config.subsampling_y = frame_size_.subsampling_y;
+
+  // Apply SuperRes.
+  Array2D<int16_t> cdef_index;
+  Array2D<TransformSize> inter_transform_sizes;
+  const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+  ASSERT_NE(dsp, nullptr);
+  constexpr int kNumThreads = 4;
+  FrameScratchBuffer frame_scratch_buffer;
+  if (multi_threaded) {
+    ASSERT_TRUE(frame_scratch_buffer.threading_strategy.Reset(frame_header,
+                                                              kNumThreads));
+  }
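+  // kSuperResFilterTaps coefficients are stored per upscaled output column;
+  // widths are aligned to 16 pixels, presumably so that vectorized
+  // implementations can load whole registers without overrunning the
+  // buffers.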
+  const int pixel_size = sequence_header.color_config.bitdepth == 8
+                             ? sizeof(uint8_t)
+                             : sizeof(uint16_t);
+  ASSERT_TRUE(frame_scratch_buffer.superres_coefficients[kPlaneTypeY].Resize(
+      kSuperResFilterTaps * Align(frame_header.upscaled_width, 16) *
+      pixel_size));
+  if (!sequence_header.color_config.is_monochrome &&
+      sequence_header.color_config.subsampling_x != 0) {
+    ASSERT_TRUE(frame_scratch_buffer.superres_coefficients[kPlaneTypeUV].Resize(
+        kSuperResFilterTaps *
+        Align(SubsampledValue(frame_header.upscaled_width, 1), 16) *
+        pixel_size));
+  }
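+  // One scratch line per worker thread (a single line when running
+  // single-threaded), sized for the downscaled width plus the horizontal
+  // SuperRes borders and padding.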
+  ASSERT_TRUE(frame_scratch_buffer.superres_line_buffer.Realloc(
+      sequence_header.color_config.bitdepth,
+      sequence_header.color_config.is_monochrome,
+      MultiplyBy4(frame_header.columns4x4), (multi_threaded ? kNumThreads : 1),
+      sequence_header.color_config.subsampling_x,
+      /*subsampling_y=*/0, 2 * kSuperResHorizontalBorder,
+      2 * (kSuperResHorizontalBorder + kSuperResHorizontalPadding), 0, 0,
+      nullptr, nullptr, nullptr));
+  PostFilter post_filter(frame_header, sequence_header, &frame_scratch_buffer,
+                         &buffer_, dsp,
+                         /*do_post_filter_mask=*/0x04);
+
+  const int num_planes = sequence_header.color_config.is_monochrome
+                             ? kMaxPlanesMonochrome
+                             : kMaxPlanes;
+  int width[kMaxPlanes];
+  int upscaled_width[kMaxPlanes];
+  int height[kMaxPlanes];
+
+  for (int plane = kPlaneY; plane < num_planes; ++plane) {
+    const int8_t subsampling_x =
+        (plane == kPlaneY) ? 0 : frame_size_.subsampling_x;
+    const int8_t subsampling_y =
+        (plane == kPlaneY) ? 0 : frame_size_.subsampling_y;
+    width[plane] = frame_size_.width >> subsampling_x;
+    upscaled_width[plane] = frame_size_.upscaled_width >> subsampling_x;
+    height[plane] = frame_size_.height >> subsampling_y;
+    if (use_fixed_values) {
+      auto* src = reinterpret_cast<Pixel*>(post_filter.cdef_buffer_[plane]);
+      for (int y = 0; y < height[plane]; ++y) {
+        for (int x = 0; x < width[plane]; ++x) {
+          src[x] = value;
+        }
+        src += buffer_.stride(plane) / sizeof(Pixel);
+      }
+    } else {  // Random input.
+      const int mask = (1 << bitdepth) - 1;
+      libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+      auto* src = reinterpret_cast<Pixel*>(post_filter.cdef_buffer_[plane]);
+      for (int y = 0; y < height[plane]; ++y) {
+        for (int x = 0; x < width[plane]; ++x) {
+          src[x] = rnd.Rand16() & mask;
+        }
+        src += buffer_.stride(plane) / sizeof(Pixel);
+      }
+    }
+  }
+
+  if (multi_threaded) {
+    post_filter.ApplySuperResThreaded();
+  } else {
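+    // Single-threaded path: feed the per-plane CDEF output buffers directly
+    // into ApplySuperRes(), writing to the SuperRes output positions; the
+    // per-plane row counts account for chroma subsampling.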
+    std::array<uint8_t*, kMaxPlanes> buffers = {
+        post_filter.cdef_buffer_[kPlaneY], post_filter.cdef_buffer_[kPlaneU],
+        post_filter.cdef_buffer_[kPlaneV]};
+    std::array<uint8_t*, kMaxPlanes> dst = {
+        post_filter.GetSuperResBuffer(static_cast<Plane>(kPlaneY), 0, 0),
+        post_filter.GetSuperResBuffer(static_cast<Plane>(kPlaneU), 0, 0),
+        post_filter.GetSuperResBuffer(static_cast<Plane>(kPlaneV), 0, 0)};
+    std::array<int, kMaxPlanes> rows = {
+        frame_header.rows4x4 * 4,
+        (frame_header.rows4x4 * 4) >> frame_size_.subsampling_y,
+        (frame_header.rows4x4 * 4) >> frame_size_.subsampling_y};
+    post_filter.ApplySuperRes(buffers, rows, /*line_buffer_row=*/-1, dst);
+  }
+
+  // Check md5.
+  std::vector<Pixel> output;
+  for (int plane = kPlaneY; plane < num_planes; ++plane) {
+    output.resize(upscaled_width[plane] * height[plane]);
+    auto* dst = reinterpret_cast<Pixel*>(
+        post_filter.GetSuperResBuffer(static_cast<Plane>(plane), 0, 0));
+    for (int y = 0; y < height[plane]; ++y) {
+      for (int x = 0; x < upscaled_width[plane]; ++x) {
+        output[y * upscaled_width[plane] + x] = dst[x];
+      }
+      dst += buffer_.stride(plane) / sizeof(Pixel);
+    }
+    const std::string digest = test_utils::GetMd5Sum(
+        output.data(), upscaled_width[plane] * height[plane] * sizeof(Pixel));
+    printf("MD5: %s\n", digest.c_str());
+    const char* expected_digest = nullptr;
+    switch (bitdepth) {
+      case 8:
+        expected_digest = GetSuperResDigest8bpp(id, plane);
+        break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      case 10:
+        expected_digest = GetSuperResDigest10bpp(id, plane);
+        break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+      case 12:
+        expected_digest = GetSuperResDigest12bpp(id, plane);
+        break;
+#endif
+    }
+    ASSERT_NE(expected_digest, nullptr);
+    EXPECT_STREQ(digest.c_str(), expected_digest);
+  }
+}
+
+using PostFilterSuperResTest8bpp = PostFilterSuperResTest<8, uint8_t>;
+
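+// FrameSizeParam is (width, upscaled_width, height, subsampling_x,
+// subsampling_y); 176 -> 352 exercises the maximum 2x SuperRes upscale.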
+const FrameSizeParam kTestParamSuperRes[] = {
+    FrameSizeParam(176, 352, 288, 1, 1)};
+
+TEST_P(PostFilterSuperResTest8bpp, ApplySuperRes) {
+  TestApplySuperRes(true, 0, 0, false);
+  TestApplySuperRes(true, 1, 1, false);
+  TestApplySuperRes(true, 128, 2, false);
+  TestApplySuperRes(true, 255, 3, false);
+  TestApplySuperRes(false, 0, 4, false);
+}
+
+TEST_P(PostFilterSuperResTest8bpp, ApplySuperResThreaded) {
+  TestApplySuperRes(true, 0, 0, true);
+  TestApplySuperRes(true, 1, 1, true);
+  TestApplySuperRes(true, 128, 2, true);
+  TestApplySuperRes(true, 255, 3, true);
+  TestApplySuperRes(false, 0, 4, true);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterSuperResTestInstance,
+                         PostFilterSuperResTest8bpp,
+                         testing::ValuesIn(kTestParamSuperRes));
+
+using PostFilterHelperFuncTest8bpp = PostFilterHelperFuncTest<8, uint8_t>;
+
+const FrameSizeParam kTestParamExtendFrame[] = {
+    FrameSizeParam(16, 16, 16, 1, 1),
+    FrameSizeParam(64, 64, 64, 1, 1),
+    FrameSizeParam(128, 128, 64, 1, 1),
+    FrameSizeParam(64, 64, 128, 1, 1),
+    FrameSizeParam(352, 352, 288, 1, 1),
+    FrameSizeParam(720, 720, 480, 1, 1),
+    FrameSizeParam(1080, 1080, 720, 1, 1),
+    FrameSizeParam(16, 16, 16, 0, 0),
+    FrameSizeParam(64, 64, 64, 0, 0),
+    FrameSizeParam(128, 128, 64, 0, 0),
+    FrameSizeParam(64, 64, 128, 0, 0),
+    FrameSizeParam(352, 352, 288, 0, 0),
+    FrameSizeParam(720, 720, 480, 0, 0),
+    FrameSizeParam(1080, 1080, 720, 0, 0)};
+
+TEST_P(PostFilterHelperFuncTest8bpp, ExtendFrame) {
+  TestExtendFrame(true, 0);
+  TestExtendFrame(true, 1);
+  TestExtendFrame(true, 128);
+  TestExtendFrame(true, 255);
+  TestExtendFrame(false, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance,
+                         PostFilterHelperFuncTest8bpp,
+                         testing::ValuesIn(kTestParamExtendFrame));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using PostFilterSuperResTest10bpp = PostFilterSuperResTest<10, uint16_t>;
+
+TEST_P(PostFilterSuperResTest10bpp, ApplySuperRes) {
+  TestApplySuperRes(true, 0, 0, false);
+  TestApplySuperRes(true, 1, 1, false);
+  TestApplySuperRes(true, 1 << 9, 2, false);
+  TestApplySuperRes(true, (1 << 10) - 1, 3, false);
+  TestApplySuperRes(false, 0, 4, false);
+}
+
+TEST_P(PostFilterSuperResTest10bpp, ApplySuperResThreaded) {
+  TestApplySuperRes(true, 0, 0, true);
+  TestApplySuperRes(true, 1, 1, true);
+  TestApplySuperRes(true, 1 << 9, 2, true);
+  TestApplySuperRes(true, (1 << 10) - 1, 3, true);
+  TestApplySuperRes(false, 0, 4, true);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterSuperResTestInstance,
+                         PostFilterSuperResTest10bpp,
+                         testing::ValuesIn(kTestParamSuperRes));
+
+using PostFilterHelperFuncTest10bpp = PostFilterHelperFuncTest<10, uint16_t>;
+
+TEST_P(PostFilterHelperFuncTest10bpp, ExtendFrame) {
+  TestExtendFrame(true, 0);
+  TestExtendFrame(true, 1);
+  TestExtendFrame(true, 255);
+  TestExtendFrame(true, (1 << 10) - 1);
+  TestExtendFrame(false, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance,
+                         PostFilterHelperFuncTest10bpp,
+                         testing::ValuesIn(kTestParamExtendFrame));
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using PostFilterSuperResTest12bpp = PostFilterSuperResTest<12, uint16_t>;
+
+TEST_P(PostFilterSuperResTest12bpp, ApplySuperRes) {
+  TestApplySuperRes(true, 0, 0, false);
+  TestApplySuperRes(true, 1, 1, false);
+  TestApplySuperRes(true, 1 << 11, 2, false);
+  TestApplySuperRes(true, (1 << 12) - 1, 3, false);
+  TestApplySuperRes(false, 0, 4, false);
+}
+
+TEST_P(PostFilterSuperResTest12bpp, ApplySuperResThreaded) {
+  TestApplySuperRes(true, 0, 0, true);
+  TestApplySuperRes(true, 1, 1, true);
+  TestApplySuperRes(true, 1 << 11, 2, true);
+  TestApplySuperRes(true, (1 << 12) - 1, 3, true);
+  TestApplySuperRes(false, 0, 4, true);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterSuperResTestInstance,
+                         PostFilterSuperResTest12bpp,
+                         testing::ValuesIn(kTestParamSuperRes));
+
+using PostFilterHelperFuncTest12bpp = PostFilterHelperFuncTest<12, uint16_t>;
+
+TEST_P(PostFilterHelperFuncTest12bpp, ExtendFrame) {
+  TestExtendFrame(true, 0);
+  TestExtendFrame(true, 1);
+  TestExtendFrame(true, 255);
+  TestExtendFrame(true, (1 << 12) - 1);
+  TestExtendFrame(false, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance,
+                         PostFilterHelperFuncTest12bpp,
+                         testing::ValuesIn(kTestParamExtendFrame));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+namespace {
+
+const char* GetDigestApplyCdef8bpp(int id) {
+  static const char* const kDigest[] = {
+      "9593af24f9c6faecce53437f6e128edf", "ecb633cc2ecd6e7e0cf39d4439f4a6ea",
+      "9ec4cb4124f0a686a7bda72b447f5b8e", "7ebd859a23162bc864a69dbea60bc687",
+      "de7a15fc00664692a794aa68cf695980", "cf3fc8fe041f68d31ab4e34ad3643541",
+      "94c116b191b0268cf7ab4a0e6996e1ec", "1ad60c943a5a914aba7bc26706620a05",
+      "ce33c6f80e3608c4d18c49be2e393c20", "e140586ffc663798b74b8f6fb5b44736",
+      "b7379bba8bcb97f09a74655f4e0eee91", "02ce174061c98babd3987461b3984e47",
+      "64655dd1dfba8317e27d2fdcb211b7b4", "eeb6a61c70c5ee75a4c31dc5099b4dfb",
+      "ee944b31148fa2e30938084f7c046464", "db7b63497750fa4c51cf45c56a2da01c",
+  };
+  return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigestApplyCdef10bpp(int id) {
+  static const char* const kDigest[] = {
+      "53f8d68ac7f3aea65151b2066f8501c9", "021e70d5406fa182dd9713380eb66d1d",
+      "bab1c84e7f06b87d81617d2d0a194b89", "58e302ff0522f64901909fb97535b270",
+      "5ff95a6a798eadc7207793c03d898ce4", "1483d28cc0f1bfffedd1128966719aa0",
+      "6af5a36890b465ae962c2878af874f70", "bd1ed4a2ff09d323ab98190d1805a010",
+      "5ff95a6a798eadc7207793c03d898ce4", "1483d28cc0f1bfffedd1128966719aa0",
+      "6af5a36890b465ae962c2878af874f70", "bd1ed4a2ff09d323ab98190d1805a010",
+      "6f0299645cd6f0655fd26044cd43a37c", "56d7febf5bbebdc82e8f157ab926a0bb",
+      "f54654f11006453f496be5883216a3bb", "9abc6e3230792ba78bcc65504a62075e",
+  };
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigestApplyCdef12bpp(int id) {
+  static const char* const kDigest[] = {
+      "06e2d09b6ce3924f3b5d4c00ab76eea5", "287240e4b13cb75e17932a3dd7ba3b3c",
+      "265da123e3347c4fb3e434f26a3949e7", "e032ce6eb76242df6894482ac6688406",
+      "f648328221f0f02a5b7fc3d55a66271a", "8f759aa84a110902025dacf8062d2f6a",
+      "592b49e4b993d6b4634d8eb1ee3bba54", "29a3e8e329ec70d06910e982ea763e6b",
+      "f648328221f0f02a5b7fc3d55a66271a", "8f759aa84a110902025dacf8062d2f6a",
+      "592b49e4b993d6b4634d8eb1ee3bba54", "29a3e8e329ec70d06910e982ea763e6b",
+      "155dd4283f8037f86cce34b6cfe67a7e", "0a022c70ead199517af9bad2002d70cd",
+      "a966dfea52a7a2084545f68b2c9e1735", "e098438a23a7c9f276e594b98b2db922",
+  };
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace
+
+template <int bitdepth, typename Pixel>
+class PostFilterApplyCdefTest : public testing::TestWithParam<FrameSizeParam>,
+                                public test_utils::MaxAlignedAllocable {
+ public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+  PostFilterApplyCdefTest() = default;
+  PostFilterApplyCdefTest(const PostFilterApplyCdefTest&) = delete;
+  PostFilterApplyCdefTest& operator=(const PostFilterApplyCdefTest&) = delete;
+  ~PostFilterApplyCdefTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    dsp::CdefInit_C();
+    dsp::CdefInit_SSE4_1();
+    dsp::CdefInit_NEON();
+
+    dsp_ = dsp::GetDspTable(bitdepth);
+    ASSERT_NE(dsp_, nullptr);
+  }
+
+  // Sets sequence_header_ and frame_header_, and fills the cdef_index and
+  // cdef_skip buffers in frame_scratch_buffer_. Allocates yuv_buffer_ but
+  // does not set its contents.
+  void SetInput(libvpx_test::ACMRandom* rnd);
+  // Sets yuv_buffer_.
+  void SetInputBuffer(libvpx_test::ACMRandom* rnd, PostFilter* post_filter);
+  void CopyFilterOutputToDestBuffer();
+  void TestMultiThread(int num_threads);
+
+  ObuSequenceHeader sequence_header_;
+  ObuFrameHeader frame_header_ = {};
+  FrameScratchBuffer frame_scratch_buffer_;
+  YuvBuffer yuv_buffer_;
+  const dsp::Dsp* dsp_;
+  FrameSizeParam param_ = GetParam();
+  Pixel dest_[kMaxTestFrameSize * kMaxPlanes];
+  const size_t y_size_ = param_.width * param_.height;
+  const size_t uv_size_ = y_size_ >>
+                          (param_.subsampling_x + param_.subsampling_y);
+  const size_t size_ = y_size_ + uv_size_ * 2;
+};
+
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::SetInput(
+    libvpx_test::ACMRandom* rnd) {
+  sequence_header_.color_config.bitdepth = bitdepth;
+  sequence_header_.color_config.subsampling_x = param_.subsampling_x;
+  sequence_header_.color_config.subsampling_y = param_.subsampling_y;
+  sequence_header_.color_config.is_monochrome = false;
+  sequence_header_.use_128x128_superblock =
+      static_cast<bool>(rnd->Rand16() & 1);
+
+  ASSERT_TRUE(param_.width <= param_.upscaled_width);
+  ASSERT_TRUE(param_.upscaled_width * param_.height <= kMaxTestFrameSize)
+      << "Please adjust the max frame size.";
+
+  frame_header_.width = param_.width;
+  frame_header_.upscaled_width = param_.upscaled_width;
+  frame_header_.height = param_.height;
+  frame_header_.columns4x4 = DivideBy4(Align(frame_header_.width, 8));
+  frame_header_.rows4x4 = DivideBy4(Align(frame_header_.height, 8));
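+  // This matches the spec's MiCols/MiRows computation: dimensions are
+  // rounded up to a multiple of 8 luma pixels before conversion to 4x4
+  // block units.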
+  frame_header_.tile_info.tile_count = 1;
+  frame_header_.refresh_frame_flags = 0;
+  Cdef* const cdef = &frame_header_.cdef;
+  const int coeff_shift = bitdepth - 8;
+  do {
+    cdef->damping = (rnd->Rand16() & 3) + 3 + coeff_shift;
+    cdef->bits = rnd->Rand16() & 3;
+  } while (cdef->bits <= 0);
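+  // Coded secondary strengths take values in {0, 1, 2, 4}: per the AV1
+  // spec, a coded value of 3 stands for 4, which the increments below
+  // reproduce.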
+  for (int i = 0; i < (1 << cdef->bits); ++i) {
+    cdef->y_primary_strength[i] = (rnd->Rand16() & 15) << coeff_shift;
+    cdef->y_secondary_strength[i] = rnd->Rand16() & 3;
+    if (cdef->y_secondary_strength[i] == 3) {
+      ++cdef->y_secondary_strength[i];
+    }
+    cdef->y_secondary_strength[i] <<= coeff_shift;
+    cdef->uv_primary_strength[i] = (rnd->Rand16() & 15) << coeff_shift;
+    cdef->uv_secondary_strength[i] = rnd->Rand16() & 3;
+    if (cdef->uv_secondary_strength[i] == 3) {
+      ++cdef->uv_secondary_strength[i];
+    }
+    cdef->uv_secondary_strength[i] <<= coeff_shift;
+  }
+
+  const int rows64x64 = DivideBy16(frame_header_.rows4x4 + kMaxBlockHeight4x4);
+  const int columns64x64 =
+      DivideBy16(frame_header_.columns4x4 + kMaxBlockWidth4x4);
+  ASSERT_TRUE(frame_scratch_buffer_.cdef_index.Reset(rows64x64, columns64x64));
+  for (int row = 0; row < rows64x64; ++row) {
+    for (int column = 0; column < columns64x64; ++column) {
+      frame_scratch_buffer_.cdef_index[row][column] =
+          rnd->Rand16() & ((1 << cdef->bits) - 1);
+    }
+  }
+
+  const int skip_rows = DivideBy2(frame_header_.rows4x4 + kMaxBlockHeight4x4);
+  const int skip_columns =
+      DivideBy16(frame_header_.columns4x4 + kMaxBlockWidth4x4);
+  ASSERT_TRUE(frame_scratch_buffer_.cdef_skip.Reset(skip_rows, skip_columns));
+  for (int row = 0; row < skip_rows; ++row) {
+    memset(frame_scratch_buffer_.cdef_skip[row], 0xFF, skip_columns);
+  }
+
+  ASSERT_TRUE(yuv_buffer_.Realloc(
+      sequence_header_.color_config.bitdepth,
+      sequence_header_.color_config.is_monochrome, frame_header_.upscaled_width,
+      frame_header_.height, sequence_header_.color_config.subsampling_x,
+      sequence_header_.color_config.subsampling_y, kBorderPixels, kBorderPixels,
+      kBorderPixels, kBorderPixels, nullptr, nullptr, nullptr))
+      << "Failed to allocate source buffer.";
+}
+
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::SetInputBuffer(
+    libvpx_test::ACMRandom* rnd, PostFilter* post_filter) {
+  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+    const int subsampling_x = (plane == 0) ? 0 : param_.subsampling_x;
+    const int subsampling_y = (plane == 0) ? 0 : param_.subsampling_y;
+    const int plane_width =
+        MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
+    const int plane_height =
+        MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
+    auto* src =
+        reinterpret_cast<Pixel*>(post_filter->GetUnfilteredBuffer(plane));
+    const int src_stride = yuv_buffer_.stride(plane) / sizeof(src[0]);
+    for (int y = 0; y < plane_height; ++y) {
+      for (int x = 0; x < plane_width; ++x) {
+        src[x] = rnd->Rand16() & ((1 << bitdepth) - 1);
+      }
+      src += src_stride;
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::CopyFilterOutputToDestBuffer() {
+  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+    const int subsampling_x = (plane == 0) ? 0 : param_.subsampling_x;
+    const int subsampling_y = (plane == 0) ? 0 : param_.subsampling_y;
+    const int plane_width = SubsampledValue(param_.width, subsampling_x);
+    const int plane_height = SubsampledValue(param_.height, subsampling_y);
+    auto* src = reinterpret_cast<Pixel*>(yuv_buffer_.data(plane));
+    const int src_stride = yuv_buffer_.stride(plane) / sizeof(src[0]);
+    Pixel* dest_plane =
+        dest_ +
+        ((plane == 0) ? 0 : ((plane == 1) ? y_size_ : y_size_ + uv_size_));
+    for (int y = 0; y < plane_height; ++y) {
+      for (int x = 0; x < plane_width; ++x) {
+        dest_plane[y * plane_width + x] = src[x];
+      }
+      src += src_stride;
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::TestMultiThread(
+    int num_threads) {
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  SetInput(&rnd);
+
+  ASSERT_TRUE(frame_scratch_buffer_.threading_strategy.Reset(frame_header_,
+                                                             num_threads));
+  if (num_threads > 1) {
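+    // The threaded CDEF path works on 64-pixel row units; cdef_border keeps
+    // copies of the unfiltered rows at unit boundaries so that workers can
+    // filter adjacent units independently.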
+    const int num_units =
+        MultiplyBy4(RightShiftWithCeiling(frame_header_.rows4x4, 4));
+    ASSERT_TRUE(frame_scratch_buffer_.cdef_border.Realloc(
+        bitdepth, /*is_monochrome=*/false,
+        MultiplyBy4(frame_header_.columns4x4), num_units,
+        sequence_header_.color_config.subsampling_x,
+        /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+        kBorderPixels, nullptr, nullptr, nullptr));
+  }
+
+  PostFilter post_filter(frame_header_, sequence_header_,
+                         &frame_scratch_buffer_, &yuv_buffer_, dsp_,
+                         /*do_post_filter_mask=*/0x02);
+  SetInputBuffer(&rnd, &post_filter);
+
+  const int id = GetIdFromInputParam(param_.subsampling_x, param_.subsampling_y,
+                                     param_.height);
+  absl::Duration elapsed_time;
+  const absl::Time start = absl::Now();
+
+  // Only ApplyCdef() and the frame copy inside ApplyFilteringThreaded() are
+  // triggered, since do_post_filter_mask is set to 0x02.
+  post_filter.ApplyFilteringThreaded();
+  elapsed_time += absl::Now() - start;
+
+  CopyFilterOutputToDestBuffer();
+  const char* expected_digest = nullptr;
+  switch (bitdepth) {
+    case 8:
+      expected_digest = GetDigestApplyCdef8bpp(id);
+      break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    case 10:
+      expected_digest = GetDigestApplyCdef10bpp(id);
+      break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+    case 12:
+      expected_digest = GetDigestApplyCdef12bpp(id);
+      break;
+#endif
+  }
+  ASSERT_NE(expected_digest, nullptr);
+  test_utils::CheckMd5Digest(kCdef, kApplyCdefName, expected_digest, dest_,
+                             size_, elapsed_time);
+}
+
+const FrameSizeParam kTestParamApplyCdef[] = {
+    FrameSizeParam(352, 352, 288, 0, 0),    FrameSizeParam(720, 720, 480, 0, 0),
+    FrameSizeParam(1920, 1920, 1080, 0, 0), FrameSizeParam(251, 251, 187, 0, 0),
+    FrameSizeParam(352, 352, 288, 0, 1),    FrameSizeParam(720, 720, 480, 0, 1),
+    FrameSizeParam(1920, 1920, 1080, 0, 1), FrameSizeParam(251, 251, 187, 0, 1),
+    FrameSizeParam(352, 352, 288, 1, 0),    FrameSizeParam(720, 720, 480, 1, 0),
+    FrameSizeParam(1920, 1920, 1080, 1, 0), FrameSizeParam(251, 251, 187, 1, 0),
+    FrameSizeParam(352, 352, 288, 1, 1),    FrameSizeParam(720, 720, 480, 1, 1),
+    FrameSizeParam(1920, 1920, 1080, 1, 1), FrameSizeParam(251, 251, 187, 1, 1),
+};
+
+using PostFilterApplyCdefTest8bpp = PostFilterApplyCdefTest<8, uint8_t>;
+
+TEST_P(PostFilterApplyCdefTest8bpp, ApplyCdef) {
+  TestMultiThread(2);
+  TestMultiThread(4);
+  TestMultiThread(8);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance,
+                         PostFilterApplyCdefTest8bpp,
+                         testing::ValuesIn(kTestParamApplyCdef));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using PostFilterApplyCdefTest10bpp = PostFilterApplyCdefTest<10, uint16_t>;
+
+TEST_P(PostFilterApplyCdefTest10bpp, ApplyCdef) {
+  TestMultiThread(2);
+  TestMultiThread(4);
+  TestMultiThread(8);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance,
+                         PostFilterApplyCdefTest10bpp,
+                         testing::ValuesIn(kTestParamApplyCdef));
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using PostFilterApplyCdefTest12bpp = PostFilterApplyCdefTest<12, uint16_t>;
+
+TEST_P(PostFilterApplyCdefTest12bpp, ApplyCdef) {
+  TestMultiThread(2);
+  TestMultiThread(4);
+  TestMultiThread(8);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance,
+                         PostFilterApplyCdefTest12bpp,
+                         testing::ValuesIn(kTestParamApplyCdef));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
+}  // namespace libgav1
diff --git a/src/prediction_mask.cc b/src/prediction_mask.cc
new file mode 100644 (file)
index 0000000..ab4d849
--- /dev/null
@@ -0,0 +1,236 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/prediction_mask.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWedgeDirectionTypes = 16;
+
+enum kWedgeDirection : uint8_t {
+  kWedgeHorizontal,
+  kWedgeVertical,
+  kWedgeOblique27,
+  kWedgeOblique63,
+  kWedgeOblique117,
+  kWedgeOblique153,
+};
+
+constexpr uint8_t kWedgeCodebook[3][16][3] = {{{kWedgeOblique27, 4, 4},
+                                               {kWedgeOblique63, 4, 4},
+                                               {kWedgeOblique117, 4, 4},
+                                               {kWedgeOblique153, 4, 4},
+                                               {kWedgeHorizontal, 4, 2},
+                                               {kWedgeHorizontal, 4, 4},
+                                               {kWedgeHorizontal, 4, 6},
+                                               {kWedgeVertical, 4, 4},
+                                               {kWedgeOblique27, 4, 2},
+                                               {kWedgeOblique27, 4, 6},
+                                               {kWedgeOblique153, 4, 2},
+                                               {kWedgeOblique153, 4, 6},
+                                               {kWedgeOblique63, 2, 4},
+                                               {kWedgeOblique63, 6, 4},
+                                               {kWedgeOblique117, 2, 4},
+                                               {kWedgeOblique117, 6, 4}},
+                                              {{kWedgeOblique27, 4, 4},
+                                               {kWedgeOblique63, 4, 4},
+                                               {kWedgeOblique117, 4, 4},
+                                               {kWedgeOblique153, 4, 4},
+                                               {kWedgeVertical, 2, 4},
+                                               {kWedgeVertical, 4, 4},
+                                               {kWedgeVertical, 6, 4},
+                                               {kWedgeHorizontal, 4, 4},
+                                               {kWedgeOblique27, 4, 2},
+                                               {kWedgeOblique27, 4, 6},
+                                               {kWedgeOblique153, 4, 2},
+                                               {kWedgeOblique153, 4, 6},
+                                               {kWedgeOblique63, 2, 4},
+                                               {kWedgeOblique63, 6, 4},
+                                               {kWedgeOblique117, 2, 4},
+                                               {kWedgeOblique117, 6, 4}},
+                                              {{kWedgeOblique27, 4, 4},
+                                               {kWedgeOblique63, 4, 4},
+                                               {kWedgeOblique117, 4, 4},
+                                               {kWedgeOblique153, 4, 4},
+                                               {kWedgeHorizontal, 4, 2},
+                                               {kWedgeHorizontal, 4, 6},
+                                               {kWedgeVertical, 2, 4},
+                                               {kWedgeVertical, 6, 4},
+                                               {kWedgeOblique27, 4, 2},
+                                               {kWedgeOblique27, 4, 6},
+                                               {kWedgeOblique153, 4, 2},
+                                               {kWedgeOblique153, 4, 6},
+                                               {kWedgeOblique63, 2, 4},
+                                               {kWedgeOblique63, 6, 4},
+                                               {kWedgeOblique117, 2, 4},
+                                               {kWedgeOblique117, 6, 4}}};
+
+constexpr BitMaskSet kWedgeFlipSignMasks[9] = {
+    BitMaskSet(0xBBFF),  // kBlock8x8
+    BitMaskSet(0xBBEF),  // kBlock8x16
+    BitMaskSet(0xBAEF),  // kBlock8x32
+    BitMaskSet(0xBBEF),  // kBlock16x8
+    BitMaskSet(0xBBFF),  // kBlock16x16
+    BitMaskSet(0xBBEF),  // kBlock16x32
+    BitMaskSet(0xABEF),  // kBlock32x8
+    BitMaskSet(0xBBEF),  // kBlock32x16
+    BitMaskSet(0xBBFF)   // kBlock32x32
+};
+
+// This table (and the one below) contains a few leading zeros and trailing 64s
+// to avoid some additional memcpys where it is actually used.
+constexpr uint8_t kWedgeMasterObliqueOdd[kWedgeMaskMasterSize * 3 / 2] = {
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  6,  18, 37,
+    53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
+constexpr uint8_t kWedgeMasterObliqueEven[kWedgeMaskMasterSize * 3 / 2] = {
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  4,  11, 27,
+    46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
+constexpr uint8_t kWedgeMasterVertical[kWedgeMaskMasterSize] = {
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  7,  21,
+    43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
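+// Maps a block size onto a row of the wedge codebook: 0 for tall blocks,
+// 1 for wide blocks, 2 for square blocks.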
+int BlockShape(BlockSize block_size) {
+  const int width = kNum4x4BlocksWide[block_size];
+  const int height = kNum4x4BlocksHigh[block_size];
+  if (height > width) return 0;
+  if (height < width) return 1;
+  return 2;
+}
+
+uint8_t GetWedgeDirection(BlockSize block_size, int index) {
+  return kWedgeCodebook[BlockShape(block_size)][index][0];
+}
+
+uint8_t GetWedgeOffsetX(BlockSize block_size, int index) {
+  return kWedgeCodebook[BlockShape(block_size)][index][1];
+}
+
+uint8_t GetWedgeOffsetY(BlockSize block_size, int index) {
+  return kWedgeCodebook[BlockShape(block_size)][index][2];
+}
+
+}  // namespace
+
+bool GenerateWedgeMask(WedgeMaskArray* const wedge_masks) {
+  // Generate master masks.
+  uint8_t master_mask[6][kWedgeMaskMasterSize][kWedgeMaskMasterSize];
+  for (int y = 0; y < kWedgeMaskMasterSize; ++y) {
+    memcpy(master_mask[kWedgeVertical][y], kWedgeMasterVertical,
+           kWedgeMaskMasterSize);
+  }
+
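+  // Build the 63-degree oblique master mask: the sampling window shifts one
+  // pixel for every two output rows, and the even/odd tables provide the
+  // two half-pel phases of the edge.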
+  for (int y = 0, shift = 0; y < kWedgeMaskMasterSize; y += 2, ++shift) {
+    memcpy(master_mask[kWedgeOblique63][y], kWedgeMasterObliqueEven + shift,
+           kWedgeMaskMasterSize);
+    memcpy(master_mask[kWedgeOblique63][y + 1], kWedgeMasterObliqueOdd + shift,
+           kWedgeMaskMasterSize);
+  }
+
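+  // Derive the remaining directions by symmetry: horizontal is the
+  // transpose of vertical, oblique-27 is the transpose of oblique-63, and
+  // oblique-117/oblique-153 are inverted reflections of oblique-63.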
+  for (int y = 0; y < kWedgeMaskMasterSize; ++y) {
+    for (int x = 0; x < kWedgeMaskMasterSize; ++x) {
+      const uint8_t mask_value = master_mask[kWedgeOblique63][y][x];
+      master_mask[kWedgeHorizontal][x][y] = master_mask[kWedgeVertical][y][x];
+      master_mask[kWedgeOblique27][x][y] = mask_value;
+      master_mask[kWedgeOblique117][y][kWedgeMaskMasterSize - 1 - x] =
+          64 - mask_value;
+      master_mask[kWedgeOblique153][(kWedgeMaskMasterSize - 1 - x)][y] =
+          64 - mask_value;
+    }
+  }
+
+  // Generate wedge masks.
+  int block_size_index = 0;
+  for (int size = kBlock8x8; size <= kBlock32x32; ++size) {
+    if (!kIsWedgeCompoundModeAllowed.Contains(size)) continue;
+
+    const int width = kBlockWidthPixels[size];
+    const int height = kBlockHeightPixels[size];
+    assert(width >= 8);
+    assert(width <= 32);
+    assert(height >= 8);
+    assert(height <= 32);
+
+    const auto block_size = static_cast<BlockSize>(size);
+    for (int wedge_index = 0; wedge_index < kWedgeDirectionTypes;
+         ++wedge_index) {
+      const uint8_t direction = GetWedgeDirection(block_size, wedge_index);
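+      // Codebook offsets are expressed in eighths of the block dimension;
+      // the wedge is the width x height window of the 64x64 master mask
+      // centered on the scaled offset point.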
+      const uint8_t offset_x =
+          DivideBy2(kWedgeMaskMasterSize) -
+          ((GetWedgeOffsetX(block_size, wedge_index) * width) >> 3);
+      const uint8_t offset_y =
+          DivideBy2(kWedgeMaskMasterSize) -
+          ((GetWedgeOffsetY(block_size, wedge_index) * height) >> 3);
+
+      // Allocate the 2d array.
+      for (int flip_sign = 0; flip_sign < 2; ++flip_sign) {
+        if (!((*wedge_masks)[block_size_index][flip_sign][wedge_index].Reset(
+                height, width, /*zero_initialize=*/false))) {
+          LIBGAV1_DLOG(ERROR, "Failed to allocate memory for wedge masks.");
+          return false;
+        }
+      }
+
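+      // Each wedge mask is stored together with its complement (64 - mask);
+      // kWedgeFlipSignMasks records which of the two is the flipped variant
+      // for a given wedge index.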
+      const auto flip_sign = static_cast<uint8_t>(
+          kWedgeFlipSignMasks[block_size_index].Contains(wedge_index));
+      uint8_t* wedge_masks_row =
+          (*wedge_masks)[block_size_index][flip_sign][wedge_index][0];
+      uint8_t* wedge_masks_row_flip =
+          (*wedge_masks)[block_size_index][1 - flip_sign][wedge_index][0];
+      uint8_t* master_mask_row = &master_mask[direction][offset_y][offset_x];
+      for (int y = 0; y < height; ++y) {
+        memcpy(wedge_masks_row, master_mask_row, width);
+        for (int x = 0; x < width; ++x) {
+          wedge_masks_row_flip[x] = 64 - wedge_masks_row[x];
+        }
+        wedge_masks_row += width;
+        wedge_masks_row_flip += width;
+        master_mask_row += kWedgeMaskMasterSize;
+      }
+    }
+
+    block_size_index++;
+  }
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/prediction_mask.h b/src/prediction_mask.h
new file mode 100644 (file)
index 0000000..827a0fa
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_PREDICTION_MASK_H_
+#define LIBGAV1_SRC_PREDICTION_MASK_H_
+
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr BitMaskSet kIsWedgeCompoundModeAllowed(kBlock8x8, kBlock8x16,
+                                                 kBlock8x32, kBlock16x8,
+                                                 kBlock16x16, kBlock16x32,
+                                                 kBlock32x8, kBlock32x16,
+                                                 kBlock32x32);
+
+// This function generates wedge masks. It should be called only once per
+// decoder instance. If the video contains only key frames, it does not need
+// to be called. Returns true on success, false on allocation failure.
+// Section 7.11.3.11.
+bool GenerateWedgeMask(WedgeMaskArray* wedge_masks);
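+// A minimal usage sketch (the error status shown is illustrative):
+//   WedgeMaskArray wedge_masks;
+//   if (!GenerateWedgeMask(&wedge_masks)) return kStatusOutOfMemory;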
+
+}  // namespace libgav1
+#endif  // LIBGAV1_SRC_PREDICTION_MASK_H_
diff --git a/src/prediction_mask_test.cc b/src/prediction_mask_test.cc
new file mode 100644 (file)
index 0000000..d2a12c2
--- /dev/null
@@ -0,0 +1,214 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/prediction_mask.h"
+
+#include <array>
+#include <cstdint>
+#include <string>
+
+#include "gtest/gtest.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWedgeDirectionTypes = 16;
+
+enum kWedgeDirection : uint8_t {
+  kWedgeHorizontal,
+  kWedgeVertical,
+  kWedgeOblique27,
+  kWedgeOblique63,
+  kWedgeOblique117,
+  kWedgeOblique153,
+};
+
+const char* const kExpectedWedgeMask[] = {
+    "cea09e4bf4227efef749672283f7369b", "2763ab02b70447b2f9d5ed4796ca33bc",
+    "8d83c4315eadda824893c3e79aa866d9", "a733fd7f143c1c6141983c5f816bb3d8",
+    "9a205bfca776ccde57a8031350f2f467", "d78b964719f52f302f4454df14e45e35",
+    "bdc3972cfeb44d0acebb49b2fcb76072", "c8872571833c165be99ada1c552bfd9b",
+    "26d2541e2f8efe48e2f4a1819b3a6896", "783871179337e78e5ef41a66c0c6937c",
+    "253d21c612d732fceedcf610c4ff099c", "c868d177dc2a2378ef362fa482f601e8",
+    "782d75e143d87cc1aeb5d040c48d3c2d", "718cbecf4db45c7d596eba07bd956601",
+    "3b60b9336c2cf699172eb4a3fef18787", "afe72d4bd206f1cb27e3736c3b0068cf",
+    "7b830a1a94bad23a1df1b8d9668708d0", "d3f421ff2b81686fd421f7c02622aac1",
+    "d9ac14dff8e3c415e85e99c3ce0fbd5b", "da493727a08773a950a0375881d912f2",
+    "2f4251fd1b4636a034e22611ea1223b6", "84f84f01900b8a894b19e353605846b0",
+    "bbf5dae73300b6a6789710ffc4fc59fd", "c711941a0889fbed9b926c1eb39a5616",
+    "2fcf270613df57a57e647f37bf9a19ec", "79ed9c2f828b765edf65027f1f0847f5",
+    "e8d3e821f4e7f2f39659071da8f2cc71", "823bb09e2c28f2a81bf8a2d030e8bab6",
+    "d598fb4f70ea6b705674497994aecbfa", "3737c39f058c57650be7e720dcd87aa1",
+    "eb1d9b1d30485d9870ca9380cbdfad43", "a23d3c24f291080fcd62c0a2a2aea181",
+    "968543d91aeae3b1814a5074b6aa9e8c", "6e2444d71a4f3ddfe643e72f9c3cf6c3",
+    "3bf78413aa04830849a3d9c7bfa41a84", "ece8306f9859bcfb042b0bda8f6750b6",
+    "608b29fcedb7fa054a599945b497c78c", "d69d622016872469dfbde4e589bfd679",
+    "38a2307174c27b634323c59da3339dc6", "5e44f0fad99dbe802ffd69c7dc239d56",
+    "a0eeaf3755a724fdf6469f43cb060d75", "7bcf8035c5057619ea8660c32802d6a1",
+    "6054e1c35fe13b9269ab01d1bc0d8848", "e0ec8f7c66ebabff60f5accd3d707788",
+    "0b9fd6e1053a706af5d0cd59dc7e1992", "709648ffab1992d8522b04ca23de577a",
+    "c576e378ed264d6cb00adfd3b4e428f1", "f6f3ae5348e7141775a8a6bc2be22f80",
+    "9289722adb38fa3b2fb775648f0cc3a8", "b7e02fa00b56aeea8e6098a92eac72e1",
+    "db2f6d66ffca8352271f1e3f0116838a", "5858c567b0719daaa364fb0e6d8aa5dc",
+    "db2d300f875d2465adabf4c1322cea6f", "05c66b54c4d32e5b64a7e77e751f0c51",
+    "f2c2a5a3ce510d21ef2e62eedba85afb", "3959d2191a11e800289e21fd283b2837",
+    "cc86023d079a4c5daadce8ad0cdd176f", "e853f3c6814a653a52926488184aae5e",
+    "8568b9d7215bb8dfb1b7ce66ef38e055", "42814ac5ed652afb4734465cca9e038c",
+    "dba6b7d5e93e6a20dac9a514824ad45c", "be77e0dce733b564e96024ea23c9db43",
+    "2aa7bd75a1d8eb1000f0ef9e19aa0d1d", "226d85741e3f35493e971dd13b689ec7",
+    "9e5a0cf4416f8afeaa3ddbe686b5b7db", "18389c77b362f6b4b727b99426251159",
+    "10c5d899de999bbdf35839be3f2d5ee3", "942ae479a36fb4b4d359bebd78a92f03",
+    "f14e4dd174958e16755cd1f456b083e0", "8a036cbd0aaf1bece25a1140109f688b",
+    "2e48eade95f9fa0b7dae147e66d83e13", "4387d723350a011e26b0e91bbeb3d7c2",
+    "5470f977d859232335945efc8bb49ff1", "6780fd81cf2561300c75c930e715c7a6",
+    "9786aca6b1b9abfc3eae51404bc3cbd5", "da65c1440fa370a0237284bf30e56b0b",
+    "8e0d5d83ab3c477fd11ef143a832f7bf", "97489c7a47aa69fef091e7e6e4049a8f",
+    "28787beac9e69001c2999976742764a3", "67760c48ff5f7bc50cd92727694ba271",
+    "57c2b0b7de5de0f40fb739ed095d82a4", "7b2a663ca7da4b73f1adfc7e0ca1eff1",
+    "980869e1795efb63ca623ce2f0043fb3", "575497eb213b05bab24017cc6ea4e56a",
+    "ca3b31382439f0bdd87b61fa10c7863b", "72c65bf29afb288f4d4ff51816429aa7",
+    "1fe8929387be982993cd2309e3eeae7a", "994246e2585179e00f49537713f33796",
+    "82ae324ba01002370e918724ce452738", "fb3bcb4811b8251f0cc5ec40859617e7",
+    "a2e24b21c1d3661412e00411d719210c", "7adc2b60d7d62df1d07e3e4458a46dc2",
+    "e71c1b2f9ccb1af0868c3869dc296506", "3e33e087c7e6f724528abbc658a1b631",
+    "19b80d80f6b83eedac4bab6226865ae1", "7d9293641c4ed3b21c14964ec785cfb9",
+    "5dd0fb9700f30c25bf7b65367c8f098d", "f96b55ec2d012807c972ef4731acd73d",
+    "5fc70808c3fa5b3c511926b434bfba66", "768c3ce37acfcd4e5ba05152e5710bc9",
+    "1271a52682566ebfc01d5c239177ffd4", "52d4fc11a7507695b2548e0424be50ab",
+    "729e7d421aaaf74daa27b0ce1ca0a305", "92d2ff4a9a679cdf0ff765a2d30bced1",
+    "d160ec6f1bd864eb2ac8fabf5af7fedd", "ad323dbcb4a651e96bd5c81bc185385d",
+    "937c1b7106a2e6aef0adf2c858b4df18", "0f9ad42d1c48970f8462921ac79849ee",
+    "32ed1e1a16ddbf816f81caca7cb56c93", "e91aa6389d8255b7744aaa875ba2ceec",
+    "88f9dedf6d565b2f60b511e389cf366a", "d0428fd42ca311cd3680ff4670d4f047",
+    "b9c7eeb7c9733f0220587643952602cb", "65adf32a5e03d161a411815179078ba3",
+    "4984a4e9a5bdf732c071d5b60029daf4", "b9b65a2a9f04b59766d305221e4cda5a",
+    "7b2d372fe33d6db1fcf75820b7523ed5", "9a07593316707f8e59fe09c7647ade15",
+    "33e75e0d2aa73e3410095c2f98c27a14", "f9ddb33b16431ff9cf6ae96dd4acc792",
+    "2df1a8655b2ef23f642b11b76b20f557", "9faba399ccf555c25a33c336cdd54d94",
+    "c94404e263c2dae2e955ead645348c08", "3d16d4be87cd4467c3f7be17287940c8",
+    "99d0fdae81d61680c7a5b1df38dc98fc", "a23b402d699a00c5c349b17e77f73552",
+    "c6f76c81c4050939a6bd5d30ca00b307", "bc3d035bd6e8f55497bfc6d1f81fc8be",
+    "99b10db073e13b49bd90655f7516383b", "ddfd0e434efe076e2706c5669c788566",
+    "e1d836f814e6eca80ef530f8676e0599", "ed3e4c64e9fd1006e0016e460970a423",
+    "0282542e21fa0dea0bf48ec0a2d25b2d", "7482eb8a7bf1417a61c21d82bc7c95f9",
+    "e98e9bb3d5edf7b943d0bbf1eec9bef6", "ad4d313beecf609ff3a7d30da3e54a1d",
+    "b98f8db9fa62fb73d26415f6fa31b330", "0591b3c34bf4750f20a74eee165a54bd",
+    "3054b56fec6968255f21d40f80f5121c", "59ecf60cbb8408e042816e73446fa79c",
+    "8fa8c996209a1ddb8a00c14ca19953f8", "e20d2462bc43a1a1bfbc5efe7a905666",
+    "b5065e40d5d103e21daabcf4d5fea805", "b65aba0f8e307ef08951f1abdb7c8f62",
+    "5fbec6e57c1c651bd7be69fccb0b39a6", "9dfc362f7212d086418b0def54a7c76c",
+    "6644928e9aaac5e5d64f4a2c437c778a", "1bf63c7539ea32489bec222d5bc5305f",
+    "755ec607a5edf116d188353a96a025c3", "bdc4cc354c4f57c38d3be3dbc9380e2d",
+    "7851752b4ae36793ab6f03cd91e7ba6f", "99b9834ea2f6ea8d9168c5c1ba7fe790",
+    "75a155c83b618b28d48f5f343cdfef62", "38821c97e04d2294766699a6846fefaf",
+    "14be7f588461273862c9d9b83d2f6f0a", "8c38ce521671f0eee7e6f6349ef4f981",
+    "043347de994f2fe68c08e7c06a7f6735", "cda15ea2caccbdd8a7342a6144278578",
+    "244d586e88c9d6a9a59059a82c3b8e57", "3712928dd0dd77f027370f22d61366a0",
+    "e4f1cd4785fc331ad6e3100da4a934f3", "3181459434921b5b15b64cfd2ee734c4",
+    "2d588831e98c7178c5370421a6f2fc60", "135cf6a67fc1b51dbcf9fcddb3ae1237",
+    "d701da4e1a890a37bb0e9af4a2f0b048", "02138b5a4882181f248945c3a8262050",
+    "7fbd4d06965b1d152d6c037b0302f307", "7917a20573da241868689ed49c0d5972",
+    "ffdd4257d91fe00e61de4d2668f1ee07", "72999b6d3bf1ee189e9269a27105991f",
+    "1b63d7f25388c9af4adac60d46b7a8ca", "e3ce0977224197ade58aa979f3206d68",
+    "73178ffd388b46891fc4a0440686b554", "f1f99faf52cea98c825470c6edd1d973",
+    "e6fae5d5682862ec3377b714b6b69825", "a4f96cca8da155204b0cc4258b068d3c",
+    "75c7674c2356325dcb14c222266c46f8", "932b23521c9d9d06096879a665a14e28",
+    "8ed48a84a99b4a5bf2ec8a7a2c1f1c79", "4f6f0214857a92ad92eca1c33a762424",
+    "34865190c3e91200a0609a6e770ebc5c", "e793f1f2e46876b1e417da5d59475fda",
+    "e83cd9a228941a152f6878aa939e1290", "d6f5cd74ba386bd98282e1fcb0528dbd",
+    "131b55ec66ffe76f9088f7b35d38c0dd", "2d0ae8ee059cbd8c7816e3c862efdf37",
+    "65baadd2cb85ffbc6480bf8c1f128d1a", "2b8e8af333c464b4213bbd9185a9b751",
+    "951fd5faed77a1ae9bf5ef8f30bd65c3", "41d38d40dfe9da2b9ff2146711bf6ab5",
+    "7430bde28aed5a9429db54ea663a5e26", "46576d59a13756c494793ad4b3a663e5",
+    "21802d0db30caa44cbdba2ac84cc49b5", "591cad82ae106d9e9670acd5b60e4548",
+    "c0484c58c6c009939e7f3ec0c1aa8e2d", "6405c55d0a1830cfdd37950bfd65fd6f",
+    "3bd74c067d2ba027fc004e9bf62254db", "6e920e6dbdbe55a97ff2bf3dfb38a3e0",
+    "e2ed20f89da293516b14be766a624299", "0a613ee53ec38cad995faa17a24fcb8f",
+    "0de937145c030d766c3f9fff09d7e39c", "4a560325b804fcb6643866e971ade8e8",
+    "be82c41d3a0f8bd4032c3e5e45b453da", "b27219f02db167bf5a416831b908b031",
+    "7cf5437e25d362bc373dd53d8fd78186", "39c801e28cc08150c2016083113d1a03",
+    "785a21219d9c42a7c5bd417f365535a3", "008c79298a87837bcb504c4dc39ca628",
+    "af24d1d6f4d3ee94f2af52471a64ca1f", "cd82218aae9815c106336aec7ce18833",
+    "9f405c66d4ce7533213c4ca82feaf252", "7ceda4ea6ddeccd04dbf6d3237fe956a",
+    "ae21b52869b85a64fa4e3a85a2a8bb8d", "a004927cdbf48e0dafcccfb6066cdd0c",
+    "949337a963a8a5c0f46cf774b078a7cd", "24f58b8db17d02f66d04d22ca6c5e026",
+    "2b1315a2e7c5d5309a7621651e741616", "5b317ef820e6c8e7ea7a7d7022e8349d",
+    "debd504650d35d9deca4c2461094949f", "19d0ca33e5b3a0afff1f39f0f42238e0",
+    "df1c6c7582bfa5ceb147a8dd253cfa43", "176647077c5e2d985b3807134aac118f",
+    "dd2850172602688eaaa768f705c1ba67", "6ba1a3929ae9725fc688b8189b16314f",
+    "639189abb754dfa6be3c813ee8342954", "d5d1b8bff370f280fba13827d6bdf0fb",
+    "4b0ad4ea387a952724cab42730f712d2", "8c9c1f09946b61315e9a45c7e39f1992",
+    "50ef75c2b7a17f972586ce053eb62d24", "d5922dd01d8d02ca00ab9648a3db343f",
+    "091f517b18f4438ea9c581b7471f2fc0", "fede855bfb936caaa8fb4a434adac1d3",
+    "081b612f810f38c5ff6dc1cd03bf2eb6", "bd10e764eaf7d7e0ec89de96423d0afe",
+    "3e64cb1355e05b0a4b0237fae3f33bb2", "7cb92e0ecc0dd06d0a5d248efba48630",
+    "ec875f2e155a2e124ef52bf35e9a876c", "15529c83eae41bfa804f2c386f480e90",
+    "ee0e59567874155fb54de63fc901ded7", "4ad160b0d0f5166f9cddf7235725406e",
+    "176b64b3883c33e2aa251159983ccaa1", "d9cca01946d2a47c0114b1f49e4d688f",
+    "73d706a13afa279d9c716b3ba3a2ed68", "dea5a7f010d2f1385fe2b7d1d36aafb0",
+    "b5432fbc22d2f96c1230cc33178da09e", "8b0e7399ce98b68de4048411ab649468",
+    "3d52c986a5a5852a4620fbb38259a109", "eb61882738fefdd105094d4c104cf8b0",
+    "24fbc0d3ee28e937cfa1a3fbbc4e8214", "c69eb0687e477c27ac0d4c5fe54bbe8b",
+    "00a4f498f05b2b348252927ecc82c8a3", "c76471a61250be52e8d5933e582b1e19",
+    "22ebb8812dd795fdc14f20a7f9f89844", "f7c7d5c04bc234545726f4b116b623ec",
+    "9fc323d6619af0101edfacb4e9c2b647", "902d7888215d6aac1cf41f1fb6a916d8",
+    "5817d80a0504a5b08627502aeece4f38", "a1afa4b4065c143bc4857e364cec7f3d",
+    "506d5a6ff434411ea893bb2dc021aa25", "31cd3ca39015ccee1e217e1c83fff2a0",
+    "eb1ed4ef292c7d8fead1f113c9fd998f", "35f3abf3a056b778e3d7885f8df6c07a",
+    "299d71ee557382f5e64f26f1a8e4e156", "12f8c591a4e257bcc26b385424cd8d47",
+    "0b273b03d817af587c8fb23de71f346d", "1d7592fe89c661e9f61d215d235aa2ee",
+    "331dc544956ee14064ab432c85d52828", "a0a4ccbe1c442717ad40b7d40ed81a40",
+    "45009d915bf1d4ab855b5b670d314839", "641dfe93841aaa18888cebb17b8566eb",
+    "2b177c880ce0c2b4e891abc1dc23dfc2", "23984491f7d6c206fb8babafc9aacfdb",
+    "5841b93edb22c702035e31b26c58a728", "9852506766cb47f48783640d14753089",
+    "8a43698d32f63b1e7191482e4b274fc3", "7bdef02623beae507a651ad398422876",
+    "b105138645ad27657a08a3a8e8871a7e", "913e40ebbf1b983ca4956b85364b9459",
+    "5776f97b4f0cfa435a99d5d90822922d", "a0ae92a24c2b20039d996ee2a7d8b107",
+    "a925cc792412e2a7abe89367c9fe28b1", "778183eab5c9e0ee559d828d8347a21c",
+    "c4b4777355a4c8e8858faec37ba23eec", "4cdd41c3648e8d05c3e8f58d08385f8b",
+    "7c1246737874f984feb1b5827a1f95db", "c75d766ff5af8db39d400962d5aba0b4",
+    "964f010f5aa6748461ca5573b013091d", "b003f3eab3b118e5a8a85c1873b3bb55"};
+
+TEST(WedgePredictionMaskTest, GenerateWedgeMask) {
+  WedgeMaskArray wedge_masks;
+  ASSERT_TRUE(GenerateWedgeMask(&wedge_masks));
+
+  // Check wedge masks.
+  int block_size_index = 0;
+  int index = 0;
+  for (int block_size = kBlock8x8; block_size < kMaxBlockSizes; ++block_size) {
+    const int width = kBlockWidthPixels[block_size];
+    const int height = kBlockHeightPixels[block_size];
+    if (width < 8 || height < 8 || width > 32 || height > 32) continue;
+
+    for (int flip_sign = 0; flip_sign <= 1; ++flip_sign) {
+      for (int direction = 0; direction < kWedgeDirectionTypes; ++direction) {
+        uint8_t* const block_wedge_mask =
+            wedge_masks[block_size_index][flip_sign][direction][0];
+        const std::string digest =
+            test_utils::GetMd5Sum(block_wedge_mask, width * height);
+        EXPECT_STREQ(digest.c_str(), kExpectedWedgeMask[index]);
+        index++;
+      }
+    }
+    block_size_index++;
+  }
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/quantizer.cc b/src/quantizer.cc
new file mode 100644 (file)
index 0000000..eb13314
--- /dev/null
@@ -0,0 +1,344 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/quantizer.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10 && \
+    LIBGAV1_MAX_BITDEPTH != 12
+#error LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12
+#endif
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/quantizer_tables.inc"
+
+// Format the kDcLookup and kAcLookup arrays manually for easier comparison
+// with the Dc_Qlookup and Ac_Qlookup arrays in Section 7.12.2.
+
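+// Each table is indexed by qindex (0..255) and yields the quantizer step
+// size for the corresponding bitdepth.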
+// clang-format off
+constexpr int16_t kDcLookup[][256] = {
+  // Lookup table for 8 bit.
+  {
+    4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16,
+    17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26,
+    27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37,
+    38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47,
+    48, 48, 49, 50, 51, 52, 53, 53, 54, 55, 56, 57,
+    57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 66, 66,
+    67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76,
+    77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85,
+    87, 88, 90, 92, 93, 95, 96, 98, 99, 101, 102, 104,
+    105, 107, 108, 110, 111, 113, 114, 116, 117, 118, 120, 121,
+    123, 125, 127, 129, 131, 134, 136, 138, 140, 142, 144, 146,
+    148, 150, 152, 154, 156, 158, 161, 164, 166, 169, 172, 174,
+    177, 180, 182, 185, 187, 190, 192, 195, 199, 202, 205, 208,
+    211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247,
+    250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292,
+    296, 300, 304, 309, 313, 317, 322, 326, 330, 335, 340, 344,
+    349, 354, 359, 364, 369, 374, 379, 384, 389, 395, 400, 406,
+    411, 417, 423, 429, 435, 441, 447, 454, 461, 467, 475, 482,
+    489, 497, 505, 513, 522, 530, 539, 549, 559, 569, 579, 590,
+    602, 614, 626, 640, 654, 668, 684, 700, 717, 736, 755, 775,
+    796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139,
+    1184, 1232, 1282, 1336
+  },
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  // Lookup table for 10 bit.
+  {
+    4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34,
+    37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75,
+    78, 82, 86, 90, 93, 97, 101, 105, 109, 113, 116, 120,
+    124, 128, 132, 136, 140, 143, 147, 151, 155, 159, 163, 166,
+    170, 174, 178, 182, 185, 189, 193, 197, 200, 204, 208, 212,
+    215, 219, 223, 226, 230, 233, 237, 241, 244, 248, 251, 255,
+    259, 262, 266, 269, 273, 276, 280, 283, 287, 290, 293, 297,
+    300, 304, 307, 310, 314, 317, 321, 324, 327, 331, 334, 337,
+    343, 350, 356, 362, 369, 375, 381, 387, 394, 400, 406, 412,
+    418, 424, 430, 436, 442, 448, 454, 460, 466, 472, 478, 484,
+    490, 499, 507, 516, 525, 533, 542, 550, 559, 567, 576, 584,
+    592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, 698,
+    708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831,
+    844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988,
+    1001, 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170,
+    1186, 1202, 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379,
+    1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624,
+    1647, 1670, 1692, 1717, 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929,
+    1958, 1990, 2021, 2054, 2088, 2123, 2159, 2197, 2236, 2276, 2319, 2363,
+    2410, 2458, 2508, 2561, 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102,
+    3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, 4089, 4236, 4394, 4559,
+    4737, 4929, 5130, 5347
+  },
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+  // Lookup table for 12 bit.
+  {
+    4, 12, 18, 25, 33, 41, 50, 60,
+    70, 80, 91, 103, 115, 127, 140, 153,
+    166, 180, 194, 208, 222, 237, 251, 266,
+    281, 296, 312, 327, 343, 358, 374, 390,
+    405, 421, 437, 453, 469, 484, 500, 516,
+    532, 548, 564, 580, 596, 611, 627, 643,
+    659, 674, 690, 706, 721, 737, 752, 768,
+    783, 798, 814, 829, 844, 859, 874, 889,
+    904, 919, 934, 949, 964, 978, 993, 1008,
+    1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122,
+    1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234,
+    1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342,
+    1368, 1393, 1419, 1444, 1469, 1494, 1519, 1544,
+    1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741,
+    1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933,
+    1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199,
+    2233, 2267, 2300, 2334, 2367, 2400, 2434, 2467,
+    2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788,
+    2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127,
+    3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517,
+    3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951,
+    4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420,
+    4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942,
+    5013, 5083, 5153, 5222, 5291, 5367, 5442, 5517,
+    5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149,
+    6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867,
+    6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715,
+    7832, 7958, 8085, 8214, 8352, 8492, 8635, 8788,
+    8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245,
+    10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
+    12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
+    16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+};
+
+constexpr int16_t kAcLookup[][256] = {
+  // Lookup table for 8 bit.
+  {
+    4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+    19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+    31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+    43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+    55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+    67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
+    79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+    91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
+    104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
+    128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150,
+    152, 155, 158, 161, 164, 167, 170, 173, 176, 179, 182, 185,
+    188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, 227,
+    231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280,
+    285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347,
+    353, 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432,
+    440, 448, 456, 465, 474, 483, 492, 501, 510, 520, 530, 540,
+    550, 560, 571, 582, 593, 604, 615, 627, 639, 651, 663, 676,
+    689, 702, 715, 729, 743, 757, 771, 786, 801, 816, 832, 848,
+    864, 881, 898, 915, 933, 951, 969, 988, 1007, 1026, 1046, 1066,
+    1087, 1108, 1129, 1151, 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
+    1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, 1597, 1628, 1660, 1692,
+    1725, 1759, 1793, 1828
+  },
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  // Lookup table for 10 bit.
+  {
+    4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37,
+    40, 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83,
+    88, 92, 96, 100, 105, 109, 114, 118, 122, 127, 131, 136,
+    140, 145, 149, 154, 158, 163, 168, 172, 177, 181, 186, 190,
+    195, 199, 204, 208, 213, 217, 222, 226, 231, 235, 240, 244,
+    249, 253, 258, 262, 267, 271, 275, 280, 284, 289, 293, 297,
+    302, 306, 311, 315, 319, 324, 328, 332, 337, 341, 345, 349,
+    354, 358, 362, 367, 371, 375, 379, 384, 388, 392, 396, 401,
+    409, 417, 425, 433, 441, 449, 458, 466, 474, 482, 490, 498,
+    506, 514, 523, 531, 539, 547, 555, 563, 571, 579, 588, 596,
+    604, 616, 628, 640, 652, 664, 676, 688, 700, 713, 725, 737,
+    749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, 905,
+    922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118,
+    1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386,
+    1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727,
+    1759, 1791, 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159,
+    2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703,
+    2755, 2807, 2859, 2915, 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391,
+    3455, 3523, 3591, 3659, 3731, 3803, 3876, 3952, 4028, 4104, 4184, 4264,
+    4348, 4432, 4516, 4604, 4692, 4784, 4876, 4972, 5068, 5168, 5268, 5372,
+    5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, 6388, 6512, 6640, 6768,
+    6900, 7036, 7172, 7312
+  },
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+  // Lookup table for 12 bit.
+  {
+    4, 13, 19, 27, 35, 44, 54, 64,
+    75, 87, 99, 112, 126, 139, 154, 168,
+    183, 199, 214, 230, 247, 263, 280, 297,
+    314, 331, 349, 366, 384, 402, 420, 438,
+    456, 475, 493, 511, 530, 548, 567, 586,
+    604, 623, 642, 660, 679, 698, 716, 735,
+    753, 772, 791, 809, 828, 846, 865, 884,
+    902, 920, 939, 957, 976, 994, 1012, 1030,
+    1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175,
+    1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317,
+    1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457,
+    1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595,
+    1627, 1660, 1693, 1725, 1758, 1791, 1824, 1856,
+    1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118,
+    2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378,
+    2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750,
+    2798, 2847, 2895, 2943, 2992, 3040, 3088, 3137,
+    3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619,
+    3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149,
+    4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791,
+    4871, 4967, 5064, 5160, 5256, 5352, 5448, 5544,
+    5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410,
+    6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435,
+    7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635,
+    8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028,
+    10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
+    11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
+    13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
+    16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
+    18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
+    21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
+    25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+};
+// clang-format on
+
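+// Copies the transpose of |src|, viewed as a matrix with |src_height| rows
+// and |src_width| columns, into |dst|. (Descriptive note added for clarity;
+// the behavior follows directly from the loop below.)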
+void Transpose(uint8_t* const dst, const uint8_t* const src, int src_width,
+               int src_height) {
+  const int dst_width = src_height;
+  const int dst_height = src_width;
+  Array2DView<const uint8_t> source(src_height, src_width, src);
+  Array2DView<uint8_t> dest(dst_height, dst_width, dst);
+  for (int y = 0; y < dst_height; ++y) {
+    for (int x = 0; x < dst_width; ++x) {
+      dest[y][x] = source[x][y];
+    }
+  }
+}
+
+// Fills the |size|x|size| matrix |dst| from |src|, which stores only the
+// lower triangle in row-major order; each entry is mirrored into the upper
+// triangle so that |dst| is symmetric.
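+// For example, with size = 3 and src = {a, b, c, d, e, f}, |dst| becomes
+//   a b d
+//   b c e
+//   d e f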
+void FillUpperTriangle(uint8_t* dst, const uint8_t* src, int size) {
+  Array2DView<uint8_t> dest(size, size, dst);
+  int k = 0;
+  for (int y = 0; y < size; ++y) {
+    for (int x = 0; x <= y; ++x) {
+      dest[y][x] = dest[x][y] = src[k++];
+    }
+  }
+}
+
+}  // namespace
+
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix_ptr) {
+  for (int level = 0; level < kNumQuantizerLevelsForQuantizerMatrix; ++level) {
+    for (int plane_type = kPlaneTypeY; plane_type < kNumPlaneTypes;
+         ++plane_type) {
+      auto& quantizer_matrix = (*quantizer_matrix_ptr)[level][plane_type];
+      // Notes about how these matrices are populated:
+      // * For square transforms, we store only the lower left triangle (the
+      // matrix is symmetric about the main diagonal), so when populating the
+      // matrix we have to fill in the upper right triangle as well.
+      // * For rectangular transforms, the matrices for reversed width and
+      // height are transposes of each other, so only the w < h tables are
+      // stored. When populating, we copy with memcpy when w < h and
+      // transpose the stored w < h table when w > h.
+      // * There is a special case for 16x16: its matrix is the 32x32 matrix
+      // subsampled at even rows and columns.
+      // * We use the "adjusted transform size" when using these matrices, so
+      // we never have to populate them for transform sizes with one of the
+      // dimensions equal to 64.
+      for (int tx_size = 0; tx_size < kNumTransformSizes; ++tx_size) {
+        if (kTransformWidth[tx_size] == 64 || kTransformHeight[tx_size] == 64) {
+          continue;
+        }
+        const int size = kTransformWidth[tx_size] * kTransformHeight[tx_size];
+        if (!quantizer_matrix[tx_size].Resize(size)) {
+          return false;
+        }
+      }
+#define QUANTIZER_MEMCPY(W, H)                            \
+  memcpy(quantizer_matrix[kTransformSize##W##x##H].get(), \
+         kQuantizerMatrix##W##x##H[level][plane_type], (W) * (H))
+#define QUANTIZER_TRANSPOSE(W, H)                            \
+  Transpose(quantizer_matrix[kTransformSize##W##x##H].get(), \
+            kQuantizerMatrix##H##x##W[level][plane_type], H, W)
+#define QUANTIZER_FILL_UPPER_TRIANGLE(SIZE)                                \
+  FillUpperTriangle(quantizer_matrix[kTransformSize##SIZE##x##SIZE].get(), \
+                    kQuantizerMatrix##SIZE##x##SIZE[level][plane_type], SIZE)
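+      // For example, QUANTIZER_TRANSPOSE(8, 4) expands to
+      //   Transpose(quantizer_matrix[kTransformSize8x4].get(),
+      //             kQuantizerMatrix4x8[level][plane_type], 4, 8);
+      // i.e. the 8x4 matrix is produced by transposing the stored 4x8 table.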
+      QUANTIZER_FILL_UPPER_TRIANGLE(4);   // 4x4
+      QUANTIZER_MEMCPY(4, 8);             // 4x8
+      QUANTIZER_MEMCPY(4, 16);            // 4x16
+      QUANTIZER_TRANSPOSE(8, 4);          // 8x4
+      QUANTIZER_FILL_UPPER_TRIANGLE(8);   // 8x8
+      QUANTIZER_MEMCPY(8, 16);            // 8x16
+      QUANTIZER_MEMCPY(8, 32);            // 8x32
+      QUANTIZER_TRANSPOSE(16, 4);         // 16x4
+      QUANTIZER_TRANSPOSE(16, 8);         // 16x8
+      QUANTIZER_MEMCPY(16, 32);           // 16x32
+      QUANTIZER_TRANSPOSE(32, 8);         // 32x8
+      QUANTIZER_TRANSPOSE(32, 16);        // 32x16
+      QUANTIZER_FILL_UPPER_TRIANGLE(32);  // 32x32
+      // 16x16.
+      Array2DView<uint8_t> dst16x16(
+          16, 16, quantizer_matrix[kTransformSize16x16].get());
+      Array2DView<const uint8_t> src32x32(
+          32, 32, quantizer_matrix[kTransformSize32x32].get());
+      for (int y = 0; y < 16; ++y) {
+        for (int x = 0; x < 16; ++x) {
+          dst16x16[y][x] = src32x32[MultiplyBy2(y)][MultiplyBy2(x)];
+        }
+      }
+#undef QUANTIZER_FILL_UPPER_TRIANGLE
+#undef QUANTIZER_TRANSPOSE
+#undef QUANTIZER_MEMCPY
+    }
+  }
+  return true;
+}
+
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex) {
+  if (segmentation.FeatureActive(index, kSegmentFeatureQuantizer)) {
+    const int segment_qindex =
+        base_qindex +
+        segmentation.feature_data[index][kSegmentFeatureQuantizer];
+    return Clip3(segment_qindex, kMinQuantizer, kMaxQuantizer);
+  }
+  return base_qindex;
+}
+
+Quantizer::Quantizer(int bitdepth, const QuantizerParameters* params)
+    : params_(*params) {
+  assert(bitdepth >= 8 && bitdepth <= LIBGAV1_MAX_BITDEPTH);
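+  // BitdepthToArrayIndex() maps bitdepth 8/10/12 to index 0/1/2, matching
+  // the order of the per-bitdepth tables in kDcLookup and kAcLookup.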
+  const int index = BitdepthToArrayIndex(bitdepth);
+  dc_lookup_ = kDcLookup[index];
+  ac_lookup_ = kAcLookup[index];
+}
+
+int Quantizer::GetDcValue(Plane plane, int qindex) const {
+  return dc_lookup_[Clip3(qindex + params_.delta_dc[plane], kMinQuantizer,
+                          kMaxQuantizer)];
+}
+
+int Quantizer::GetAcValue(Plane plane, int qindex) const {
+  return ac_lookup_[Clip3(qindex + params_.delta_ac[plane], kMinQuantizer,
+                          kMaxQuantizer)];
+}
+
+}  // namespace libgav1
diff --git a/src/quantizer.h b/src/quantizer.h
new file mode 100644 (file)
index 0000000..c60756c
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_QUANTIZER_H_
+#define LIBGAV1_SRC_QUANTIZER_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+using QuantizerMatrix = std::array<
+    std::array<std::array<DynamicBuffer<uint8_t>, kNumTransformSizes>,
+               kNumPlaneTypes>,
+    kNumQuantizerLevelsForQuantizerMatrix>;
+
+// Implements the dequantization functions of Section 7.12.2.
+class Quantizer {
+ public:
+  Quantizer(int bitdepth, const QuantizerParameters* params);
+
+  // Returns the quantizer value for the dc coefficient for the given plane.
+  // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+  // the |base_qindex| argument, and pass the return value as the |qindex|
+  // argument to this method.
+  int GetDcValue(Plane plane, int qindex) const;
+
+  // Returns the quantizer value for the ac coefficient for the given plane.
+  // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+  // the |base_qindex| argument, and pass the return value as the |qindex|
+  // argument to this method.
+  int GetAcValue(Plane plane, int qindex) const;
+
+ private:
+  const QuantizerParameters& params_;
+  const int16_t* dc_lookup_;
+  const int16_t* ac_lookup_;
+};
+
+// Initializes the quantizer matrices for all quantizer levels, plane types,
+// and transform sizes. Returns false on memory allocation failure.
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix);
+
+// Gets the quantizer index for the |index|th segment.
+//
+// This function has two use cases. What should be passed as the |base_qindex|
+// argument depends on the use case.
+// 1. While parsing the uncompressed header or transform type, pass
+//    QuantizerParameters::base_index.
+//    Note: In this use case, the caller only cares about whether the return
+//    value is zero.
+// 2. To generate the |qindex| argument to Quantizer::GetDcValue() or
+//    Quantizer::GetAcValue(), pass Tile::current_quantizer_index_.
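+//
+// A rough usage sketch (the variable names here are illustrative, not taken
+// from an actual caller):
+//   const int qindex = GetQIndex(segmentation, segment_id, base_qindex);
+//   const int dc = quantizer.GetDcValue(kPlaneY, qindex);
+//   const int ac = quantizer.GetAcValue(kPlaneY, qindex);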
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_QUANTIZER_H_
diff --git a/src/quantizer_tables.inc b/src/quantizer_tables.inc
new file mode 100644 (file)
index 0000000..34342c4
--- /dev/null
@@ -0,0 +1,3080 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the quantizer table
+// definitions from the quantizer functions.
+
+constexpr uint8_t kQuantizerMatrix4x8
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][32] = {
+        {{32,  42,  75, 91,  33,  42,  69,  86,  37,  58, 84,
+          91,  49,  71, 103, 110, 65,  84,  125, 128, 80, 97,
+          142, 152, 91, 100, 145, 178, 104, 112, 146, 190},
+         {31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64,  48, 61, 75, 73,
+          54, 65, 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105}},
+        {{32,  42,  69, 88, 33,  42,  64, 83,  36,  56, 77,
+          88,  46,  67, 93, 105, 60,  79, 112, 122, 75, 92,
+          130, 144, 86, 95, 136, 167, 98, 105, 136, 177},
+         {31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72,
+          52, 64, 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102}},
+        {{32,  38,  62, 86, 32,  40,  58, 80, 34,  51, 68,
+          85,  44,  61, 85, 101, 54,  69, 98, 117, 72, 84,
+          118, 136, 82, 89, 129, 157, 92, 98, 127, 165},
+         {31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71,
+          50, 59, 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99}},
+        {{32,  35,  59, 83, 32,  36,  57, 78, 34,  47, 65,
+          82,  41,  53, 78, 97,  51,  61, 92, 111, 65, 73,
+          108, 129, 75, 81, 117, 148, 86, 92, 119, 154},
+         {31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70,
+          49, 55, 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95}},
+        {{32, 35, 51, 77,  32, 36, 50, 72,  34, 42, 54,  75,  38, 51, 67,  87,
+          48, 59, 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144},
+         {31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65,
+          47, 54, 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93}},
+        {{32, 35, 51, 75, 32, 36, 50, 71,  34, 42, 54, 73,  37, 50, 65,  84,
+          45, 56, 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136},
+         {31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64,
+          46, 54, 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90}},
+        {{32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58,  35, 43, 54, 68,
+          41, 48, 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111},
+         {31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57,
+          45, 49, 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79}},
+        {{32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59,
+          38, 40, 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97},
+         {31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53,
+          46, 47, 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73}},
+        {{32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54,
+          35, 38, 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83},
+         {31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50,
+          47, 48, 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67}},
+        {{31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42,
+          34, 37, 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67},
+         {31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45,
+          43, 47, 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59}},
+        {{31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40,
+          33, 34, 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56},
+         {31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46,
+          40, 44, 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55}},
+        {{31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36,
+          32, 33, 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48},
+         {31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47,
+          37, 39, 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53}},
+        {{31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          32, 32, 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36},
+         {31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40,
+          34, 36, 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47}},
+        {{31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+         {31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36,
+          31, 32, 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40}},
+        {{31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+         {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix4x16
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][64] = {
+        {{31,  44,  79,  96,  32,  41,  72,  90,  32,  42,  71,  86,  34,
+          48,  73,  83,  34,  54,  78,  89,  41,  63,  90,  95,  45,  67,
+          96,  102, 54,  75,  110, 111, 60,  79,  118, 123, 72,  90,  133,
+          135, 75,  92,  136, 149, 83,  100, 142, 160, 88,  100, 140, 173,
+          94,  101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197},
+         {31, 49, 63, 69,  32, 45, 57, 65,  36, 46, 56, 62,  43, 49, 57, 60,
+          46, 53, 60, 63,  45, 58, 67, 66,  46, 59, 71, 70,  50, 62, 78, 74,
+          52, 64, 82, 80,  57, 67, 89, 85,  59, 68, 90, 91,  62, 71, 91, 96,
+          63, 69, 89, 101, 65, 68, 89, 103, 67, 70, 86, 105, 69, 72, 88, 107}},
+        {{31,  44, 73,  93,  32,  41,  67,  87,  32,  42,  65,  83,  33,
+          44,  66, 81,  34,  54,  74,  86,  37,  58,  79,  92,  44,  66,
+          90,  98, 49,  71,  99,  107, 56,  77,  107, 117, 65,  84,  119,
+          129, 72, 90,  127, 141, 78,  95,  133, 151, 84,  95,  132, 163,
+          89,  95, 136, 169, 95,  101, 132, 175, 101, 108, 141, 183},
+         {31, 49, 61, 69, 32, 45, 55, 64,  36, 46, 54, 61,  41, 47, 54, 59,
+          46, 53, 59, 62, 46, 56, 62, 65,  46, 59, 68, 68,  48, 61, 73, 73,
+          51, 63, 77, 78, 54, 65, 82, 84,  57, 67, 86, 89,  60, 69, 88, 93,
+          62, 67, 86, 98, 64, 66, 87, 100, 65, 68, 83, 102, 67, 70, 86, 103}},
+        {{31,  39, 65,  90,  32,  38,  60,  84,  32,  39,  59,  81,  33,
+          40,  58, 78,  34,  47,  65,  83,  37,  54,  73,  89,  41,  58,
+          79,  94, 46,  62,  86,  102, 53,  68,  97,  112, 60,  73,  105,
+          123, 65, 78,  111, 134, 74,  85,  120, 143, 79,  90,  125, 154,
+          84,  90, 128, 158, 89,  95,  124, 164, 94,  101, 131, 170},
+         {31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58,
+          44, 51, 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71,
+          49, 58, 73, 77, 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91,
+          60, 66, 84, 95, 62, 64, 84, 97, 64, 66, 81, 99, 65, 68, 83, 100}},
+        {{31,  36, 62,  88,  32,  35, 58,  82,  32,  36,  57,  79,  33,
+          38,  56, 76,  34,  42,  61, 81,  34,  48,  66,  85,  39,  51,
+          74,  91, 44,  56,  82,  98, 49,  60,  90,  107, 54,  63,  95,
+          117, 60, 68,  102, 127, 68, 75,  110, 135, 75,  81,  117, 145,
+          79,  85, 120, 148, 84,  89, 116, 153, 88,  94,  123, 159},
+         {31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57,
+          43, 50, 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70,
+          48, 54, 70, 75, 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89,
+          58, 61, 81, 93, 60, 63, 82, 94, 62, 64, 79, 96, 63, 66, 81, 97}},
+        {{31,  36, 53,  81,  32,  35, 51,  76,  32,  35, 49,  73,  32,
+          37,  49, 71,  33,  41,  53, 74,  34,  48,  60, 80,  37,  50,
+          65,  85, 41,  53,  71,  91, 45,  56,  76,  98, 49,  60,  82,
+          105, 54, 63,  87,  112, 61, 69,  93,  121, 68, 75,  100, 130,
+          74,  80, 105, 137, 78,  84, 109, 142, 83,  88, 114, 148},
+         {31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56,
+          42, 49, 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68,
+          46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83,
+          56, 59, 73, 87, 58, 61, 75, 90, 60, 62, 76, 92, 62, 64, 78, 94}},
+        {{31, 36, 53, 79,  32, 35, 51, 75,  32, 34, 49,  72,  32, 36, 50,  71,
+          33, 38, 49, 69,  34, 42, 54, 73,  34, 48, 60,  78,  37, 50, 65,  84,
+          41, 53, 71, 90,  45, 56, 76, 96,  49, 60, 82,  103, 54, 63, 87,  110,
+          60, 68, 92, 118, 65, 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136},
+         {31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56,
+          40, 47, 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64,
+          45, 53, 61, 67, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78,
+          52, 56, 70, 82, 54, 58, 72, 85, 57, 60, 75, 89, 59, 61, 75, 90}},
+        {{31, 34, 44, 65, 32, 34, 43, 62,  32, 33, 41, 59,  32, 35, 43, 59,
+          32, 37, 43, 58, 34, 39, 48, 63,  34, 42, 53, 67,  36, 44, 57, 71,
+          39, 46, 60, 76, 42, 48, 64, 81,  45, 51, 67, 85,  50, 54, 72, 92,
+          54, 58, 76, 98, 60, 63, 80, 105, 66, 68, 85, 111, 73, 74, 91, 118},
+         {31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51,
+          40, 47, 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58,
+          46, 49, 57, 61, 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71,
+          50, 52, 63, 73, 52, 53, 64, 76, 55, 55, 66, 79, 58, 58, 68, 82}},
+        {{31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53,
+          32, 34, 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63,
+          37, 40, 57, 67, 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79,
+          50, 50, 71, 86, 54, 53, 74, 90, 57, 56, 77, 93, 61, 58, 79, 97},
+         {31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49,
+          37, 44, 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55,
+          46, 47, 55, 58, 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65,
+          48, 47, 61, 68, 50, 48, 62, 70, 51, 49, 63, 71, 53, 50, 64, 73}},
+        {{31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49,
+          32, 34, 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54,
+          35, 38, 49, 60, 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71,
+          45, 45, 58, 75, 47, 47, 60, 77, 51, 50, 63, 82, 55, 54, 67, 87},
+         {31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48,
+          35, 43, 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50,
+          47, 48, 53, 54, 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61,
+          47, 46, 55, 63, 48, 47, 55, 64, 49, 47, 56, 66, 51, 49, 57, 68}},
+        {{31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41,
+          32, 34, 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44,
+          34, 37, 42, 48, 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58,
+          40, 41, 51, 60, 42, 43, 53, 63, 45, 45, 56, 66, 46, 46, 56, 67},
+         {31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45,
+          34, 42, 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47,
+          42, 47, 50, 49, 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56,
+          47, 46, 53, 57, 46, 46, 53, 58, 48, 46, 54, 59, 48, 46, 54, 59}},
+        {{31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37,
+          32, 32, 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40,
+          32, 34, 37, 40, 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51,
+          35, 36, 43, 51, 38, 39, 45, 54, 38, 39, 45, 54, 42, 42, 48, 58},
+         {31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45,
+          31, 35, 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46,
+          38, 43, 47, 46, 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53,
+          48, 47, 50, 53, 47, 46, 50, 54, 47, 46, 50, 54, 47, 45, 49, 56}},
+        {{31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35,
+          32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36,
+          32, 32, 34, 37, 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41,
+          34, 34, 37, 42, 34, 34, 37, 44, 35, 34, 38, 48, 35, 34, 38, 48},
+         {31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46,
+          31, 32, 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47,
+          36, 37, 44, 47, 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49,
+          42, 43, 47, 50, 44, 44, 47, 51, 48, 46, 48, 53, 48, 46, 48, 53}},
+        {{31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33,
+          32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35,
+          32, 33, 34, 35, 32, 33, 34, 35, 33, 33, 35, 36, 34, 34, 36, 37},
+         {31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38,
+          31, 32, 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40,
+          33, 35, 40, 42, 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45,
+          38, 40, 45, 47, 38, 40, 45, 47, 39, 41, 45, 47, 42, 43, 46, 47}},
+        {{31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+          32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+         {31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35,
+          31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36,
+          31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37,
+          33, 35, 35, 39, 34, 36, 36, 40, 34, 36, 36, 40, 34, 36, 36, 40}},
+        {{31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+         {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 30, 31, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix8x16
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][128] = {
+        {{32,  32,  36,  53,  65,  87,  93,  99,  31,  33,  34,  49,  59,
+          78,  86,  93,  32,  34,  36,  50,  59,  77,  82,  89,  34,  37,
+          42,  54,  63,  79,  80,  88,  36,  38,  48,  60,  68,  84,  86,
+          90,  44,  43,  53,  71,  79,  95,  94,  97,  48,  46,  56,  76,
+          85,  102, 105, 105, 58,  54,  63,  87,  98,  116, 112, 115, 65,
+          58,  68,  92,  105, 124, 122, 124, 79,  70,  79,  104, 118, 141,
+          135, 135, 82,  72,  81,  106, 121, 144, 149, 146, 91,  80,  88,
+          106, 130, 148, 162, 159, 97,  86,  94,  107, 128, 157, 167, 171,
+          103, 93,  98,  114, 131, 150, 174, 186, 110, 100, 101, 117, 138,
+          161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203},
+         {32, 37, 48, 52, 57, 66, 68,  71,  30, 40, 46, 48, 52, 60, 63,  66,
+          33, 43, 47, 47, 51, 59, 60,  63,  42, 47, 50, 50, 53, 60, 59,  62,
+          49, 48, 53, 54, 57, 62, 62,  62,  49, 46, 53, 61, 64, 69, 66,  66,
+          50, 46, 54, 64, 67, 73, 72,  70,  54, 49, 55, 68, 73, 80, 76,  75,
+          57, 50, 56, 70, 76, 84, 80,  79,  63, 55, 60, 75, 82, 92, 87,  84,
+          64, 56, 61, 75, 83, 93, 93,  89,  68, 59, 64, 74, 86, 94, 98,  94,
+          70, 62, 66, 73, 83, 96, 99,  98,  72, 64, 66, 75, 83, 92, 101, 104,
+          74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, 91, 101, 109}},
+        {{32,  32,  36,  47,  65,  79,  90,  96,  31,  32,  35,  44,  60,
+          72,  84,  90,  32,  34,  36,  45,  59,  71,  80,  87,  32,  35,
+          40,  47,  60,  71,  78,  85,  36,  37,  48,  56,  68,  78,  83,
+          87,  39,  40,  50,  60,  73,  84,  91,  94,  47,  45,  56,  69,
+          84,  95,  101, 101, 53,  50,  60,  75,  92,  103, 108, 110, 61,
+          56,  65,  81,  100, 113, 116, 118, 71,  64,  73,  89,  111, 125,
+          129, 129, 79,  70,  79,  95,  118, 133, 142, 138, 86,  76,  84,
+          100, 124, 140, 153, 150, 92,  82,  89,  101, 121, 148, 157, 161,
+          98,  88,  93,  108, 124, 141, 163, 174, 104, 94,  95,  110, 129,
+          151, 171, 181, 110, 100, 98,  111, 127, 147, 169, 188},
+         {32, 35, 48, 50, 57, 63, 68,  70,  30, 38, 46, 46, 52, 58, 63, 65,
+          33, 41, 47, 46, 51, 56, 60,  63,  39, 46, 48, 47, 51, 55, 58, 61,
+          49, 48, 53, 54, 57, 60, 61,  61,  48, 46, 53, 56, 60, 64, 65, 65,
+          50, 46, 54, 61, 66, 70, 71,  69,  52, 47, 54, 63, 71, 75, 75, 74,
+          55, 49, 56, 65, 74, 79, 79,  78,  60, 53, 58, 68, 79, 85, 85, 82,
+          63, 55, 60, 70, 82, 89, 91,  87,  66, 58, 62, 72, 84, 91, 95, 91,
+          68, 60, 64, 71, 81, 94, 97,  96,  70, 62, 65, 73, 81, 89, 98, 101,
+          72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, 89, 98, 105}},
+        {{32,  32,  36,  44,  58,  79,  88,  93,  31,  32,  35,  41,  54,
+          73,  81,  88,  32,  33,  36,  42,  53,  71,  78,  84,  32,  34,
+          38,  42,  52,  69,  76,  82,  34,  36,  44,  50,  59,  75,  81,
+          84,  39,  39,  50,  58,  68,  84,  88,  90,  44,  42,  53,  63,
+          74,  90,  97,  97,  49,  46,  57,  67,  81,  97,  104, 105, 57,
+          53,  63,  74,  90,  108, 111, 113, 65,  59,  68,  79,  97,  118,
+          123, 122, 71,  64,  73,  84,  102, 125, 135, 131, 81,  72,  80,
+          91,  110, 135, 145, 141, 87,  77,  85,  96,  114, 140, 148, 151,
+          92,  83,  88,  102, 117, 133, 153, 163, 98,  88,  89,  103, 121,
+          141, 160, 169, 103, 94,  92,  103, 119, 137, 158, 175},
+         {32, 34, 48, 49, 54, 63, 67, 69,  31, 36, 46, 46, 50, 58, 62, 65,
+          33, 40, 47, 46, 49, 56, 59, 62,  37, 44, 47, 45, 48, 54, 57, 60,
+          44, 46, 51, 51, 53, 59, 60, 61,  48, 46, 53, 56, 58, 64, 64, 64,
+          49, 45, 53, 58, 62, 67, 70, 68,  51, 47, 54, 60, 65, 71, 73, 72,
+          54, 49, 55, 62, 70, 77, 77, 76,  57, 51, 56, 64, 73, 82, 83, 81,
+          60, 53, 58, 65, 75, 85, 89, 85,  64, 57, 61, 68, 78, 89, 93, 89,
+          66, 59, 63, 69, 79, 91, 94, 93,  68, 61, 63, 71, 79, 87, 96, 98,
+          70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, 95, 102}},
+        {{32, 31, 35,  44,  53,  65,  82,  90, 31, 32, 34,  41,  50,  61,  76,
+          85, 31, 33,  35,  42,  49,  59,  73, 81, 32, 34,  37,  42,  49,  58,
+          71, 79, 34,  35,  41,  48,  54,  63, 76, 81, 36,  36,  46,  54,  60,
+          68, 80, 87,  41,  40,  49,  60,  67, 76, 88, 93,  47,  44,  53,  66,
+          75, 84, 97,  101, 53,  50,  57,  71, 82, 92, 106, 108, 58,  54,  61,
+          75, 87, 98,  112, 116, 65,  59,  66, 79, 92, 105, 120, 124, 74,  67,
+          73, 86, 100, 113, 131, 134, 82,  73, 79, 92, 105, 120, 139, 142, 87,
+          78, 83, 96,  110, 125, 144, 153, 92, 83, 84, 97,  114, 132, 150, 157,
+          97, 88, 86,  97,  111, 128, 147, 163},
+         {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64,
+          33, 37, 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59,
+          42, 44, 49, 49, 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63,
+          48, 46, 51, 57, 59, 61, 66, 67, 50, 46, 52, 59, 63, 66, 71, 71,
+          52, 47, 53, 61, 66, 71, 75, 74, 54, 49, 54, 62, 68, 73, 79, 79,
+          57, 51, 55, 64, 70, 76, 83, 83, 61, 55, 58, 66, 73, 80, 87, 87,
+          64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, 77, 84, 93, 95,
+          68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, 92, 98}},
+        {{32, 31, 33, 40,  51,  65,  79,  87, 31, 32, 33, 39,  49,  61,  74,
+          82, 31, 32, 34,  38,  47,  59,  71, 79, 32, 33, 36,  40,  48,  58,
+          69, 77, 33, 34,  38,  44,  52,  62, 72, 78, 36, 35,  42,  51,  58,
+          68, 78, 84, 39,  38,  44,  54,  63, 73, 84, 89, 44,  41,  46,  59,
+          69, 79, 90, 96,  48,  45,  50,  62, 74, 85, 96, 103, 53,  49,  53,
+          66, 79, 92, 103, 111, 58,  54,  57, 70, 84, 98, 110, 118, 66,  60,
+          63, 75, 90, 106, 119, 126, 74,  67, 69, 81, 97, 113, 128, 134, 81,
+          73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147,
+          91, 82, 80, 90,  103, 119, 137, 151},
+         {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63,
+          31, 35, 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58,
+          41, 43, 48, 49, 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62,
+          48, 46, 49, 54, 57, 60, 64, 65, 49, 45, 48, 56, 61, 64, 67, 69,
+          50, 46, 49, 57, 63, 67, 71, 73, 52, 48, 50, 58, 65, 71, 75, 77,
+          54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 53, 61, 69, 77, 82, 85,
+          61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, 73, 82, 89, 92,
+          66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, 89, 95}},
+        {{32, 31, 32, 36, 44, 53,  65,  79,  31, 32, 32, 35, 42, 51,  62,  75,
+          31, 32, 33, 34, 41, 49,  59,  72,  32, 32, 34, 36, 42, 50,  59,  71,
+          32, 33, 35, 38, 42, 49,  58,  69,  34, 34, 37, 42, 48, 54,  63,  73,
+          36, 34, 38, 48, 54, 60,  68,  78,  39, 37, 40, 50, 58, 65,  73,  84,
+          44, 41, 43, 53, 63, 71,  79,  90,  48, 45, 46, 56, 67, 76,  85,  96,
+          53, 49, 50, 60, 71, 82,  92,  103, 58, 54, 54, 63, 75, 87,  98,  110,
+          65, 60, 58, 68, 79, 92,  105, 118, 71, 65, 63, 73, 84, 97,  111, 125,
+          79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136},
+         {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+          30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+          37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+          49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+          49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+          52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+          57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+          63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90}},
+        {{32, 31, 32, 36, 44, 53, 62,  73,  31, 32, 32, 35, 42, 51,  59,  69,
+          31, 32, 33, 34, 41, 49, 57,  66,  32, 32, 34, 36, 42, 50,  57,  65,
+          32, 33, 35, 38, 42, 49, 56,  64,  34, 34, 37, 42, 48, 54,  61,  69,
+          35, 34, 38, 47, 52, 59, 65,  73,  38, 36, 40, 49, 56, 63,  69,  77,
+          41, 39, 41, 51, 60, 67, 74,  81,  44, 42, 43, 54, 64, 72,  79,  86,
+          48, 45, 46, 56, 67, 76, 83,  91,  53, 49, 50, 60, 71, 82,  90,  99,
+          58, 54, 54, 63, 75, 87, 95,  105, 65, 60, 58, 68, 79, 92,  102, 112,
+          71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127},
+         {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57,
+          30, 32, 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54,
+          37, 40, 47, 47, 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56,
+          47, 46, 48, 52, 53, 53, 55, 58, 48, 46, 47, 53, 55, 56, 58, 61,
+          48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 62, 64, 66,
+          50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 70, 73,
+          54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, 64, 70, 75, 79,
+          60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, 80, 86}},
+        {{32, 31, 32, 35, 39, 44, 53, 65,  31, 32, 32, 35, 38, 42, 51, 62,
+          31, 32, 33, 34, 37, 41, 49, 59,  31, 32, 34, 35, 38, 42, 49, 59,
+          32, 32, 34, 36, 39, 42, 49, 58,  32, 33, 35, 37, 40, 42, 49, 58,
+          34, 34, 37, 41, 44, 48, 54, 63,  36, 34, 38, 46, 50, 54, 60, 68,
+          38, 37, 40, 47, 52, 57, 64, 72,  41, 39, 41, 49, 54, 60, 67, 76,
+          44, 41, 43, 51, 57, 63, 71, 79,  48, 45, 46, 54, 60, 67, 76, 85,
+          53, 49, 50, 57, 64, 71, 82, 92,  57, 53, 53, 60, 67, 74, 86, 97,
+          61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105},
+         {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54,
+          30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51,
+          35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+          42, 43, 47, 49, 50, 49, 50, 53, 49, 46, 48, 52, 53, 53, 54, 57,
+          48, 46, 47, 51, 54, 55, 57, 59, 48, 45, 46, 51, 54, 57, 59, 61,
+          49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 64, 67,
+          52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73,
+          55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76}},
+        {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51,
+          31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49,
+          32, 32, 34, 34, 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49,
+          32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+          36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63,
+          39, 37, 40, 42, 50, 58, 60, 65, 44, 41, 42, 45, 53, 63, 66, 71,
+          47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, 57, 67, 71, 77,
+          53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, 79, 87},
+         {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50,
+          30, 32, 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48,
+          33, 36, 41, 44, 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+          39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+          49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56,
+          48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 45, 47, 53, 58, 59, 61,
+          50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, 54, 60, 61, 64,
+          52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, 65, 68}},
+        {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45,
+          31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+          31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+          32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47,
+          34, 34, 35, 37, 41, 42, 48, 50, 35, 34, 36, 38, 45, 47, 52, 55,
+          36, 34, 36, 38, 46, 48, 54, 56, 39, 37, 39, 40, 48, 50, 58, 60,
+          41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+          47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70},
+         {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48,
+          31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+          33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+          37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47,
+          42, 43, 44, 47, 49, 50, 49, 50, 47, 46, 46, 48, 51, 52, 53, 53,
+          49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 51, 53, 56, 56,
+          48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+          50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61}},
+        {{32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42,
+          31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41,
+          31, 32, 32, 33, 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42,
+          32, 32, 32, 34, 34, 36, 36, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+          32, 33, 33, 35, 35, 38, 38, 42, 34, 34, 34, 37, 37, 42, 42, 48,
+          34, 34, 34, 37, 37, 42, 42, 48, 36, 34, 34, 38, 38, 48, 48, 54,
+          36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, 40, 50, 50, 58,
+          39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, 53, 63},
+         {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47,
+          31, 31, 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45,
+          30, 32, 32, 40, 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46,
+          33, 36, 36, 43, 43, 47, 47, 46, 37, 40, 40, 47, 47, 47, 47, 45,
+          37, 40, 40, 47, 47, 47, 47, 45, 42, 43, 43, 47, 47, 50, 50, 49,
+          42, 43, 43, 47, 47, 50, 50, 49, 49, 46, 46, 48, 48, 53, 53, 53,
+          49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 56,
+          48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, 53, 58}},
+        {{32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35,
+          31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+          31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+          31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+          32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 34, 35, 35, 37, 38,
+          32, 32, 33, 34, 35, 35, 37, 38, 33, 33, 33, 35, 36, 36, 40, 41,
+          34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44,
+          36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48},
+         {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47,
+          31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46,
+          30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+          33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+          35, 37, 37, 40, 44, 44, 46, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+          37, 39, 40, 43, 47, 47, 47, 47, 41, 42, 42, 44, 47, 47, 49, 49,
+          42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51,
+          49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53}},
+        {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+          31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+          31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+          32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 35, 35, 35,
+          32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+          32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, 37, 38},
+         {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39,
+          31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+          31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41,
+          30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41,
+          33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+          33, 35, 36, 36, 38, 43, 43, 44, 35, 37, 38, 38, 41, 45, 45, 46,
+          37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+          39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48}},
+        {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33,
+          31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+          32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+         {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36,
+          31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38,
+          30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+          30, 31, 32, 32, 32, 32, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39,
+          33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+          33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+          31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+          30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix8x32
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][256] = {
+        {{32,  32,  36,  53,  65,  87,  93,  99,  31,  32,  35,  51,  62,  82,
+          88,  94,  31,  33,  34,  49,  59,  78,  86,  93,  31,  33,  35,  49,
+          59,  78,  84,  90,  32,  34,  36,  50,  59,  77,  82,  89,  32,  35,
+          38,  49,  58,  75,  82,  89,  34,  37,  42,  54,  63,  79,  80,  88,
+          35,  37,  45,  57,  65,  82,  84,  87,  36,  38,  48,  60,  68,  84,
+          86,  90,  39,  40,  50,  65,  73,  89,  91,  93,  44,  43,  53,  71,
+          79,  95,  94,  97,  46,  44,  55,  73,  82,  98,  98,  99,  48,  46,
+          56,  76,  85,  102, 105, 105, 53,  50,  60,  82,  92,  109, 107, 107,
+          58,  54,  63,  87,  98,  116, 112, 115, 61,  56,  66,  89,  101, 120,
+          119, 116, 65,  58,  68,  92,  105, 124, 122, 124, 71,  63,  73,  97,
+          111, 132, 130, 127, 79,  70,  79,  104, 118, 141, 135, 135, 81,  71,
+          80,  105, 119, 142, 140, 139, 82,  72,  81,  106, 121, 144, 149, 146,
+          88,  77,  85,  108, 126, 149, 153, 152, 91,  80,  88,  106, 130, 148,
+          162, 159, 94,  83,  91,  105, 131, 153, 165, 166, 97,  86,  94,  107,
+          128, 157, 167, 171, 100, 89,  97,  111, 127, 152, 173, 182, 103, 93,
+          98,  114, 131, 150, 174, 186, 107, 96,  100, 117, 136, 155, 177, 191,
+          110, 100, 101, 117, 138, 161, 183, 193, 114, 104, 103, 117, 137, 159,
+          185, 201, 118, 107, 105, 118, 136, 157, 182, 203, 122, 111, 107, 119,
+          136, 156, 179, 204},
+         {32, 37, 48, 52, 57, 66, 68,  71,  31, 38, 47, 50, 54, 63, 65,  67,
+          30, 40, 46, 48, 52, 60, 63,  66,  32, 41, 46, 48, 51, 59, 62,  64,
+          33, 43, 47, 47, 51, 59, 60,  63,  37, 47, 47, 47, 50, 57, 60,  62,
+          42, 47, 50, 50, 53, 60, 59,  62,  45, 47, 51, 52, 55, 61, 61,  61,
+          49, 48, 53, 54, 57, 62, 62,  62,  48, 47, 53, 57, 60, 66, 65,  64,
+          49, 46, 53, 61, 64, 69, 66,  66,  49, 46, 53, 62, 65, 71, 68,  67,
+          50, 46, 54, 64, 67, 73, 72,  70,  52, 47, 54, 66, 71, 77, 73,  71,
+          54, 49, 55, 68, 73, 80, 76,  75,  55, 49, 56, 69, 75, 82, 79,  76,
+          57, 50, 56, 70, 76, 84, 80,  79,  60, 52, 58, 72, 79, 88, 84,  81,
+          63, 55, 60, 75, 82, 92, 87,  84,  64, 55, 61, 75, 82, 92, 89,  86,
+          64, 56, 61, 75, 83, 93, 93,  89,  67, 58, 63, 76, 85, 95, 94,  91,
+          68, 59, 64, 74, 86, 94, 98,  94,  69, 60, 65, 72, 85, 95, 99,  97,
+          70, 62, 66, 73, 83, 96, 99,  98,  71, 63, 67, 74, 82, 93, 102, 102,
+          72, 64, 66, 75, 83, 92, 101, 104, 73, 65, 66, 75, 84, 93, 102, 106,
+          74, 67, 66, 74, 84, 94, 103, 106, 75, 68, 66, 74, 83, 93, 103, 109,
+          76, 69, 67, 73, 82, 91, 101, 109, 77, 70, 67, 73, 81, 90, 99,  108}},
+        {{32,  32,  36,  47,  65,  79,  90,  96,  31,  32,  35,  45,  62,  75,
+          86,  91,  31,  32,  35,  44,  60,  72,  84,  90,  31,  33,  35,  44,
+          59,  71,  82,  87,  32,  34,  36,  45,  59,  71,  80,  87,  32,  35,
+          38,  45,  58,  69,  80,  86,  32,  35,  40,  47,  60,  71,  78,  85,
+          34,  36,  42,  50,  63,  73,  82,  84,  36,  37,  48,  56,  68,  78,
+          83,  87,  38,  39,  49,  58,  71,  81,  88,  90,  39,  40,  50,  60,
+          73,  84,  91,  94,  44,  42,  53,  66,  79,  90,  94,  96,  47,  45,
+          56,  69,  84,  95,  101, 101, 49,  47,  57,  71,  86,  97,  103, 102,
+          53,  50,  60,  75,  92,  103, 108, 110, 58,  54,  63,  79,  98,  110,
+          114, 111, 61,  56,  65,  81,  100, 113, 116, 118, 65,  59,  68,  84,
+          105, 118, 124, 121, 71,  64,  73,  89,  111, 125, 129, 129, 76,  68,
+          76,  92,  115, 130, 134, 132, 79,  70,  79,  95,  118, 133, 142, 138,
+          82,  73,  81,  97,  121, 136, 145, 144, 86,  76,  84,  100, 124, 140,
+          153, 150, 89,  79,  87,  99,  124, 145, 156, 156, 92,  82,  89,  101,
+          121, 148, 157, 161, 95,  85,  92,  105, 120, 143, 163, 171, 98,  88,
+          93,  108, 124, 141, 163, 174, 101, 91,  94,  110, 128, 146, 166, 179,
+          104, 94,  95,  110, 129, 151, 171, 181, 107, 97,  96,  110, 128, 149,
+          173, 188, 110, 100, 98,  111, 127, 147, 169, 188, 114, 104, 100, 111,
+          127, 145, 166, 190},
+         {32, 35, 48, 50, 57, 63, 68,  70,  31, 37, 47, 48, 54, 60, 64,  66,
+          30, 38, 46, 46, 52, 58, 63,  65,  31, 38, 46, 46, 52, 57, 61,  63,
+          33, 41, 47, 46, 51, 56, 60,  63,  37, 45, 47, 46, 50, 54, 59,  62,
+          39, 46, 48, 47, 51, 55, 58,  61,  42, 46, 50, 50, 53, 57, 60,  60,
+          49, 48, 53, 54, 57, 60, 61,  61,  48, 47, 53, 55, 58, 62, 64,  63,
+          48, 46, 53, 56, 60, 64, 65,  65,  49, 45, 53, 59, 64, 67, 67,  66,
+          50, 46, 54, 61, 66, 70, 71,  69,  51, 47, 54, 61, 68, 71, 72,  70,
+          52, 47, 54, 63, 71, 75, 75,  74,  54, 49, 55, 65, 73, 78, 78,  74,
+          55, 49, 56, 65, 74, 79, 79,  78,  57, 50, 56, 66, 76, 82, 83,  79,
+          60, 53, 58, 68, 79, 85, 85,  82,  62, 54, 60, 69, 81, 87, 87,  84,
+          63, 55, 60, 70, 82, 89, 91,  87,  64, 56, 61, 71, 83, 90, 92,  89,
+          66, 58, 62, 72, 84, 91, 95,  91,  67, 59, 63, 71, 83, 93, 96,  94,
+          68, 60, 64, 71, 81, 94, 97,  96,  69, 61, 65, 72, 80, 91, 99,  100,
+          70, 62, 65, 73, 81, 89, 98,  101, 71, 64, 65, 73, 82, 90, 99,  103,
+          72, 65, 65, 72, 82, 92, 100, 103, 73, 66, 65, 72, 81, 90, 100, 105,
+          74, 67, 65, 71, 79, 89, 98,  105, 75, 68, 65, 71, 78, 87, 96,  105}},
+        {{32,  32,  36,  44,  58,  79,  88,  93,  31,  32,  35,  42,  55,  75,
+          83,  88,  31,  32,  35,  41,  54,  73,  81,  88,  31,  32,  34,  41,
+          53,  72,  79,  84,  32,  33,  36,  42,  53,  71,  78,  84,  32,  34,
+          37,  42,  53,  70,  77,  83,  32,  34,  38,  42,  52,  69,  76,  82,
+          34,  35,  42,  48,  57,  73,  79,  81,  34,  36,  44,  50,  59,  75,
+          81,  84,  36,  37,  48,  54,  63,  78,  85,  86,  39,  39,  50,  58,
+          68,  84,  88,  90,  40,  40,  51,  59,  70,  85,  91,  92,  44,  42,
+          53,  63,  74,  90,  97,  97,  47,  45,  56,  66,  79,  95,  99,  98,
+          49,  46,  57,  67,  81,  97,  104, 105, 53,  50,  60,  71,  86,  103,
+          109, 106, 57,  53,  63,  74,  90,  108, 111, 113, 59,  54,  64,  75,
+          91,  111, 119, 115, 65,  59,  68,  79,  97,  118, 123, 122, 69,  62,
+          71,  83,  100, 122, 127, 125, 71,  64,  73,  84,  102, 125, 135, 131,
+          79,  71,  79,  90,  109, 133, 137, 136, 81,  72,  80,  91,  110, 135,
+          145, 141, 82,  73,  81,  92,  111, 136, 147, 147, 87,  77,  85,  96,
+          114, 140, 148, 151, 90,  80,  87,  99,  113, 135, 153, 160, 92,  83,
+          88,  102, 117, 133, 153, 163, 95,  85,  88,  103, 120, 137, 155, 168,
+          98,  88,  89,  103, 121, 141, 160, 169, 100, 91,  90,  103, 120, 139,
+          161, 175, 103, 94,  92,  103, 119, 137, 158, 175, 106, 97,  93,  104,
+          118, 135, 155, 176},
+         {32, 34, 48, 49, 54, 63, 67, 69,  31, 35, 47, 47, 51, 60, 63, 65,
+          31, 36, 46, 46, 50, 58, 62, 65,  30, 36, 46, 45, 49, 57, 60, 62,
+          33, 40, 47, 46, 49, 56, 59, 62,  35, 42, 47, 45, 48, 55, 58, 61,
+          37, 44, 47, 45, 48, 54, 57, 60,  42, 45, 50, 49, 51, 57, 59, 59,
+          44, 46, 51, 51, 53, 59, 60, 61,  49, 47, 53, 53, 55, 60, 63, 62,
+          48, 46, 53, 56, 58, 64, 64, 64,  48, 46, 53, 56, 59, 65, 66, 65,
+          49, 45, 53, 58, 62, 67, 70, 68,  50, 46, 54, 59, 65, 70, 70, 68,
+          51, 47, 54, 60, 65, 71, 73, 72,  52, 47, 54, 61, 68, 75, 76, 73,
+          54, 49, 55, 62, 70, 77, 77, 76,  54, 49, 55, 62, 70, 78, 81, 77,
+          57, 51, 56, 64, 73, 82, 83, 81,  59, 52, 58, 65, 74, 84, 85, 82,
+          60, 53, 58, 65, 75, 85, 89, 85,  63, 56, 60, 67, 77, 89, 90, 87,
+          64, 57, 61, 68, 78, 89, 93, 89,  64, 57, 61, 68, 78, 90, 94, 92,
+          66, 59, 63, 69, 79, 91, 94, 93,  67, 60, 63, 70, 78, 88, 96, 97,
+          68, 61, 63, 71, 79, 87, 96, 98,  69, 62, 63, 71, 80, 88, 96, 100,
+          70, 63, 63, 70, 80, 89, 97, 100, 71, 64, 63, 70, 78, 88, 97, 102,
+          72, 65, 63, 69, 77, 86, 95, 102, 73, 66, 63, 69, 76, 84, 93, 101}},
+        {{32,  31,  35,  44,  53,  65,  82,  90,  31,  32,  35,  42,  51,  62,
+          78,  86,  31,  32,  34,  41,  50,  61,  76,  85,  31,  32,  34,  41,
+          49,  59,  74,  82,  31,  33,  35,  42,  49,  59,  73,  81,  32,  33,
+          36,  42,  50,  59,  73,  80,  32,  34,  37,  42,  49,  58,  71,  79,
+          32,  34,  39,  44,  51,  60,  73,  78,  34,  35,  41,  48,  54,  63,
+          76,  81,  35,  36,  45,  52,  59,  67,  79,  83,  36,  36,  46,  54,
+          60,  68,  80,  87,  39,  39,  48,  58,  65,  73,  86,  88,  41,  40,
+          49,  60,  67,  76,  88,  93,  44,  42,  51,  63,  71,  79,  92,  94,
+          47,  44,  53,  66,  75,  84,  97,  101, 48,  45,  54,  67,  76,  85,
+          98,  101, 53,  50,  57,  71,  82,  92,  106, 108, 55,  51,  59,  72,
+          84,  94,  108, 110, 58,  54,  61,  75,  87,  98,  112, 116, 63,  58,
+          65,  78,  91,  103, 118, 119, 65,  59,  66,  79,  92,  105, 120, 124,
+          71,  64,  71,  84,  97,  111, 127, 129, 74,  67,  73,  86,  100, 113,
+          131, 134, 79,  71,  77,  90,  104, 118, 136, 139, 82,  73,  79,  92,
+          105, 120, 139, 142, 82,  74,  79,  92,  106, 121, 139, 150, 87,  78,
+          83,  96,  110, 125, 144, 153, 89,  81,  83,  97,  113, 128, 145, 157,
+          92,  83,  84,  97,  114, 132, 150, 157, 94,  85,  85,  97,  112, 130,
+          151, 163, 97,  88,  86,  97,  111, 128, 147, 163, 99,  91,  87,  97,
+          110, 126, 144, 163},
+         {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64,
+          31, 34, 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61,
+          33, 37, 46, 45, 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60,
+          37, 43, 47, 45, 47, 50, 55, 59, 39, 43, 48, 47, 48, 51, 56, 58,
+          42, 44, 49, 49, 50, 53, 58, 60, 47, 46, 51, 53, 53, 56, 61, 61,
+          49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 56, 57, 60, 64, 64,
+          48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, 61, 64, 68, 67,
+          50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, 71, 71,
+          52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75,
+          54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80,
+          57, 51, 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85,
+          61, 55, 58, 66, 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89,
+          64, 57, 60, 68, 75, 83, 91, 91, 64, 58, 60, 68, 75, 83, 91, 94,
+          66, 59, 61, 69, 77, 84, 93, 95, 67, 60, 61, 69, 78, 85, 93, 97,
+          68, 61, 61, 68, 77, 86, 94, 97, 69, 62, 61, 68, 76, 85, 94, 99,
+          70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, 74, 82, 90, 98}},
+        {{32,  31,  33,  40,  51,  65,  79,  87,  31,  32,  33,  39,  49,  62,
+          75,  83,  31,  32,  33,  39,  49,  61,  74,  82,  31,  32,  33,  38,
+          47,  59,  72,  79,  31,  32,  34,  38,  47,  59,  71,  79,  32,  33,
+          35,  39,  48,  59,  71,  78,  32,  33,  36,  40,  48,  58,  69,  77,
+          32,  33,  36,  41,  48,  58,  69,  75,  33,  34,  38,  44,  52,  62,
+          72,  78,  34,  34,  39,  45,  53,  63,  73,  80,  36,  35,  42,  51,
+          58,  68,  78,  84,  36,  35,  42,  51,  59,  68,  79,  85,  39,  38,
+          44,  54,  63,  73,  84,  89,  40,  39,  45,  56,  65,  75,  85,  90,
+          44,  41,  46,  59,  69,  79,  90,  96,  46,  43,  48,  60,  72,  82,
+          93,  97,  48,  45,  50,  62,  74,  85,  96,  103, 52,  48,  52,  65,
+          78,  90,  101, 105, 53,  49,  53,  66,  79,  92,  103, 111, 58,  53,
+          57,  69,  83,  97,  109, 113, 58,  54,  57,  70,  84,  98,  110, 118,
+          65,  59,  62,  74,  89,  105, 118, 122, 66,  60,  63,  75,  90,  106,
+          119, 126, 71,  65,  67,  79,  94,  111, 125, 131, 74,  67,  69,  81,
+          97,  113, 128, 134, 79,  72,  73,  85,  101, 118, 133, 141, 81,  73,
+          75,  86,  102, 120, 135, 143, 82,  74,  75,  87,  103, 121, 136, 147,
+          86,  78,  78,  90,  106, 124, 140, 147, 88,  80,  80,  90,  105, 122,
+          140, 152, 91,  82,  80,  90,  103, 119, 137, 151, 93,  85,  81,  90,
+          103, 117, 134, 152},
+         {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63,
+          31, 33, 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60,
+          31, 35, 43, 46, 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59,
+          35, 39, 46, 46, 47, 50, 55, 58, 37, 41, 47, 46, 46, 50, 54, 57,
+          41, 43, 48, 49, 49, 52, 57, 59, 42, 43, 48, 49, 50, 53, 57, 60,
+          49, 47, 50, 53, 54, 57, 60, 62, 49, 47, 50, 53, 54, 57, 61, 63,
+          48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, 58, 61, 65, 66,
+          49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, 69, 70,
+          50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74,
+          52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78,
+          54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83,
+          57, 52, 53, 61, 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87,
+          61, 55, 56, 63, 72, 80, 86, 88, 63, 57, 57, 64, 73, 82, 89, 92,
+          64, 58, 58, 65, 73, 82, 89, 92, 64, 58, 58, 65, 74, 83, 90, 94,
+          66, 59, 59, 66, 75, 84, 91, 94, 67, 60, 59, 66, 74, 82, 91, 96,
+          68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, 71, 79, 87, 95}},
+        {{32, 31, 32, 36, 44, 53,  65,  79,  31, 32, 32, 35, 42, 51,  62,  75,
+          31, 32, 32, 35, 42, 51,  62,  75,  31, 32, 33, 34, 41, 49,  59,  72,
+          31, 32, 33, 34, 41, 49,  59,  72,  32, 32, 34, 36, 42, 50,  59,  71,
+          32, 32, 34, 36, 42, 50,  59,  71,  32, 33, 35, 38, 42, 49,  58,  69,
+          32, 33, 35, 38, 42, 49,  58,  69,  34, 34, 37, 42, 48, 54,  63,  73,
+          34, 34, 37, 42, 48, 54,  63,  73,  36, 34, 38, 48, 54, 60,  68,  78,
+          36, 34, 38, 48, 54, 60,  68,  78,  39, 37, 40, 50, 58, 65,  73,  84,
+          39, 37, 40, 50, 58, 65,  73,  84,  44, 41, 43, 53, 63, 71,  79,  90,
+          44, 41, 43, 53, 63, 71,  79,  90,  48, 45, 46, 56, 67, 76,  85,  96,
+          48, 45, 46, 56, 67, 76,  85,  96,  53, 49, 50, 60, 71, 82,  92,  103,
+          53, 49, 50, 60, 71, 82,  92,  103, 58, 54, 54, 63, 75, 87,  98,  110,
+          58, 54, 54, 63, 75, 87,  98,  110, 65, 60, 58, 68, 79, 92,  105, 118,
+          65, 60, 58, 68, 79, 92,  105, 118, 71, 65, 63, 73, 84, 97,  111, 125,
+          71, 65, 63, 73, 84, 97,  111, 125, 79, 72, 70, 79, 90, 104, 118, 133,
+          79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136,
+          82, 75, 72, 81, 92, 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141},
+         {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+          31, 31, 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57,
+          30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+          33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54,
+          37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+          42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60,
+          49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+          48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67,
+          49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+          50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75,
+          52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+          54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82,
+          57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+          60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89,
+          63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90,
+          64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, 69, 77, 84, 92}},
+        {{32, 31, 32, 36, 44, 53,  62,  73,  31, 32, 32, 35, 42, 51,  60,  70,
+          31, 32, 32, 35, 42, 51,  59,  69,  31, 32, 32, 35, 41, 50,  58,  67,
+          31, 32, 33, 34, 41, 49,  57,  66,  31, 32, 33, 35, 41, 49,  57,  66,
+          32, 32, 34, 36, 42, 50,  57,  65,  32, 32, 34, 37, 42, 49,  56,  65,
+          32, 33, 35, 38, 42, 49,  56,  64,  32, 33, 35, 39, 43, 50,  56,  64,
+          34, 34, 37, 42, 48, 54,  61,  69,  34, 34, 37, 42, 48, 54,  61,  69,
+          35, 34, 38, 47, 52, 59,  65,  73,  36, 34, 38, 48, 54, 60,  66,  74,
+          38, 36, 40, 49, 56, 63,  69,  77,  39, 37, 40, 50, 58, 65,  71,  79,
+          41, 39, 41, 51, 60, 67,  74,  81,  44, 41, 43, 53, 63, 71,  78,  85,
+          44, 42, 43, 54, 64, 72,  79,  86,  48, 45, 46, 56, 67, 76,  83,  91,
+          48, 45, 46, 56, 67, 76,  83,  91,  53, 49, 49, 59, 71, 81,  89,  98,
+          53, 49, 50, 60, 71, 82,  90,  99,  57, 52, 52, 62, 74, 85,  94,  103,
+          58, 54, 54, 63, 75, 87,  95,  105, 61, 57, 56, 66, 77, 89,  98,  108,
+          65, 60, 58, 68, 79, 92,  102, 112, 67, 61, 60, 69, 81, 94,  103, 114,
+          71, 65, 63, 73, 84, 97,  108, 119, 72, 66, 64, 73, 85, 98,  108, 119,
+          79, 72, 70, 79, 90, 104, 115, 127, 79, 72, 70, 79, 90, 104, 115, 127},
+         {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58,
+          31, 31, 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56,
+          30, 32, 40, 46, 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54,
+          33, 36, 43, 47, 46, 47, 50, 54, 34, 37, 44, 47, 45, 47, 50, 53,
+          37, 40, 47, 47, 45, 47, 49, 52, 37, 40, 47, 48, 46, 47, 49, 53,
+          42, 43, 47, 50, 49, 50, 53, 56, 42, 43, 47, 50, 49, 50, 53, 56,
+          47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, 53, 54, 56, 59,
+          48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, 59, 62,
+          48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66,
+          49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69,
+          50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72,
+          52, 48, 47, 54, 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75,
+          54, 50, 49, 55, 62, 68, 72, 76, 55, 51, 49, 56, 63, 69, 74, 78,
+          57, 52, 50, 56, 64, 70, 75, 79, 58, 53, 51, 57, 64, 71, 76, 80,
+          60, 54, 52, 58, 65, 72, 77, 82, 60, 55, 53, 59, 65, 73, 78, 83,
+          63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, 67, 75, 80, 86}},
+        {{32, 31, 32, 35, 39, 44, 53, 65,  31, 32, 32, 35, 38, 42, 52, 63,
+          31, 32, 32, 35, 38, 42, 51, 62,  31, 32, 32, 34, 37, 41, 50, 61,
+          31, 32, 33, 34, 37, 41, 49, 59,  31, 32, 33, 34, 37, 41, 49, 59,
+          31, 32, 34, 35, 38, 42, 49, 59,  32, 32, 34, 36, 38, 42, 50, 59,
+          32, 32, 34, 36, 39, 42, 49, 58,  32, 33, 35, 37, 40, 42, 49, 58,
+          32, 33, 35, 37, 40, 42, 49, 58,  33, 33, 36, 40, 43, 46, 53, 62,
+          34, 34, 37, 41, 44, 48, 54, 63,  34, 34, 37, 43, 46, 50, 56, 65,
+          36, 34, 38, 46, 50, 54, 60, 68,  36, 34, 38, 46, 50, 54, 60, 68,
+          38, 37, 40, 47, 52, 57, 64, 72,  39, 37, 40, 48, 53, 58, 65, 73,
+          41, 39, 41, 49, 54, 60, 67, 76,  44, 41, 43, 51, 57, 63, 71, 79,
+          44, 41, 43, 51, 57, 63, 71, 79,  47, 44, 45, 53, 59, 66, 75, 84,
+          48, 45, 46, 54, 60, 67, 76, 85,  50, 46, 47, 55, 61, 68, 78, 88,
+          53, 49, 50, 57, 64, 71, 82, 92,  53, 49, 50, 57, 64, 71, 82, 92,
+          57, 53, 53, 60, 67, 74, 86, 97,  58, 54, 54, 61, 68, 75, 87, 98,
+          61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105,
+          65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, 70, 76, 83, 96, 109},
+         {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55,
+          31, 31, 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53,
+          30, 32, 40, 44, 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52,
+          33, 35, 42, 46, 46, 45, 47, 51, 33, 36, 43, 46, 46, 46, 47, 51,
+          35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+          37, 40, 47, 47, 47, 45, 47, 50, 41, 42, 47, 49, 49, 48, 50, 52,
+          42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, 51, 51, 52, 54,
+          49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, 54, 57,
+          48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60,
+          48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64,
+          49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66,
+          50, 46, 46, 52, 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68,
+          52, 48, 47, 53, 57, 61, 66, 71, 52, 48, 47, 53, 57, 61, 66, 71,
+          54, 49, 48, 54, 58, 62, 68, 73, 54, 50, 49, 54, 58, 62, 68, 73,
+          55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76,
+          57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, 61, 65, 72, 78}},
+        {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52,
+          31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51,
+          31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49,
+          31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 33, 34, 36, 42, 45, 49,
+          32, 32, 34, 34, 36, 42, 45, 50, 32, 32, 34, 35, 37, 42, 45, 49,
+          32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49,
+          32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+          34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, 53, 57,
+          36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60,
+          38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65,
+          39, 37, 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69,
+          44, 41, 42, 45, 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72,
+          47, 44, 45, 47, 56, 66, 69, 75, 48, 45, 46, 48, 56, 67, 70, 76,
+          49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 49, 51, 59, 71, 74, 81,
+          53, 49, 50, 51, 60, 71, 75, 82, 55, 51, 51, 53, 61, 72, 76, 83,
+          58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, 63, 75, 79, 87},
+         {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50,
+          31, 31, 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50,
+          30, 32, 38, 40, 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48,
+          31, 33, 38, 41, 46, 45, 46, 48, 33, 35, 41, 43, 47, 45, 46, 47,
+          33, 36, 41, 44, 47, 46, 46, 47, 34, 37, 42, 45, 47, 45, 46, 47,
+          37, 40, 45, 47, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+          39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+          42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, 52, 52,
+          49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54,
+          48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57,
+          48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60,
+          49, 45, 45, 47, 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61,
+          50, 46, 46, 48, 54, 59, 61, 63, 50, 46, 46, 48, 54, 59, 61, 64,
+          51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66,
+          52, 48, 47, 48, 54, 61, 63, 66, 53, 48, 48, 49, 54, 61, 63, 67,
+          54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, 55, 62, 65, 68}},
+        {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46,
+          31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45,
+          31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+          31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 35, 41, 44,
+          31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+          32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 35, 37, 37, 42, 45,
+          32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45,
+          32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50,
+          34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51,
+          35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56,
+          36, 34, 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58,
+          39, 37, 39, 40, 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60,
+          41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+          44, 41, 42, 43, 51, 53, 63, 66, 44, 42, 42, 43, 51, 54, 64, 67,
+          47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70,
+          48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, 56, 58, 69, 73},
+         {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49,
+          31, 31, 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48,
+          31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+          30, 32, 35, 40, 44, 46, 45, 46, 31, 33, 35, 40, 45, 46, 45, 46,
+          33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+          33, 36, 38, 43, 46, 47, 46, 46, 35, 38, 41, 45, 47, 47, 45, 46,
+          37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46,
+          39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50,
+          42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50,
+          47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54,
+          49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55,
+          48, 46, 46, 47, 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56,
+          48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+          49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 52, 53, 58, 60,
+          50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61,
+          50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, 52, 54, 60, 62}},
+        {{32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43,
+          31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42,
+          31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41,
+          31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41,
+          31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 35, 35, 41,
+          32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42,
+          32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 37, 37, 42,
+          32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+          32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45,
+          34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48,
+          34, 34, 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50,
+          36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54,
+          36, 34, 34, 38, 38, 48, 48, 54, 37, 36, 36, 39, 39, 49, 49, 56,
+          39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58,
+          39, 37, 37, 40, 40, 50, 50, 58, 41, 39, 39, 42, 42, 52, 52, 60,
+          44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, 43, 53, 53, 63},
+         {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48,
+          31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47,
+          31, 31, 31, 38, 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46,
+          30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45,
+          30, 32, 32, 40, 40, 46, 46, 45, 32, 34, 34, 41, 41, 46, 46, 45,
+          33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46,
+          33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, 45, 47, 47, 45,
+          37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45,
+          37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47,
+          42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49,
+          42, 43, 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51,
+          49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53,
+          49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 54,
+          48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56,
+          48, 46, 46, 47, 47, 53, 53, 56, 48, 45, 45, 46, 46, 53, 53, 57,
+          49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, 46, 53, 53, 58}},
+        {{32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35,
+          31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+          31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+          31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+          31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+          31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 33, 33, 35, 35,
+          31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+          32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+          32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38,
+          32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38,
+          32, 32, 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40,
+          33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42,
+          34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42,
+          34, 34, 34, 35, 37, 37, 43, 44, 35, 34, 34, 36, 38, 38, 45, 47,
+          36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48,
+          36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, 39, 39, 46, 49},
+         {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48,
+          31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+          31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+          31, 31, 32, 34, 39, 39, 45, 46, 30, 31, 32, 34, 39, 39, 44, 46,
+          30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+          30, 32, 32, 35, 40, 40, 44, 46, 31, 33, 33, 36, 41, 41, 45, 46,
+          33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+          33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+          35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47,
+          37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+          37, 39, 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48,
+          41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50,
+          42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50,
+          44, 44, 44, 45, 47, 47, 50, 51, 47, 46, 46, 46, 48, 48, 51, 52,
+          49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53,
+          49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 47, 47, 52, 53}},
+        {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33,
+          31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+          31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+          31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, 33, 34,
+          31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+          32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+          32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35,
+          32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36,
+          32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+          32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+          32, 33, 33, 33, 34, 36, 36, 36, 33, 33, 33, 33, 34, 36, 36, 37,
+          34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, 35, 37, 37, 38},
+         {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39,
+          31, 31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40,
+          31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+          31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+          31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 34, 39, 39, 40,
+          30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+          30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+          31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, 41, 42,
+          33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+          33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44,
+          33, 35, 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45,
+          35, 37, 38, 38, 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47,
+          37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+          37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+          39, 40, 41, 41, 43, 47, 47, 47, 40, 41, 42, 42, 44, 47, 47, 47,
+          42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48}},
+        {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+          31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33,
+          31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+          32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+          32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+          32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+         {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35,
+          31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36,
+          31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37,
+          30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+          30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+          30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+          30, 31, 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38,
+          31, 32, 33, 33, 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40,
+          33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+          33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+          33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+          33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, 36, 36, 39, 42}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+          31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+          31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+          31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32,
+          30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32,
+          30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix16x32
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][512] = {
+        {{32,  31,  32,  34,  36,  44,  53,  59,  65,  79,  87,  90,  93,  96,
+          99,  102, 31,  32,  32,  34,  35,  42,  51,  56,  62,  75,  82,  85,
+          88,  91,  94,  97,  31,  32,  33,  33,  34,  41,  49,  54,  59,  72,
+          78,  82,  86,  90,  93,  97,  31,  32,  33,  34,  35,  41,  49,  54,
+          59,  71,  78,  81,  84,  87,  90,  93,  32,  32,  34,  35,  36,  42,
+          50,  54,  59,  71,  77,  80,  82,  86,  89,  93,  32,  33,  35,  37,
+          38,  42,  49,  53,  58,  69,  75,  78,  82,  86,  89,  92,  34,  34,
+          37,  39,  42,  48,  54,  58,  63,  73,  79,  78,  80,  83,  88,  92,
+          35,  34,  37,  41,  45,  50,  57,  61,  65,  76,  82,  83,  84,  84,
+          87,  90,  36,  34,  38,  43,  48,  54,  60,  64,  68,  78,  84,  87,
+          86,  89,  90,  90,  39,  37,  40,  45,  50,  58,  65,  69,  73,  84,
+          89,  89,  91,  91,  93,  96,  44,  41,  43,  48,  53,  63,  71,  75,
+          79,  90,  95,  93,  94,  95,  97,  97,  46,  43,  44,  49,  55,  65,
+          73,  78,  82,  93,  98,  100, 98,  100, 99,  103, 48,  45,  46,  51,
+          56,  67,  76,  80,  85,  96,  102, 102, 105, 102, 105, 104, 53,  49,
+          50,  54,  60,  71,  82,  87,  92,  103, 109, 107, 107, 110, 107, 111,
+          58,  54,  54,  58,  63,  75,  87,  92,  98,  110, 116, 115, 112, 111,
+          115, 112, 61,  57,  56,  60,  66,  77,  89,  95,  101, 114, 120, 118,
+          119, 118, 116, 120, 65,  60,  58,  63,  68,  79,  92,  98,  105, 118,
+          124, 123, 122, 123, 124, 121, 71,  65,  63,  68,  73,  84,  97,  103,
+          111, 125, 132, 132, 130, 128, 127, 130, 79,  72,  70,  74,  79,  90,
+          104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81,  74,  71,  75,
+          80,  91,  105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82,  75,
+          72,  76,  81,  92,  106, 113, 121, 136, 144, 151, 149, 149, 146, 143,
+          88,  80,  77,  80,  85,  97,  108, 115, 126, 142, 149, 153, 153, 152,
+          152, 154, 91,  83,  80,  81,  88,  100, 106, 114, 130, 142, 148, 155,
+          162, 160, 159, 155, 94,  85,  83,  82,  91,  100, 105, 118, 131, 137,
+          153, 160, 165, 167, 166, 168, 97,  88,  86,  85,  94,  100, 107, 123,
+          128, 140, 157, 161, 167, 173, 171, 169, 100, 91,  89,  87,  97,  100,
+          111, 121, 127, 145, 152, 164, 173, 178, 182, 181, 103, 94,  93,  90,
+          98,  101, 114, 120, 131, 144, 150, 170, 174, 180, 186, 183, 107, 97,
+          96,  93,  100, 104, 117, 119, 136, 142, 155, 168, 177, 187, 191, 198,
+          110, 101, 100, 97,  101, 108, 117, 123, 138, 141, 161, 165, 183, 188,
+          193, 200, 114, 104, 104, 100, 103, 112, 117, 127, 137, 146, 159, 167,
+          185, 190, 201, 206, 118, 108, 107, 103, 105, 115, 118, 131, 136, 151,
+          157, 172, 182, 197, 203, 208, 122, 111, 111, 107, 107, 119, 119, 136,
+          136, 156, 156, 178, 179, 203, 204, 217},
+         {32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67,  68,  69,  71,  72,
+          31, 31, 38, 42, 47, 47, 50, 52, 54, 60, 63, 64,  65,  66,  67,  68,
+          30, 32, 40, 42, 46, 45, 48, 50, 52, 57, 60, 62,  63,  65,  66,  68,
+          32, 34, 41, 44, 46, 45, 48, 49, 51, 57, 59, 61,  62,  63,  64,  65,
+          33, 36, 43, 45, 47, 46, 47, 49, 51, 56, 59, 60,  60,  62,  63,  65,
+          37, 40, 47, 47, 47, 45, 47, 48, 50, 54, 57, 58,  60,  61,  62,  63,
+          42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58,  59,  60,  62,  63,
+          45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61,  61,  60,  61,  61,
+          49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63,  62,  63,  62,  62,
+          48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65,  65,  64,  64,  65,
+          49, 45, 46, 49, 53, 58, 61, 62, 64, 67, 69, 67,  66,  66,  66,  65,
+          49, 46, 46, 49, 53, 59, 62, 64, 65, 69, 71, 70,  68,  68,  67,  68,
+          50, 46, 46, 50, 54, 59, 64, 65, 67, 71, 73, 72,  72,  70,  70,  69,
+          52, 48, 47, 50, 54, 61, 66, 68, 71, 75, 77, 74,  73,  73,  71,  72,
+          54, 50, 49, 52, 55, 62, 68, 71, 73, 78, 80, 78,  76,  74,  75,  73,
+          55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80,  79,  78,  76,  77,
+          57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82,  80,  80,  79,  77,
+          60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86,  84,  82,  81,  81,
+          63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88,  87,  85,  84,  81,
+          64, 58, 55, 58, 61, 68, 75, 78, 82, 89, 92, 90,  89,  87,  86,  86,
+          64, 59, 56, 58, 61, 68, 75, 79, 83, 90, 93, 95,  93,  91,  89,  87,
+          67, 61, 58, 60, 63, 69, 76, 79, 85, 92, 95, 96,  94,  92,  91,  91,
+          68, 62, 59, 60, 64, 71, 74, 78, 86, 91, 94, 96,  98,  96,  94,  91,
+          69, 62, 60, 60, 65, 70, 72, 79, 85, 88, 95, 98,  99,  98,  97,  96,
+          70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97,  99,  101, 98,  97,
+          71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98,  102, 102, 102, 101,
+          72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, 103, 104, 102,
+          73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98,  102, 105, 106, 107,
+          74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96,  103, 105, 106, 107,
+          75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96,  103, 105, 109, 109,
+          76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97,  101, 107, 109, 110,
+          77, 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99,  99,  108, 108, 113}},
+        {{32,  31,  32,  32,  36,  44,  47,  53,  65,  73,  79,  87,  90,  93,
+          96,  99,  31,  32,  32,  33,  35,  42,  45,  51,  62,  69,  75,  83,
+          86,  88,  91,  94,  31,  32,  32,  33,  35,  41,  44,  49,  60,  67,
+          72,  80,  84,  87,  90,  94,  31,  32,  33,  33,  35,  41,  44,  49,
+          59,  66,  71,  79,  82,  84,  87,  90,  32,  32,  34,  34,  36,  42,
+          45,  50,  59,  65,  71,  78,  80,  83,  87,  90,  32,  33,  35,  36,
+          38,  42,  45,  49,  58,  64,  69,  76,  80,  83,  86,  88,  32,  33,
+          35,  36,  40,  44,  47,  51,  60,  66,  71,  76,  78,  81,  85,  89,
+          34,  34,  36,  38,  42,  48,  50,  54,  63,  69,  73,  80,  82,  81,
+          84,  86,  36,  34,  37,  40,  48,  54,  56,  60,  68,  74,  78,  84,
+          83,  86,  87,  87,  38,  36,  39,  41,  49,  56,  58,  63,  71,  77,
+          81,  86,  88,  88,  90,  93,  39,  37,  40,  42,  50,  58,  60,  65,
+          73,  79,  84,  90,  91,  92,  94,  93,  44,  41,  42,  45,  53,  63,
+          66,  71,  79,  85,  90,  96,  94,  96,  96,  99,  47,  44,  45,  47,
+          56,  66,  69,  75,  84,  90,  95,  99,  101, 98,  101, 99,  49,  46,
+          47,  48,  57,  67,  71,  77,  86,  93,  97,  103, 103, 105, 102, 106,
+          53,  49,  50,  51,  60,  71,  75,  82,  92,  99,  103, 111, 108, 107,
+          110, 107, 58,  54,  54,  55,  63,  75,  79,  87,  98,  105, 110, 114,
+          114, 113, 111, 115, 61,  56,  56,  57,  65,  77,  81,  89,  100, 107,
+          113, 118, 116, 117, 118, 116, 65,  60,  59,  60,  68,  79,  84,  92,
+          105, 112, 118, 126, 124, 122, 121, 124, 71,  65,  64,  65,  73,  84,
+          89,  97,  111, 119, 125, 130, 129, 129, 129, 125, 76,  69,  68,  69,
+          76,  88,  92,  101, 115, 123, 130, 134, 134, 131, 132, 135, 79,  72,
+          70,  71,  79,  90,  95,  104, 118, 127, 133, 143, 142, 141, 138, 136,
+          82,  75,  73,  74,  81,  92,  97,  106, 121, 130, 136, 146, 145, 144,
+          144, 145, 86,  78,  76,  77,  84,  95,  100, 109, 124, 133, 140, 147,
+          153, 151, 150, 146, 89,  81,  79,  78,  87,  95,  99,  112, 124, 130,
+          145, 152, 156, 157, 156, 158, 92,  84,  82,  80,  89,  95,  101, 116,
+          121, 132, 148, 151, 157, 163, 161, 159, 95,  86,  85,  83,  92,  95,
+          105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98,  89,  88,  85,
+          93,  95,  108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92,
+          91,  88,  94,  98,  110, 112, 128, 133, 146, 158, 166, 175, 179, 185,
+          104, 95,  94,  91,  95,  101, 110, 115, 129, 132, 151, 154, 171, 175,
+          181, 186, 107, 98,  97,  94,  96,  105, 110, 119, 128, 136, 149, 156,
+          173, 177, 188, 192, 110, 101, 100, 97,  98,  108, 111, 123, 127, 141,
+          147, 161, 169, 183, 188, 193, 114, 104, 104, 100, 100, 111, 111, 126,
+          127, 145, 145, 166, 166, 189, 190, 201},
+         {32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68,  69,  70,  71,
+          31, 31, 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64,  65,  66,  67,
+          30, 32, 38, 40, 46, 45, 46, 48, 52, 55, 58, 61, 63,  64,  65,  67,
+          31, 33, 38, 41, 46, 45, 46, 48, 52, 55, 57, 60, 61,  62,  63,  64,
+          33, 36, 41, 44, 47, 46, 46, 47, 51, 54, 56, 59, 60,  61,  63,  64,
+          37, 40, 45, 47, 47, 45, 46, 47, 50, 52, 54, 57, 59,  61,  62,  62,
+          39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, 58,  59,  61,  62,
+          42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60,  59,  60,  60,
+          49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61,  62,  61,  61,
+          48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64,  63,  63,  64,
+          48, 46, 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65,  65,  65,  64,
+          49, 45, 45, 47, 53, 58, 59, 61, 64, 66, 67, 69, 67,  67,  66,  67,
+          50, 46, 46, 48, 54, 59, 61, 63, 66, 68, 70, 71, 71,  68,  69,  67,
+          51, 47, 47, 48, 54, 60, 61, 64, 68, 70, 71, 73, 72,  72,  70,  71,
+          52, 48, 47, 48, 54, 61, 63, 66, 71, 73, 75, 77, 75,  73,  74,  71,
+          54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, 78,  76,  74,  75,
+          55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79,  78,  78,  75,
+          57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83,  81,  79,  79,
+          60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85,  84,  82,  80,
+          62, 56, 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87,  85,  84,  84,
+          63, 57, 55, 56, 60, 67, 70, 75, 82, 86, 89, 92, 91,  89,  87,  84,
+          64, 59, 56, 57, 61, 68, 71, 75, 83, 87, 90, 93, 92,  90,  89,  89,
+          66, 60, 58, 58, 62, 69, 72, 76, 84, 88, 91, 94, 95,  93,  91,  89,
+          67, 61, 59, 58, 63, 68, 71, 78, 83, 86, 93, 96, 96,  96,  94,  94,
+          68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, 97,  98,  96,  94,
+          69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99,  99,  100, 98,
+          70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98,  100, 101, 99,
+          71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99,  102, 103, 104,
+          72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104,
+          73, 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106,
+          74, 67, 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98,  103, 105, 106,
+          75, 68, 68, 65, 65, 71, 71, 78, 78, 87, 87, 96, 96,  105, 105, 109}},
+        {{32,  31,  32,  32,  36,  39,  44,  53,  58,  65,  79,  81,  88,  90,
+          93,  96,  31,  32,  32,  32,  35,  38,  42,  51,  55,  62,  75,  77,
+          83,  86,  88,  91,  31,  32,  32,  32,  35,  38,  41,  50,  54,  60,
+          73,  75,  81,  84,  88,  91,  31,  32,  32,  33,  34,  37,  41,  49,
+          53,  59,  72,  74,  79,  82,  84,  87,  32,  32,  33,  34,  36,  39,
+          42,  50,  53,  59,  71,  72,  78,  81,  84,  87,  32,  32,  34,  34,
+          37,  40,  42,  49,  53,  58,  70,  71,  77,  80,  83,  85,  32,  33,
+          34,  35,  38,  40,  42,  49,  52,  58,  69,  70,  76,  78,  82,  86,
+          34,  34,  35,  37,  42,  45,  48,  54,  57,  63,  73,  75,  79,  79,
+          81,  83,  34,  34,  36,  37,  44,  47,  50,  56,  59,  65,  75,  77,
+          81,  83,  84,  84,  36,  34,  37,  38,  48,  51,  54,  60,  63,  68,
+          78,  80,  85,  85,  86,  89,  39,  37,  39,  40,  50,  54,  58,  65,
+          68,  73,  84,  85,  88,  89,  90,  89,  40,  38,  40,  41,  51,  55,
+          59,  67,  70,  75,  85,  87,  91,  92,  92,  95,  44,  41,  42,  43,
+          53,  58,  63,  71,  74,  79,  90,  91,  97,  94,  97,  95,  47,  44,
+          45,  46,  56,  61,  66,  75,  79,  85,  95,  97,  99,  101, 98,  102,
+          49,  46,  46,  47,  57,  62,  67,  77,  81,  86,  97,  99,  104, 102,
+          105, 102, 53,  49,  50,  50,  60,  65,  71,  82,  86,  92,  103, 105,
+          109, 108, 106, 110, 57,  53,  53,  53,  63,  68,  74,  86,  90,  97,
+          108, 110, 111, 112, 113, 110, 59,  54,  54,  54,  64,  69,  75,  87,
+          91,  98,  111, 112, 119, 117, 115, 118, 65,  60,  59,  58,  68,  73,
+          79,  92,  97,  105, 118, 119, 123, 123, 122, 119, 69,  63,  62,  62,
+          71,  76,  83,  96,  100, 109, 122, 124, 127, 125, 125, 128, 71,  65,
+          64,  63,  73,  78,  84,  97,  102, 111, 125, 127, 135, 134, 131, 129,
+          79,  72,  71,  70,  79,  84,  90,  104, 109, 118, 133, 135, 137, 136,
+          136, 137, 81,  74,  72,  71,  80,  85,  91,  105, 110, 120, 135, 137,
+          145, 143, 141, 138, 82,  75,  73,  72,  81,  86,  92,  106, 111, 121,
+          136, 139, 147, 148, 147, 149, 87,  79,  77,  76,  85,  90,  96,  110,
+          114, 125, 140, 143, 148, 154, 151, 149, 90,  82,  80,  78,  87,  89,
+          99,  108, 113, 129, 135, 146, 153, 157, 160, 159, 92,  84,  83,  81,
+          88,  90,  102, 106, 117, 128, 133, 150, 153, 158, 163, 160, 95,  87,
+          85,  83,  88,  92,  103, 105, 120, 125, 137, 148, 155, 164, 168, 173,
+          98,  89,  88,  85,  89,  95,  103, 108, 121, 124, 141, 144, 160, 164,
+          169, 174, 100, 92,  91,  88,  90,  98,  103, 111, 120, 127, 139, 146,
+          161, 165, 175, 179, 103, 94,  94,  90,  92,  101, 103, 114, 119, 131,
+          137, 150, 158, 170, 175, 180, 106, 97,  97,  93,  93,  104, 104, 118,
+          118, 135, 135, 154, 155, 175, 176, 187},
+         {32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68,  69,  69,
+          31, 31, 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64,  65,  66,
+          31, 32, 36, 39, 46, 46, 46, 48, 50, 53, 58, 59, 62, 63,  65,  66,
+          30, 32, 36, 40, 46, 45, 45, 48, 49, 52, 57, 58, 60, 61,  62,  63,
+          33, 36, 40, 43, 47, 46, 46, 47, 49, 51, 56, 57, 59, 60,  62,  63,
+          35, 38, 42, 45, 47, 46, 45, 47, 48, 50, 55, 56, 58, 60,  61,  61,
+          37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, 57, 58,  60,  61,
+          42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58,  59,  59,
+          44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61,  61,  60,
+          49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62,  62,  63,
+          48, 46, 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64,  64,  63,
+          48, 45, 46, 46, 53, 55, 56, 58, 59, 61, 65, 65, 66, 66,  65,  66,
+          49, 45, 45, 46, 53, 56, 58, 61, 62, 64, 67, 68, 70, 67,  68,  66,
+          50, 46, 46, 46, 54, 56, 59, 63, 65, 66, 70, 71, 70, 71,  68,  70,
+          51, 47, 47, 47, 54, 57, 60, 64, 65, 68, 71, 72, 73, 71,  72,  70,
+          52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, 76, 75,  73,  73,
+          54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77,  76,  74,
+          54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79,  77,  78,
+          57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82,  81,  78,
+          59, 54, 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83,  82,  82,
+          60, 54, 53, 52, 58, 62, 65, 72, 75, 79, 85, 86, 89, 87,  85,  82,
+          63, 57, 56, 55, 60, 64, 67, 75, 77, 82, 89, 90, 90, 88,  87,  86,
+          64, 58, 57, 55, 61, 64, 68, 75, 78, 82, 89, 90, 93, 91,  89,  87,
+          64, 59, 57, 56, 61, 65, 68, 75, 78, 83, 90, 91, 94, 93,  92,  91,
+          66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, 94, 95,  93,  91,
+          67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97,  97,  95,
+          68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97,  98,  96,
+          69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99,  100, 101,
+          70, 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99,  100, 101,
+          71, 65, 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99,  102, 103,
+          72, 65, 65, 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103,
+          73, 66, 66, 63, 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105}},
+        {{32,  31,  31,  32,  35,  36,  44,  47,  53,  62,  65,  79,  82,  88,
+          90,  93,  31,  32,  32,  32,  35,  35,  42,  45,  51,  59,  62,  75,
+          78,  83,  86,  88,  31,  32,  32,  32,  34,  35,  41,  45,  50,  58,
+          61,  74,  76,  82,  85,  88,  31,  32,  32,  33,  34,  34,  41,  44,
+          49,  57,  59,  72,  74,  79,  82,  84,  31,  32,  33,  34,  35,  36,
+          42,  44,  49,  57,  59,  71,  73,  79,  81,  84,  32,  32,  33,  34,
+          36,  36,  42,  45,  50,  57,  59,  71,  73,  78,  80,  82,  32,  33,
+          34,  35,  37,  38,  42,  45,  49,  56,  58,  69,  71,  76,  79,  83,
+          32,  33,  34,  36,  39,  40,  44,  47,  51,  58,  60,  71,  73,  76,
+          78,  80,  34,  34,  35,  37,  41,  42,  48,  50,  54,  61,  63,  73,
+          76,  81,  81,  80,  35,  34,  36,  38,  45,  47,  52,  55,  59,  65,
+          67,  77,  79,  82,  83,  86,  36,  34,  36,  38,  46,  48,  54,  56,
+          60,  66,  68,  78,  80,  85,  87,  86,  39,  37,  39,  40,  48,  50,
+          58,  60,  65,  71,  73,  84,  86,  89,  88,  91,  41,  39,  40,  41,
+          49,  51,  60,  62,  67,  74,  76,  86,  88,  91,  93,  91,  44,  41,
+          42,  43,  51,  53,  63,  66,  71,  78,  79,  90,  92,  97,  94,  97,
+          47,  44,  44,  45,  53,  56,  66,  69,  75,  82,  84,  95,  97,  98,
+          101, 98,  48,  45,  45,  46,  54,  56,  67,  70,  76,  83,  85,  96,
+          98,  104, 101, 105, 53,  49,  50,  50,  57,  60,  71,  75,  82,  90,
+          92,  103, 106, 107, 108, 105, 55,  51,  51,  51,  59,  61,  72,  77,
+          84,  92,  94,  106, 108, 111, 110, 112, 58,  54,  54,  54,  61,  63,
+          75,  79,  87,  95,  98,  110, 112, 117, 116, 113, 63,  58,  58,  57,
+          65,  67,  78,  83,  91,  100, 103, 116, 118, 119, 119, 121, 65,  60,
+          59,  58,  66,  68,  79,  84,  92,  102, 105, 118, 120, 127, 124, 122,
+          71,  65,  64,  63,  71,  73,  84,  89,  97,  108, 111, 125, 127, 129,
+          129, 130, 74,  68,  67,  66,  73,  75,  86,  91,  100, 110, 113, 128,
+          131, 135, 134, 130, 79,  72,  71,  70,  77,  79,  90,  95,  104, 115,
+          118, 133, 136, 140, 139, 140, 82,  75,  73,  72,  79,  81,  92,  97,
+          105, 117, 120, 136, 139, 145, 142, 140, 82,  75,  74,  72,  79,  81,
+          92,  97,  106, 117, 121, 136, 139, 148, 150, 149, 87,  79,  78,  76,
+          83,  85,  96,  100, 110, 120, 125, 141, 144, 148, 153, 150, 89,  82,
+          81,  78,  83,  87,  97,  99,  113, 118, 128, 139, 145, 153, 157, 161,
+          92,  84,  83,  80,  84,  89,  97,  101, 114, 116, 132, 135, 150, 153,
+          157, 162, 94,  86,  85,  82,  85,  92,  97,  104, 112, 119, 130, 136,
+          151, 154, 163, 166, 97,  88,  88,  85,  86,  94,  97,  107, 111, 123,
+          128, 140, 147, 159, 163, 167, 99,  91,  91,  87,  87,  97,  97,  110,
+          110, 126, 126, 144, 144, 163, 163, 173},
+         {32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31,
+          31, 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32,
+          34, 39, 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35,
+          40, 44, 46, 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42,
+          46, 47, 45, 46, 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46,
+          47, 46, 46, 47, 50, 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47,
+          45, 46, 47, 49, 50, 54, 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47,
+          47, 48, 50, 51, 55, 56, 57, 58, 59, 42, 43, 44, 47, 49, 50, 49, 50,
+          50, 53, 53, 57, 58, 60, 60, 59, 47, 46, 46, 48, 51, 52, 53, 53, 53,
+          55, 56, 60, 61, 61, 61, 62, 49, 46, 47, 48, 52, 53, 53, 54, 54, 56,
+          57, 60, 61, 63, 63, 62, 48, 46, 46, 47, 51, 53, 56, 56, 57, 59, 60,
+          64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, 57, 57, 59, 61, 61, 65,
+          66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, 61, 63, 64, 67, 68,
+          70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, 66, 70, 71, 70,
+          71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, 71, 73, 71,
+          72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, 74, 72,
+          53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, 54,
+          50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51,
+          51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51,
+          50, 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52,
+          57, 58, 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58,
+          59, 66, 69, 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60,
+          67, 70, 75, 80, 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68,
+          71, 75, 81, 83, 90, 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71,
+          75, 81, 83, 90, 91, 94, 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77,
+          82, 84, 92, 93, 94, 95, 93, 67, 61, 60, 58, 61, 63, 69, 70, 78, 80,
+          85, 90, 93, 96, 97, 97, 68, 62, 61, 59, 61, 64, 68, 71, 77, 79, 86,
+          88, 94, 96, 97, 98, 69, 63, 62, 59, 61, 65, 68, 72, 76, 80, 85, 88,
+          94, 95, 99, 99, 70, 63, 63, 60, 61, 66, 67, 73, 75, 81, 83, 89, 92,
+          97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, 74, 82, 82, 90, 90, 98,
+          98, 102}},
+        {{32,  31,  31,  32,  33,  36,  40,  44,  51,  53,  65,  66,  79,  81,
+          87,  90,  31,  32,  32,  32,  33,  35,  39,  42,  49,  51,  62,  63,
+          75,  77,  83,  85,  31,  32,  32,  32,  33,  35,  39,  42,  49,  51,
+          61,  62,  74,  76,  82,  85,  31,  32,  32,  33,  33,  34,  38,  41,
+          47,  49,  59,  60,  72,  74,  79,  81,  31,  32,  32,  33,  34,  35,
+          38,  41,  47,  49,  59,  60,  71,  73,  79,  81,  32,  32,  33,  34,
+          35,  36,  39,  42,  48,  50,  59,  60,  71,  72,  78,  80,  32,  32,
+          33,  35,  36,  37,  40,  42,  48,  49,  58,  59,  69,  71,  77,  80,
+          32,  33,  33,  35,  36,  38,  41,  42,  48,  49,  58,  59,  69,  70,
+          75,  77,  33,  33,  34,  36,  38,  41,  44,  46,  52,  53,  62,  63,
+          72,  74,  78,  78,  34,  34,  34,  37,  39,  42,  45,  48,  53,  54,
+          63,  64,  73,  75,  80,  83,  36,  34,  35,  38,  42,  48,  51,  54,
+          58,  60,  68,  69,  78,  80,  84,  83,  36,  35,  35,  38,  42,  48,
+          51,  54,  59,  60,  68,  69,  79,  80,  85,  87,  39,  37,  38,  40,
+          44,  50,  54,  58,  63,  65,  73,  74,  84,  85,  89,  88,  40,  38,
+          39,  41,  45,  51,  56,  59,  65,  67,  75,  76,  85,  87,  90,  93,
+          44,  41,  41,  43,  46,  53,  59,  63,  69,  71,  79,  80,  90,  91,
+          96,  93,  46,  43,  43,  44,  48,  55,  60,  65,  72,  73,  82,  83,
+          93,  94,  97,  100, 48,  45,  45,  46,  50,  56,  62,  67,  74,  76,
+          85,  86,  96,  98,  103, 100, 52,  48,  48,  49,  52,  59,  65,  70,
+          78,  80,  90,  91,  101, 103, 105, 107, 53,  49,  49,  50,  53,  60,
+          66,  71,  79,  82,  92,  93,  103, 105, 111, 107, 58,  53,  53,  53,
+          57,  63,  69,  74,  83,  86,  97,  98,  109, 111, 113, 115, 58,  54,
+          54,  54,  57,  63,  70,  75,  84,  87,  98,  99,  110, 112, 118, 115,
+          65,  60,  59,  58,  62,  68,  74,  79,  89,  92,  105, 106, 118, 119,
+          122, 123, 66,  61,  60,  59,  63,  69,  75,  80,  90,  93,  106, 107,
+          119, 121, 126, 123, 71,  65,  65,  63,  67,  73,  79,  84,  94,  97,
+          111, 112, 125, 127, 131, 132, 74,  68,  67,  66,  69,  75,  81,  86,
+          97,  100, 113, 115, 128, 130, 134, 132, 79,  72,  72,  70,  73,  79,
+          85,  90,  101, 104, 118, 119, 133, 135, 141, 140, 81,  74,  73,  71,
+          75,  80,  86,  91,  102, 105, 120, 121, 135, 137, 143, 140, 82,  75,
+          74,  72,  75,  81,  87,  92,  103, 106, 121, 122, 136, 139, 147, 151,
+          86,  78,  78,  75,  78,  84,  90,  95,  106, 109, 124, 125, 140, 142,
+          147, 151, 88,  81,  80,  77,  80,  86,  90,  98,  105, 112, 122, 127,
+          140, 144, 152, 155, 91,  83,  82,  79,  80,  88,  90,  100, 103, 114,
+          119, 130, 137, 148, 151, 155, 93,  85,  85,  81,  81,  90,  90,  102,
+          103, 117, 117, 134, 134, 151, 152, 160},
+         {32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31,
+          31, 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31,
+          33, 38, 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33,
+          40, 42, 46, 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41,
+          43, 46, 46, 45, 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44,
+          47, 46, 46, 47, 47, 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47,
+          46, 45, 47, 47, 50, 51, 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46,
+          45, 46, 47, 50, 50, 54, 55, 57, 58, 41, 42, 43, 47, 48, 49, 49, 48,
+          49, 50, 52, 53, 57, 57, 59, 58, 42, 43, 43, 47, 48, 50, 49, 49, 50,
+          50, 53, 54, 57, 58, 60, 61, 49, 46, 47, 48, 50, 53, 53, 53, 54, 54,
+          57, 57, 60, 61, 62, 61, 49, 46, 47, 48, 50, 53, 53, 54, 54, 55, 57,
+          57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, 54, 56, 57, 57, 60, 60,
+          64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, 58, 58, 61, 61, 65,
+          65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, 64, 64, 67, 68,
+          69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, 69, 69, 70,
+          70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, 73, 71,
+          51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, 52,
+          48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50,
+          49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50,
+          49, 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50,
+          52, 56, 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53,
+          57, 61, 64, 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58,
+          62, 65, 71, 72, 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63,
+          66, 72, 73, 80, 81, 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67,
+          73, 75, 82, 82, 89, 90, 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73,
+          75, 82, 83, 89, 90, 92, 90, 64, 59, 58, 56, 58, 61, 65, 68, 74, 75,
+          83, 83, 90, 91, 94, 95, 66, 60, 59, 57, 59, 62, 66, 69, 75, 76, 84,
+          85, 91, 92, 94, 95, 67, 61, 60, 58, 59, 63, 66, 70, 74, 77, 82, 85,
+          91, 93, 96, 96, 68, 62, 61, 58, 59, 64, 65, 71, 72, 78, 81, 86, 89,
+          94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, 71, 79, 79, 87, 87, 95,
+          95, 98}},
+        {{32,  31,  31,  32,  32,  36,  36,  44,  44,  53,  53,  65,  65,  79,
+          79,  87,  31,  32,  32,  32,  32,  35,  35,  42,  42,  51,  51,  62,
+          62,  75,  75,  82,  31,  32,  32,  32,  32,  35,  35,  42,  42,  51,
+          51,  62,  62,  75,  75,  82,  31,  32,  32,  33,  33,  34,  34,  41,
+          41,  49,  49,  59,  59,  72,  72,  78,  31,  32,  32,  33,  33,  34,
+          34,  41,  41,  49,  49,  59,  59,  72,  72,  78,  32,  32,  32,  34,
+          34,  36,  36,  42,  42,  50,  50,  59,  59,  71,  71,  77,  32,  32,
+          32,  34,  34,  36,  36,  42,  42,  50,  50,  59,  59,  71,  71,  77,
+          32,  33,  33,  35,  35,  38,  38,  42,  42,  49,  49,  58,  58,  69,
+          69,  75,  32,  33,  33,  35,  35,  38,  38,  42,  42,  49,  49,  58,
+          58,  69,  69,  75,  34,  34,  34,  37,  37,  42,  42,  48,  48,  54,
+          54,  63,  63,  73,  73,  79,  34,  34,  34,  37,  37,  42,  42,  48,
+          48,  54,  54,  63,  63,  73,  73,  79,  36,  34,  34,  38,  38,  48,
+          48,  54,  54,  60,  60,  68,  68,  78,  78,  84,  36,  34,  34,  38,
+          38,  48,  48,  54,  54,  60,  60,  68,  68,  78,  78,  84,  39,  37,
+          37,  40,  40,  50,  50,  58,  58,  65,  65,  73,  73,  84,  84,  89,
+          39,  37,  37,  40,  40,  50,  50,  58,  58,  65,  65,  73,  73,  84,
+          84,  89,  44,  41,  41,  43,  43,  53,  53,  63,  63,  71,  71,  79,
+          79,  90,  90,  95,  44,  41,  41,  43,  43,  53,  53,  63,  63,  71,
+          71,  79,  79,  90,  90,  95,  48,  45,  45,  46,  46,  56,  56,  67,
+          67,  76,  76,  85,  85,  96,  96,  102, 48,  45,  45,  46,  46,  56,
+          56,  67,  67,  76,  76,  85,  85,  96,  96,  102, 53,  49,  49,  50,
+          50,  60,  60,  71,  71,  82,  82,  92,  92,  103, 103, 109, 53,  49,
+          49,  50,  50,  60,  60,  71,  71,  82,  82,  92,  92,  103, 103, 109,
+          58,  54,  54,  54,  54,  63,  63,  75,  75,  87,  87,  98,  98,  110,
+          110, 116, 58,  54,  54,  54,  54,  63,  63,  75,  75,  87,  87,  98,
+          98,  110, 110, 116, 65,  60,  60,  58,  58,  68,  68,  79,  79,  92,
+          92,  105, 105, 118, 118, 124, 65,  60,  60,  58,  58,  68,  68,  79,
+          79,  92,  92,  105, 105, 118, 118, 124, 71,  65,  65,  63,  63,  73,
+          73,  84,  84,  97,  97,  111, 111, 125, 125, 132, 71,  65,  65,  63,
+          63,  73,  73,  84,  84,  97,  97,  111, 111, 125, 125, 132, 79,  72,
+          72,  70,  70,  79,  79,  90,  90,  104, 104, 118, 118, 133, 133, 141,
+          79,  72,  72,  70,  70,  79,  79,  90,  90,  104, 104, 118, 118, 133,
+          133, 141, 82,  75,  75,  72,  72,  81,  81,  92,  92,  106, 106, 121,
+          121, 136, 136, 144, 82,  75,  75,  72,  72,  81,  81,  92,  92,  106,
+          106, 121, 121, 136, 136, 144, 87,  79,  79,  76,  76,  84,  84,  96,
+          96,  109, 109, 124, 124, 141, 141, 149},
+         {32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31,
+          31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31,
+          31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32,
+          40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40,
+          40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43,
+          47, 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47,
+          47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47,
+          45, 45, 47, 47, 50, 50, 54, 54, 57, 37, 40, 40, 47, 47, 47, 47, 45,
+          45, 47, 47, 50, 50, 54, 54, 57, 42, 43, 43, 47, 47, 50, 50, 49, 49,
+          50, 50, 53, 53, 57, 57, 60, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50,
+          50, 53, 53, 57, 57, 60, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54,
+          57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57,
+          57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60,
+          64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64,
+          64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67,
+          69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69,
+          50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 50,
+          46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48,
+          48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48,
+          47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49,
+          49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49,
+          55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56,
+          56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56,
+          64, 64, 70, 70, 76, 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65,
+          65, 72, 72, 79, 79, 85, 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65,
+          72, 72, 79, 79, 85, 85, 88, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75,
+          75, 82, 82, 89, 89, 92, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75,
+          82, 82, 89, 89, 92, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83,
+          83, 90, 90, 93, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83,
+          90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, 69, 77, 77, 84, 84, 92,
+          92, 95}},
+        {{32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53,  53,  62,  65,  73,  79,
+          31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51,  51,  60,  62,  70,  75,
+          31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51,  51,  59,  62,  69,  75,
+          31, 32, 32, 32, 32, 33, 35, 36, 41, 42, 50,  50,  58,  60,  67,  73,
+          31, 32, 32, 32, 33, 33, 34, 36, 41, 41, 49,  49,  57,  59,  66,  72,
+          31, 32, 32, 33, 33, 34, 35, 37, 41, 42, 49,  49,  57,  59,  66,  71,
+          32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50,  50,  57,  59,  65,  71,
+          32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49,  49,  56,  59,  65,  70,
+          32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49,  49,  56,  58,  64,  69,
+          32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50,  50,  56,  58,  64,  69,
+          34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54,  54,  61,  63,  69,  73,
+          34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54,  54,  61,  63,  69,  73,
+          35, 34, 34, 37, 38, 42, 47, 48, 52, 53, 59,  59,  65,  67,  73,  77,
+          36, 35, 34, 37, 38, 43, 48, 49, 54, 54, 60,  60,  66,  68,  74,  78,
+          38, 36, 36, 38, 40, 44, 49, 51, 56, 57, 63,  63,  69,  71,  77,  81,
+          39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65,  65,  71,  73,  79,  84,
+          41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67,  67,  74,  76,  81,  86,
+          44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71,  71,  78,  79,  85,  90,
+          44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72,  72,  79,  81,  86,  91,
+          48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76,  76,  83,  85,  91,  96,
+          48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76,  76,  83,  85,  91,  96,
+          53, 49, 49, 49, 49, 54, 59, 62, 71, 71, 81,  81,  89,  91,  98,  103,
+          53, 50, 49, 50, 50, 54, 60, 63, 71, 72, 82,  82,  90,  92,  99,  103,
+          57, 53, 52, 52, 52, 57, 62, 65, 74, 75, 85,  85,  94,  96,  103, 108,
+          58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87,  87,  95,  98,  105, 110,
+          61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89,  89,  98,  101, 108, 114,
+          65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92,  92,  102, 105, 112, 118,
+          67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94,  94,  103, 106, 114, 120,
+          71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97,  97,  108, 111, 119, 125,
+          72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98,  98,  108, 111, 119, 125,
+          79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+          79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133},
+         {32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31,
+          31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31,
+          31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32,
+          37, 39, 42, 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37,
+          40, 42, 46, 46, 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41,
+          44, 46, 46, 45, 45, 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45,
+          47, 46, 46, 46, 47, 47, 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47,
+          47, 45, 46, 47, 47, 50, 51, 53, 55, 37, 40, 40, 45, 47, 47, 47, 47,
+          45, 46, 47, 47, 49, 50, 52, 54, 37, 40, 40, 45, 47, 47, 48, 47, 46,
+          46, 47, 47, 49, 50, 53, 55, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49,
+          50, 50, 53, 53, 56, 57, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50,
+          50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, 52, 52, 53, 53, 53, 53,
+          55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 54, 56,
+          57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, 56, 56, 58, 58,
+          61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, 59, 60, 62,
+          64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, 63, 65,
+          49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, 49,
+          46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47,
+          46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46,
+          46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47,
+          47, 50, 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47,
+          50, 54, 56, 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51,
+          55, 57, 62, 62, 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55,
+          57, 62, 63, 68, 68, 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58,
+          63, 63, 69, 69, 74, 75, 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64,
+          64, 70, 70, 75, 76, 79, 82, 58, 53, 53, 51, 51, 54, 57, 59, 64, 65,
+          71, 71, 76, 77, 80, 83, 60, 55, 54, 53, 52, 55, 58, 60, 65, 66, 72,
+          72, 77, 79, 82, 85, 60, 55, 55, 53, 53, 55, 59, 60, 65, 66, 73, 73,
+          78, 79, 83, 85, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80,
+          82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82,
+          86, 89}},
+        {{32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58,  65,  65,
+          31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56,  63,  63,
+          31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 51, 55,  62,  62,
+          31, 32, 32, 32, 32, 32, 34, 35, 37, 41, 41, 48, 50, 54,  61,  61,
+          31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53,  59,  59,
+          31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53,  59,  59,
+          31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, 49, 53,  59,  59,
+          32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53,  59,  59,
+          32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53,  58,  58,
+          32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52,  58,  58,
+          32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52,  58,  58,
+          33, 33, 33, 35, 36, 36, 40, 41, 43, 46, 46, 52, 53, 56,  62,  62,
+          34, 34, 34, 35, 37, 37, 41, 42, 44, 48, 48, 53, 54, 57,  63,  63,
+          34, 34, 34, 35, 37, 37, 43, 44, 46, 50, 50, 55, 56, 59,  65,  65,
+          36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63,  68,  68,
+          36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63,  68,  68,
+          38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67,  72,  72,
+          39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68,  73,  73,
+          41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70,  76,  76,
+          44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74,  79,  79,
+          44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74,  79,  79,
+          47, 44, 44, 44, 45, 45, 53, 56, 59, 66, 66, 73, 75, 78,  84,  84,
+          48, 45, 45, 45, 46, 46, 54, 56, 60, 67, 67, 74, 76, 79,  85,  85,
+          50, 47, 46, 47, 47, 47, 55, 58, 61, 68, 68, 76, 78, 82,  88,  88,
+          53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86,  92,  92,
+          53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86,  92,  92,
+          57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90,  97,  97,
+          58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91,  98,  98,
+          61, 57, 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93,  100, 100,
+          65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97,  105, 105,
+          65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97,  105, 105,
+          70, 65, 64, 63, 62, 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109},
+         {32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31,
+          31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31,
+          31, 34, 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32,
+          34, 39, 39, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35,
+          40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40,
+          40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42,
+          46, 47, 46, 45, 45, 47, 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46,
+          47, 46, 46, 46, 47, 47, 49, 51, 51, 35, 37, 37, 40, 44, 44, 46, 47,
+          46, 45, 45, 47, 47, 48, 51, 51, 37, 39, 40, 43, 47, 47, 47, 47, 47,
+          45, 45, 46, 47, 48, 50, 50, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45,
+          45, 46, 47, 48, 50, 50, 41, 42, 42, 44, 47, 47, 49, 49, 49, 48, 48,
+          49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, 49, 50, 50, 49, 49, 50,
+          50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, 51, 51, 51, 52, 52,
+          53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55,
+          57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57,
+          57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, 59, 59,
+          48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, 48,
+          46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46,
+          45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45,
+          45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46,
+          46, 46, 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46,
+          46, 52, 54, 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47,
+          52, 54, 56, 60, 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53,
+          54, 57, 61, 61, 65, 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54,
+          57, 61, 61, 65, 66, 68, 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58,
+          62, 62, 67, 68, 70, 73, 73, 54, 51, 50, 49, 49, 49, 54, 55, 58, 62,
+          62, 67, 68, 70, 73, 73, 55, 51, 51, 50, 49, 49, 54, 56, 58, 63, 63,
+          68, 69, 71, 74, 74, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69,
+          70, 73, 76, 76, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, 70,
+          73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, 61, 65, 65, 70, 72, 74,
+          78, 78}},
+        {{32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31,
+          32, 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32,
+          32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32,
+          32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32,
+          32, 32, 33, 34, 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32,
+          33, 33, 34, 34, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33,
+          33, 35, 35, 36, 41, 41, 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34,
+          36, 36, 38, 42, 42, 45, 49, 49, 54, 32, 32, 32, 33, 34, 34, 34, 36,
+          36, 38, 42, 42, 45, 50, 50, 54, 32, 32, 32, 33, 34, 34, 35, 37, 37,
+          38, 42, 42, 45, 49, 49, 54, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39,
+          42, 42, 45, 49, 49, 53, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42,
+          42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, 36, 39, 40, 41, 44, 44,
+          47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50,
+          54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54,
+          54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, 53, 57, 57,
+          61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64,
+          36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 38,
+          37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38,
+          37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37,
+          38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40,
+          42, 42, 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42,
+          43, 45, 52, 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43,
+          45, 52, 54, 56, 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47,
+          54, 56, 58, 66, 66, 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55,
+          56, 59, 67, 67, 70, 76, 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57,
+          60, 67, 67, 71, 77, 77, 81, 53, 50, 49, 49, 49, 49, 51, 58, 59, 62,
+          71, 71, 74, 81, 81, 86, 53, 51, 49, 49, 50, 50, 51, 59, 60, 63, 71,
+          71, 75, 82, 82, 87, 55, 52, 51, 51, 51, 51, 53, 60, 61, 64, 72, 72,
+          76, 83, 83, 88, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79,
+          87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87,
+          87, 92},
+         {32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31,
+          31, 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31,
+          31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31,
+          32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32,
+          38, 39, 40, 45, 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38,
+          40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40,
+          41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43,
+          46, 47, 46, 45, 45, 46, 47, 47, 49, 33, 35, 36, 36, 41, 43, 44, 46,
+          47, 46, 46, 46, 46, 47, 47, 49, 34, 36, 37, 37, 42, 44, 45, 47, 47,
+          47, 45, 45, 46, 47, 47, 49, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47,
+          45, 45, 46, 47, 47, 48, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45,
+          45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, 47, 48, 48, 48, 47, 47,
+          47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50,
+          50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50,
+          50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, 52, 52, 52,
+          54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55,
+          49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 48,
+          47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46,
+          46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46,
+          46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45,
+          46, 46, 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45,
+          46, 47, 52, 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46,
+          47, 52, 53, 55, 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48,
+          53, 54, 55, 59, 59, 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53,
+          54, 55, 59, 59, 61, 64, 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54,
+          55, 60, 60, 61, 64, 64, 66, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56,
+          61, 61, 63, 66, 66, 68, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61,
+          61, 63, 66, 66, 68, 53, 50, 48, 48, 48, 48, 49, 54, 54, 56, 61, 61,
+          63, 67, 67, 69, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65,
+          68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68,
+          68, 71}},
+        {{32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31,
+          31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32,
+          32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32,
+          32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32,
+          32, 32, 32, 33, 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32,
+          33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33,
+          33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33,
+          33, 34, 35, 35, 38, 41, 41, 44, 49, 31, 32, 32, 32, 33, 34, 34, 34,
+          35, 36, 36, 39, 42, 42, 44, 49, 32, 32, 32, 32, 33, 34, 34, 34, 36,
+          36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36,
+          36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 35, 35, 35, 37, 37, 37,
+          40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41,
+          42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42,
+          42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, 40, 42, 44, 44,
+          47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50,
+          54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54,
+          34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, 35,
+          35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35,
+          34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34,
+          34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36,
+          37, 40, 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39,
+          40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40,
+          40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41,
+          43, 49, 51, 51, 56, 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45,
+          51, 53, 53, 59, 63, 63, 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51,
+          53, 53, 59, 63, 63, 66, 71, 44, 43, 42, 42, 42, 43, 43, 45, 51, 54,
+          54, 59, 64, 64, 67, 72, 47, 45, 44, 44, 44, 45, 45, 47, 53, 56, 56,
+          61, 66, 66, 69, 75, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62,
+          67, 67, 70, 76, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67,
+          67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, 56, 58, 58, 64, 69, 69,
+          73, 79},
+         {32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31,
+          31, 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31,
+          31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31,
+          31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32,
+          34, 39, 39, 40, 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35,
+          40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40,
+          40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40,
+          41, 45, 46, 46, 45, 45, 45, 46, 48, 33, 34, 35, 35, 37, 42, 42, 43,
+          46, 47, 47, 46, 45, 45, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46,
+          47, 47, 46, 46, 46, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, 47,
+          47, 46, 46, 46, 46, 47, 35, 37, 38, 38, 41, 45, 45, 46, 47, 47, 47,
+          46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46,
+          45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45,
+          45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+          47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50,
+          50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50,
+          43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, 47,
+          46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47,
+          46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46,
+          46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46,
+          46, 47, 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46,
+          47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47,
+          47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46,
+          47, 51, 53, 53, 55, 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47,
+          51, 53, 53, 56, 58, 58, 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51,
+          53, 53, 56, 58, 58, 59, 61, 49, 47, 45, 45, 45, 46, 46, 47, 52, 53,
+          53, 56, 58, 58, 60, 62, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54,
+          57, 59, 59, 61, 63, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57,
+          59, 59, 61, 64, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59,
+          59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, 52, 54, 54, 58, 60, 60,
+          62, 65}},
+        {{32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31,
+          31, 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 34, 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32,
+          33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33,
+          33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33,
+          33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 33, 33, 33, 33,
+          34, 35, 35, 35, 38, 41, 41, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
+          36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36,
+          36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36,
+          36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 36, 37, 37, 37,
+          40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+          42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42,
+          42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42,
+          33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, 34,
+          34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34,
+          34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34,
+          34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34,
+          34, 36, 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34,
+          36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36,
+          38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38,
+          38, 38, 43, 48, 48, 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39,
+          39, 44, 49, 49, 49, 52, 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40,
+          45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45,
+          50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50,
+          50, 50, 54, 58, 58, 41, 40, 39, 39, 39, 40, 42, 42, 42, 46, 52, 52,
+          52, 56, 60, 60, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53,
+          58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58,
+          63, 63},
+         {32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31,
+          31, 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31,
+          31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31,
+          31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31,
+          31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32,
+          35, 39, 39, 39, 42, 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35,
+          40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40,
+          40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40,
+          40, 42, 46, 46, 46, 45, 45, 45, 32, 33, 34, 34, 34, 37, 41, 41, 41,
+          44, 46, 46, 46, 46, 45, 45, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45,
+          47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47,
+          47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47,
+          47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, 45, 46, 47, 47, 47,
+          46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46,
+          45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45,
+          45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45,
+          39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, 42,
+          42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42,
+          43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43,
+          43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44,
+          44, 46, 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46,
+          47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47,
+          48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48,
+          48, 48, 50, 53, 53, 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47,
+          47, 50, 53, 53, 53, 54, 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47,
+          50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50,
+          53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53,
+          53, 53, 54, 56, 56, 48, 47, 45, 45, 45, 46, 46, 46, 46, 49, 53, 53,
+          53, 55, 57, 57, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53,
+          56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56,
+          58, 58}},
+        {{32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31,
+          31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+          34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+          35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36,
+          36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36,
+          36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+          36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+          32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, 32,
+          32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32,
+          32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32,
+          33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33,
+          33, 33, 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33,
+          33, 35, 36, 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34,
+          35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35,
+          36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36,
+          37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37,
+          37, 37, 40, 43, 44, 44, 44, 35, 35, 34, 34, 34, 34, 36, 37, 38, 38,
+          38, 41, 45, 47, 47, 47, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38,
+          42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42,
+          46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46,
+          48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, 39, 39, 39, 42, 46, 49,
+          49, 49},
+         {32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31,
+          31, 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31,
+          31, 31, 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31,
+          31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31,
+          31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31,
+          31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32,
+          34, 37, 39, 39, 39, 41, 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34,
+          38, 39, 39, 39, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38,
+          40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40,
+          40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40,
+          40, 42, 44, 46, 46, 46, 31, 32, 33, 33, 33, 33, 36, 39, 41, 41, 41,
+          43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, 37, 40, 42, 42, 42, 44,
+          46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46,
+          47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47,
+          47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47,
+          47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, 47, 47,
+          36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, 37,
+          38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38,
+          39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39,
+          40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41,
+          41, 41, 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42,
+          42, 44, 46, 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43,
+          44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44,
+          46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46,
+          47, 47, 47, 48, 49, 50, 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47,
+          47, 47, 49, 50, 51, 51, 51, 47, 46, 46, 46, 46, 46, 46, 47, 48, 48,
+          48, 49, 51, 52, 52, 52, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48,
+          50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50,
+          52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52,
+          53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 52, 53,
+          53, 53}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31,
+          31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+          33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+          33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34,
+          34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32,
+          32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32,
+          32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32,
+          32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32,
+          32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32,
+          32, 32, 33, 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33,
+          33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33,
+          33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33,
+          34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34,
+          34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34,
+          35, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36,
+          36, 36, 36, 36, 38, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+          36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+          37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37,
+          38, 39},
+         {32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31,
+          31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31,
+          31, 31, 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31,
+          31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31,
+          31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31,
+          31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31,
+          31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31,
+          32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 32, 32, 32, 32,
+          34, 36, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 32, 34,
+          37, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37,
+          40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40,
+          40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40,
+          40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40,
+          40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, 40, 40, 40, 40,
+          41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, 41, 41, 42,
+          44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, 43, 44,
+          33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33,
+          34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34,
+          35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35,
+          35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37,
+          37, 37, 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38,
+          38, 38, 39, 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39,
+          39, 40, 42, 44, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40,
+          41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41,
+          43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43,
+          45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45,
+          47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 41, 42, 43, 45, 47,
+          47, 47, 47, 47, 48, 40, 41, 41, 42, 42, 42, 42, 42, 44, 45, 47, 47,
+          47, 47, 47, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47,
+          47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47,
+          48, 48}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+          34, 34},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33,
+          34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34,
+          36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36,
+          38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38,
+          40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40,
+          30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30,
+          31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32,
+          32, 33, 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33,
+          33, 33, 33, 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34,
+          34, 34, 34, 34, 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35,
+          35, 35, 35, 35, 36, 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36,
+          36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36,
+          36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36,
+          36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36,
+          37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37,
+          38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38,
+          40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 36, 38, 39, 40,
+          42, 44}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+          31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32}}};
+constexpr uint8_t
+    kQuantizerMatrix4x4[kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes]
+                       [10] = {{{32, 43, 67, 73, 94, 137, 97, 110, 150, 200},
+                                {35, 46, 60, 57, 69, 90, 66, 71, 90, 109}},
+                               {{32, 41, 63, 69, 88, 127, 92, 103, 140, 184},
+                                {33, 45, 58, 56, 66, 86, 64, 69, 87, 105}},
+                               {{32, 38, 56, 63, 78, 113, 86, 97, 130, 169},
+                                {32, 45, 55, 53, 62, 80, 63, 67, 84, 101}},
+                               {{32, 37, 54, 58, 72, 102, 81, 91, 121, 156},
+                                {32, 45, 54, 51, 59, 75, 61, 65, 81, 97}},
+                               {{32, 34, 49, 53, 64, 91, 75, 81, 112, 140},
+                                {32, 46, 53, 49, 55, 70, 58, 62, 78, 91}},
+                               {{32, 34, 48, 49, 60, 82, 72, 79, 104, 134},
+                                {32, 46, 53, 47, 54, 66, 57, 60, 75, 89}},
+                               {{32, 33, 39, 45, 51, 71, 62, 64, 87, 108},
+                                {31, 42, 48, 47, 50, 61, 53, 54, 67, 78}},
+                               {{32, 33, 38, 42, 46, 63, 55, 57, 75, 92},
+                                {31, 41, 48, 46, 48, 58, 51, 51, 62, 71}},
+                               {{32, 32, 35, 38, 40, 54, 51, 49, 64, 81},
+                                {31, 38, 47, 47, 46, 54, 49, 46, 57, 66}},
+                               {{32, 32, 34, 35, 37, 48, 43, 43, 54, 65},
+                                {31, 37, 44, 47, 47, 53, 47, 45, 53, 59}},
+                               {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54},
+                                {31, 34, 39, 42, 45, 48, 47, 46, 49, 54}},
+                               {{32, 32, 32, 32, 33, 35, 35, 35, 38, 46},
+                                {31, 32, 34, 38, 41, 47, 46, 46, 47, 52}},
+                               {{31, 32, 32, 32, 32, 33, 32, 33, 34, 35},
+                                {31, 31, 32, 34, 35, 39, 38, 40, 43, 47}},
+                               {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33},
+                                {31, 31, 31, 31, 31, 32, 34, 35, 35, 39}},
+                               {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32},
+                                {31, 31, 31, 31, 31, 31, 31, 31, 31, 31}}};
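The innermost extents used by these tables are triangular numbers: the 10 per-plane values of kQuantizerMatrix4x4 above, and the 36 and 528 of kQuantizerMatrix8x8 and kQuantizerMatrix32x32 below, equal N*(N+1)/2 for N = 4, 8 and 32. That is consistent with each square quantizer matrix being symmetric, so only one triangle (diagonal included) needs storing; the outer dimensions are the 15 quantizer-matrix levels and 2 plane types visible in the table itself. A minimal, self-contained sketch of that size relationship (TriangularSize is an illustrative helper, not a libgav1 symbol):

#include <cstddef>

// Number of elements in one triangle, diagonal included, of an N x N
// symmetric matrix.
constexpr std::size_t TriangularSize(std::size_t n) { return n * (n + 1) / 2; }

// These match the innermost array extents of the tables in this file.
static_assert(TriangularSize(4) == 10, "4x4 matrices store 10 values");
static_assert(TriangularSize(8) == 36, "8x8 matrices store 36 values");
static_assert(TriangularSize(32) == 528, "32x32 matrices store 528 values");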
+constexpr uint8_t kQuantizerMatrix8x8
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][36] = {
+        {{32,  32,  35,  38,  40,  54,  51,  49,  65,  82,  68,  63,
+          78,  97,  117, 84,  76,  91,  111, 134, 152, 95,  89,  98,
+          113, 138, 159, 183, 109, 102, 106, 121, 142, 168, 199, 220},
+         {31, 38, 47, 47,  46, 54, 50, 47, 57, 66, 57,  52,
+          61, 72, 82, 63,  57, 66, 77, 88, 96, 67, 62,  67,
+          75, 86, 95, 104, 71, 67, 68, 75, 84, 95, 107, 113}},
+        {{32,  32,  35,  37,  39,  51, 47,  46,  60,  73,  62,  58,
+          71,  87,  105, 78,  72,  84, 100, 121, 140, 90,  84,  93,
+          106, 129, 148, 169, 102, 96, 100, 113, 132, 155, 183, 201},
+         {31, 38, 47, 47,  47, 53, 48, 46, 55, 62, 54,  50,
+          58, 67, 76, 61,  55, 63, 72, 83, 91, 66, 61,  65,
+          73, 84, 92, 101, 69, 65, 66, 73, 82, 92, 103, 109}},
+        {{32,  32,  34,  35,  37, 48, 46, 45,  56,  70,  57,  54,
+          64,  80,  93,  76,  70, 79, 96, 111, 134, 85,  79,  87,
+          100, 121, 138, 156, 96, 90, 93, 105, 122, 144, 168, 184},
+         {31, 36, 43, 47, 47, 53, 48, 46, 54, 61, 52, 49,
+          55, 65, 71, 60, 55, 60, 70, 78, 89, 64, 59, 63,
+          71, 81, 89, 97, 67, 63, 64, 71, 79, 89, 99, 104}},
+        {{32, 32,  33,  35,  36, 46, 42, 42,  52,  63,  53,  51,
+          60, 73,  86,  68,  64, 72, 84, 100, 117, 78,  74,  80,
+          92, 109, 128, 140, 90, 84, 87, 98,  114, 133, 155, 168},
+         {31, 34, 39, 46, 47, 52, 47, 45, 52, 58, 50, 48,
+          54, 62, 68, 57, 53, 58, 65, 73, 82, 61, 57, 61,
+          68, 77, 86, 91, 65, 61, 62, 68, 76, 86, 95, 100}},
+        {{32, 32,  33,  34,  35, 39, 39, 40, 46,  56,  50,  48,
+          53, 65,  78,  62,  59, 63, 75, 90, 105, 76,  71,  74,
+          86, 101, 118, 134, 84, 79, 81, 92, 106, 123, 142, 153},
+         {31, 34, 39, 42, 45, 48, 47, 46, 49, 55, 49, 47,
+          50, 58, 65, 54, 51, 53, 61, 69, 76, 60, 56, 57,
+          65, 73, 82, 89, 64, 59, 60, 66, 74, 83, 92, 96}},
+        {{32, 32, 33,  34,  35, 39, 38, 39, 45, 54,  46,  45,
+          51, 61, 71,  56,  54, 58, 69, 80, 92, 68,  64,  68,
+          78, 90, 103, 117, 78, 74, 76, 86, 99, 113, 128, 140},
+         {31, 34, 39, 42, 45, 48, 47, 46, 49, 54, 48, 46,
+          50, 56, 61, 52, 49, 52, 58, 65, 71, 57, 53, 55,
+          61, 68, 75, 82, 61, 57, 58, 64, 71, 79, 86, 91}},
+        {{31, 32, 32, 32, 33, 35, 35, 35, 38, 48, 42,  41,
+          43, 54, 63, 51, 49, 49, 59, 71, 81, 59, 56,  56,
+          66, 77, 89, 98, 69, 65, 64, 73, 85, 97, 108, 119},
+         {31, 32, 35, 38, 42, 47, 48, 47, 48, 53, 47, 45,
+          45, 53, 58, 50, 47, 47, 54, 61, 66, 53, 50, 49,
+          56, 63, 69, 73, 57, 54, 52, 58, 65, 72, 77, 82}},
+        {{31, 32, 32, 32, 32, 35, 34, 34, 37, 42, 38, 37,
+          40, 47, 54, 46, 44, 45, 52, 60, 69, 52, 49, 49,
+          56, 65, 75, 82, 63, 59, 58, 65, 73, 84, 92, 105},
+         {31, 31, 32, 38, 40, 47, 44, 44, 47, 50, 47, 45,
+          46, 51, 54, 48, 46, 46, 51, 56, 61, 50, 47, 47,
+          52, 57, 63, 66, 55, 52, 50, 54, 60, 66, 70, 76}},
+        {{31, 32, 32, 32, 32, 34, 34, 33, 35, 39, 35, 34,
+          37, 42, 48, 41, 40, 41, 47, 53, 60, 47, 44, 45,
+          51, 57, 65, 71, 53, 50, 51, 55, 61, 70, 77, 85},
+         {31, 31, 32, 35, 36, 41, 42, 42, 45, 48, 48, 46,
+          47, 50, 53, 47, 45, 45, 49, 53, 57, 49, 46, 46,
+          50, 54, 59, 61, 51, 48, 48, 51, 54, 60, 64, 68}},
+        {{31, 31, 32, 32, 32, 33, 32, 32, 34, 35, 34, 34,
+          35, 37, 41, 37, 36, 38, 39, 45, 51, 43, 41, 42,
+          42, 49, 56, 63, 47, 44, 45, 46, 52, 59, 67, 71},
+         {31, 31, 32, 34, 35, 39, 37, 40, 43, 47, 43, 43,
+          45, 47, 49, 48, 46, 46, 47, 50, 53, 47, 45, 45,
+          45, 50, 55, 58, 49, 46, 46, 46, 50, 55, 60, 61}},
+        {{31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 33, 33,
+          34, 35, 37, 34, 34, 35, 36, 39, 43, 37, 36, 37,
+          38, 41, 46, 51, 41, 39, 40, 41, 44, 49, 54, 58},
+         {31, 31, 31, 32, 33, 35, 35, 37, 39, 43, 39, 41,
+          42, 45, 47, 45, 44, 45, 47, 48, 50, 48, 46, 46,
+          47, 48, 51, 53, 48, 46, 45, 46, 47, 51, 54, 56}},
+        {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33, 32, 32,
+          32, 34, 35, 32, 33, 33, 34, 35, 36, 34, 34, 33,
+          35, 36, 38, 39, 35, 35, 34, 36, 38, 40, 42, 48},
+         {31, 31, 31, 30, 31, 32, 34, 34, 35, 39, 36, 37,
+          39, 42, 46, 39, 40, 41, 44, 47, 47, 42, 42, 42,
+          45, 47, 48, 48, 48, 47, 46, 47, 47, 49, 50, 53}},
+        {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 32, 32,
+          32, 32, 33, 32, 32, 32, 32, 33, 34, 32, 32, 32,
+          32, 34, 34, 35, 33, 33, 33, 33, 35, 35, 36, 38},
+         {31, 31, 31, 31, 31, 31, 30, 31, 31, 32, 34, 34,
+          35, 35, 39, 35, 35, 36, 36, 40, 41, 37, 38, 39,
+          40, 43, 44, 47, 40, 41, 41, 42, 44, 45, 47, 48}},
+        {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32,
+          32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 33, 33},
+         {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+          31, 31, 32, 31, 32, 32, 32, 32, 33, 33, 34, 34,
+          35, 35, 36, 39, 33, 34, 34, 35, 35, 36, 39, 39}},
+        {{31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+          32, 32, 32, 31, 31, 32, 32, 32, 32, 31, 31, 32,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+         {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31}}};
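Assuming the compact storage walks the upper triangle row by row (an assumption; the actual traversal order is fixed by the decoder's expansion code, which is not part of this hunk), a (row, column) lookup into one of these per-plane arrays could be sketched as follows. UpperTriangularIndex and QuantizerMatrixValue are hypothetical names used for illustration only:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Index of (row, col) in a row-major upper-triangular array of an n x n
// symmetric matrix, where row i stores its (n - i) elements starting at the
// diagonal.
constexpr std::size_t UpperTriangularIndex(std::size_t n, std::size_t row,
                                           std::size_t col) {
  // By symmetry, (row, col) and (col, row) map to the same stored value.
  const std::size_t r = std::min(row, col);
  const std::size_t c = std::max(row, col);
  // Rows 0..r-1 contribute n + (n - 1) + ... + (n - r + 1)
  // = r*n - r*(r-1)/2 elements before row r begins.
  return r * n - r * (r - 1) / 2 + (c - r);
}

// Example: reading one value of an 8x8 matrix stored as 36 bytes, e.g. one
// plane-type entry of kQuantizerMatrix8x8.
inline uint8_t QuantizerMatrixValue(const uint8_t (&triangle)[36],
                                    std::size_t row, std::size_t col) {
  return triangle[UpperTriangularIndex(8, row, col)];
}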
+constexpr uint8_t kQuantizerMatrix32x32
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][528] = {
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  33,
+          33,  32,  32,  32,  33,  34,  35,  34,  34,  33,  34,  35,  37,  39,
+          35,  34,  34,  35,  36,  37,  41,  43,  36,  35,  34,  35,  36,  38,
+          42,  45,  48,  39,  38,  37,  38,  39,  40,  45,  47,  50,  54,  44,
+          42,  41,  41,  42,  42,  47,  50,  54,  58,  63,  46,  44,  42,  43,
+          44,  44,  49,  52,  55,  59,  65,  67,  48,  46,  44,  45,  45,  46,
+          51,  53,  57,  61,  67,  69,  71,  54,  51,  49,  49,  50,  49,  54,
+          57,  60,  65,  71,  74,  76,  82,  59,  56,  54,  54,  54,  53,  58,
+          61,  64,  69,  75,  78,  80,  87,  92,  62,  59,  56,  56,  56,  55,
+          60,  63,  66,  71,  77,  80,  83,  89,  95,  98,  65,  62,  59,  59,
+          59,  58,  63,  65,  68,  73,  79,  82,  85,  92,  98,  101, 105, 71,
+          68,  65,  64,  64,  63,  68,  70,  73,  78,  84,  87,  90,  97,  103,
+          107, 111, 117, 80,  76,  72,  72,  71,  69,  74,  76,  79,  84,  90,
+          93,  96,  104, 110, 114, 118, 125, 134, 81,  77,  73,  73,  72,  70,
+          75,  77,  80,  85,  91,  94,  97,  105, 111, 115, 119, 126, 135, 137,
+          83,  78,  75,  74,  74,  72,  76,  79,  81,  86,  92,  95,  99,  106,
+          113, 117, 121, 128, 137, 138, 140, 88,  84,  80,  79,  78,  76,  80,
+          82,  85,  91,  95,  98,  103, 111, 115, 119, 126, 134, 139, 144, 147,
+          152, 91,  86,  83,  82,  81,  79,  81,  84,  88,  92,  95,  100, 107,
+          110, 115, 123, 127, 132, 140, 147, 151, 154, 159, 94,  89,  86,  85,
+          84,  82,  82,  86,  90,  92,  97,  103, 105, 111, 119, 121, 128, 136,
+          139, 146, 156, 158, 161, 166, 97,  92,  90,  88,  86,  85,  84,  89,
+          91,  95,  100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163,
+          166, 168, 174, 101, 95,  93,  91,  89,  89,  87,  91,  93,  98,  101,
+          105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176,
+          183, 104, 99,  97,  94,  93,  93,  90,  92,  96,  100, 102, 108, 111,
+          116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191,
+          107, 102, 101, 97,  96,  96,  93,  93,  99,  101, 105, 110, 113, 120,
+          122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200,
+          111, 105, 104, 101, 100, 99,  97,  96,  102, 103, 109, 111, 117, 120,
+          125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202,
+          210, 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119,
+          121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204,
+          210, 212, 220, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112,
+          117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193,
+          197, 210, 214, 220, 222, 231, 123, 116, 116, 111, 111, 109, 110, 107,
+          107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176,
+          177, 190, 191, 204, 206, 222, 224, 230, 232, 242},
+         {32,  31,  31,  30,  31,  32,  32,  33,  33,  35,  33,  34,  35,  37,
+          39,  36,  38,  40,  41,  43,  47,  41,  42,  42,  43,  45,  47,  48,
+          45,  45,  44,  45,  46,  47,  49,  50,  49,  47,  46,  47,  47,  48,
+          50,  51,  53,  48,  47,  45,  46,  46,  46,  49,  51,  53,  54,  49,
+          47,  45,  45,  45,  45,  49,  51,  53,  55,  58,  50,  47,  45,  46,
+          46,  46,  49,  51,  54,  56,  59,  60,  50,  48,  46,  46,  46,  46,
+          50,  52,  54,  56,  60,  60,  61,  52,  50,  47,  47,  47,  47,  50,
+          52,  54,  57,  61,  62,  63,  66,  54,  52,  49,  49,  49,  48,  52,
+          53,  55,  58,  62,  64,  65,  68,  71,  56,  53,  51,  50,  50,  49,
+          52,  54,  56,  59,  63,  64,  66,  69,  72,  73,  57,  54,  52,  51,
+          51,  50,  53,  55,  56,  60,  63,  65,  67,  70,  73,  75,  76,  60,
+          57,  54,  54,  53,  52,  55,  57,  58,  61,  65,  67,  68,  72,  75,
+          77,  79,  82,  63,  60,  57,  57,  56,  54,  57,  59,  60,  63,  67,
+          69,  71,  75,  78,  80,  82,  85,  89,  64,  61,  58,  57,  57,  55,
+          58,  59,  61,  64,  67,  69,  71,  75,  78,  80,  82,  85,  89,  90,
+          65,  61,  58,  58,  57,  55,  58,  60,  61,  64,  68,  70,  71,  75,
+          79,  81,  83,  86,  90,  91,  91,  67,  63,  61,  60,  59,  57,  60,
+          61,  63,  66,  69,  70,  73,  77,  79,  81,  85,  88,  90,  92,  94,
+          96,  68,  64,  62,  61,  60,  58,  59,  61,  64,  66,  67,  71,  74,
+          75,  78,  82,  84,  86,  90,  93,  94,  96,  98,  69,  65,  63,  62,
+          61,  59,  59,  62,  64,  65,  68,  71,  72,  75,  79,  80,  83,  87,
+          89,  92,  96,  97,  98,  100, 70,  66,  64,  63,  62,  61,  60,  63,
+          64,  66,  69,  70,  73,  76,  77,  81,  84,  85,  89,  92,  93,  98,
+          99,  100, 102, 71,  67,  66,  64,  63,  62,  61,  63,  64,  67,  68,
+          70,  74,  75,  78,  81,  83,  86,  88,  91,  94,  95,  100, 101, 102,
+          104, 72,  68,  67,  65,  64,  64,  61,  63,  65,  67,  68,  71,  73,
+          75,  78,  79,  84,  85,  88,  91,  93,  97,  98,  102, 103, 104, 106,
+          73,  69,  68,  66,  65,  65,  63,  63,  66,  67,  69,  71,  73,  76,
+          77,  81,  82,  85,  88,  90,  94,  95,  99,  101, 104, 105, 106, 109,
+          74,  70,  70,  67,  66,  66,  64,  63,  66,  67,  70,  71,  74,  75,
+          78,  80,  82,  86,  87,  91,  92,  96,  98,  101, 104, 106, 108, 108,
+          111, 75,  71,  71,  68,  68,  67,  66,  64,  66,  68,  70,  71,  74,
+          75,  79,  79,  84,  84,  88,  90,  93,  95,  98,  101, 103, 107, 108,
+          110, 111, 113, 76,  72,  72,  69,  69,  68,  67,  65,  66,  69,  70,
+          72,  74,  76,  78,  81,  83,  85,  88,  90,  93,  95,  98,  100, 104,
+          105, 109, 111, 112, 113, 116, 78,  74,  74,  70,  70,  69,  69,  66,
+          66,  70,  70,  74,  74,  77,  78,  82,  82,  86,  87,  92,  92,  96,
+          97,  102, 102, 107, 107, 112, 113, 115, 115, 118}},
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          33,  32,  32,  32,  33,  34,  35,  32,  33,  33,  33,  34,  36,  36,
+          34,  34,  33,  34,  35,  37,  38,  39,  36,  35,  34,  35,  36,  38,
+          40,  42,  48,  38,  37,  36,  36,  38,  39,  41,  44,  50,  51,  39,
+          38,  37,  38,  39,  40,  42,  45,  50,  52,  54,  44,  42,  41,  41,
+          42,  42,  44,  47,  54,  56,  58,  63,  47,  45,  44,  44,  45,  45,
+          47,  50,  56,  58,  60,  66,  69,  49,  47,  46,  45,  46,  46,  48,
+          51,  57,  60,  62,  68,  71,  73,  54,  51,  50,  49,  50,  49,  51,
+          54,  60,  63,  65,  71,  75,  77,  82,  59,  56,  54,  54,  54,  53,
+          55,  58,  64,  67,  69,  75,  79,  81,  87,  92,  61,  58,  56,  56,
+          56,  55,  57,  60,  65,  68,  70,  77,  81,  83,  89,  94,  97,  65,
+          62,  60,  59,  59,  58,  60,  63,  68,  71,  73,  79,  84,  87,  92,
+          98,  101, 105, 71,  68,  65,  65,  64,  63,  65,  68,  73,  76,  78,
+          84,  89,  92,  97,  103, 106, 111, 117, 76,  72,  70,  69,  68,  66,
+          68,  71,  76,  79,  81,  88,  92,  95,  101, 107, 110, 115, 122, 127,
+          80,  76,  73,  72,  71,  69,  71,  74,  79,  82,  84,  90,  95,  98,
+          104, 110, 113, 118, 125, 130, 134, 83,  78,  76,  75,  74,  72,  73,
+          76,  81,  84,  86,  92,  97,  100, 106, 113, 116, 121, 128, 133, 137,
+          140, 86,  82,  79,  78,  77,  74,  76,  79,  84,  87,  89,  95,  100,
+          103, 109, 116, 119, 124, 131, 136, 140, 144, 147, 89,  85,  82,  81,
+          79,  78,  78,  82,  86,  87,  92,  97,  100, 105, 112, 114, 120, 128,
+          131, 136, 146, 147, 150, 155, 92,  88,  85,  84,  82,  81,  80,  85,
+          86,  90,  95,  97,  102, 107, 110, 117, 122, 125, 134, 138, 142, 152,
+          154, 156, 162, 95,  90,  88,  86,  85,  84,  82,  86,  88,  93,  95,
+          99,  105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163,
+          169, 98,  93,  91,  89,  88,  87,  85,  87,  90,  94,  96,  102, 104,
+          109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176,
+          101, 96,  95,  92,  91,  90,  88,  88,  93,  95,  99,  103, 106, 112,
+          114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184,
+          104, 99,  98,  95,  94,  93,  91,  90,  95,  96,  102, 103, 109, 112,
+          117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186,
+          193, 108, 102, 101, 98,  97,  96,  95,  93,  97,  100, 104, 106, 111,
+          113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188,
+          192, 194, 201, 111, 105, 105, 101, 100, 99,  98,  96,  98,  103, 105,
+          109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178,
+          181, 193, 196, 201, 202, 210, 114, 109, 109, 104, 104, 102, 102, 99,
+          100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162,
+          162, 175, 176, 187, 188, 203, 204, 210, 211, 219},
+         {32,  31,  31,  30, 31, 31,  31, 32, 32, 33,  33,  34,  35,  36,  39,
+          36,  38,  39,  40, 43, 47,  38, 40, 41, 41,  44,  47,  47,  41,  42,
+          42,  43,  45,  47, 48, 48,  49, 47, 46, 46,  47,  48,  49,  50,  53,
+          49,  47,  46,  46, 46, 47,  48, 50, 53, 53,  48,  47,  46,  45,  46,
+          46,  48,  49,  53, 54, 54,  49, 47, 45, 45,  45,  45,  47,  49,  53,
+          55,  55,  58,  50, 48, 46,  46, 46, 46, 47,  50,  54,  55,  56,  59,
+          61,  51,  48,  47, 46, 47,  46, 47, 50, 54,  55,  56,  60,  61,  62,
+          52,  50,  48,  47, 47, 47,  48, 50, 54, 56,  57,  61,  63,  64,  66,
+          54,  52,  50,  49, 49, 48,  49, 52, 55, 57,  58,  62,  64,  66,  68,
+          71,  55,  53,  51, 50, 50,  49, 50, 52, 56,  58,  59,  63,  65,  66,
+          69,  72,  73,  57, 54, 52,  51, 51, 50, 51,  53,  56,  58,  60,  63,
+          66,  67,  70,  73, 74, 76,  60, 57, 55, 54,  53,  52,  53,  55,  58,
+          60,  61,  65,  68, 69, 72,  75, 77, 79, 82,  62,  59,  57,  56,  55,
+          53,  54,  56,  59, 61, 63,  66, 69, 70, 74,  77,  78,  80,  84,  86,
+          63,  60,  58,  57, 56, 54,  55, 57, 60, 62,  63,  67,  70,  71,  75,
+          78,  79,  82,  85, 87, 89,  65, 61, 59, 58,  57,  55,  56,  58,  61,
+          63,  64,  68,  71, 72, 75,  79, 80, 83, 86,  88,  90,  91,  66,  63,
+          60,  59,  58,  56, 58, 59,  62, 64, 65, 69,  72,  73,  76,  80,  81,
+          84,  87,  90,  91, 93, 94,  67, 64, 62, 61,  59,  58,  58,  60,  63,
+          64,  66,  69,  71, 73, 77,  78, 81, 85, 86,  89,  93,  94,  95,  97,
+          68,  65,  63,  62, 60, 59,  58, 61, 62, 64,  67,  68,  71,  74,  75,
+          79,  81,  83,  87, 89, 91,  95, 96, 97, 99,  69,  66,  64,  63,  61,
+          61,  59,  61,  62, 65, 66,  68, 72, 73, 76,  78,  80,  84,  85,  88,
+          91,  92,  97,  98, 98, 101, 70, 67, 65, 63,  62,  62,  60,  61,  63,
+          65,  66,  69,  71, 73, 76,  77, 81, 83, 85,  88,  90,  94,  95,  99,
+          100, 100, 103, 71, 67, 67,  64, 63, 63, 61,  61,  64,  65,  67,  69,
+          71,  74,  75,  78, 80, 83,  85, 87, 91, 92,  95,  97,  100, 102, 102,
+          105, 72,  68,  68, 65, 65,  64, 62, 62, 64,  65,  68,  69,  72,  73,
+          76,  78,  80,  83, 84, 88,  89, 93, 95, 97,  100, 102, 104, 104, 107,
+          73,  69,  69,  66, 66, 65,  64, 63, 64, 66,  68,  69,  72,  73,  77,
+          77,  81,  82,  86, 87, 90,  92, 95, 97, 99,  103, 104, 106, 106, 109,
+          74,  70,  70,  67, 67, 66,  65, 63, 64, 67,  68,  70,  72,  74,  76,
+          78,  80,  82,  85, 87, 90,  91, 95, 96, 100, 101, 105, 106, 108, 108,
+          111, 75,  71,  71, 68, 68,  66, 66, 64, 64,  68,  68,  71,  71,  75,
+          75,  79,  79,  83, 84, 88,  89, 93, 93, 98,  98,  102, 103, 108, 108,
+          110, 110, 113}},
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          33,  32,  32,  32,  32,  33,  34,  32,  32,  32,  32,  34,  34,  35,
+          34,  34,  33,  33,  35,  36,  37,  39,  34,  34,  34,  34,  36,  36,
+          37,  41,  42,  36,  35,  34,  34,  36,  37,  38,  42,  45,  48,  39,
+          38,  38,  37,  39,  40,  40,  45,  47,  50,  54,  41,  39,  39,  38,
+          40,  40,  41,  46,  48,  51,  55,  56,  44,  42,  41,  41,  42,  42,
+          42,  47,  50,  54,  58,  59,  63,  48,  46,  45,  44,  45,  45,  45,
+          50,  53,  56,  61,  62,  66,  70,  49,  47,  46,  45,  46,  46,  46,
+          51,  53,  57,  62,  63,  68,  71,  73,  54,  51,  50,  49,  50,  49,
+          49,  54,  56,  60,  65,  67,  71,  76,  77,  82,  58,  55,  54,  53,
+          53,  53,  52,  57,  59,  63,  68,  70,  74,  79,  81,  86,  90,  59,
+          57,  55,  54,  54,  54,  54,  59,  61,  64,  69,  71,  75,  80,  82,
+          87,  91,  93,  65,  62,  60,  59,  59,  58,  58,  63,  65,  68,  73,
+          75,  79,  85,  87,  92,  97,  99,  105, 69,  66,  64,  63,  63,  62,
+          61,  66,  68,  71,  76,  78,  83,  88,  90,  96,  100, 102, 109, 113,
+          71,  68,  66,  65,  64,  63,  63,  68,  70,  73,  78,  80,  84,  90,
+          92,  97,  102, 104, 111, 115, 117, 80,  76,  73,  72,  71,  70,  69,
+          74,  76,  79,  84,  86,  90,  96,  98,  104, 109, 111, 118, 123, 125,
+          134, 81,  77,  75,  74,  73,  72,  71,  75,  77,  80,  85,  87,  91,
+          97,  99,  105, 110, 112, 120, 125, 127, 136, 137, 83,  78,  76,  75,
+          74,  73,  72,  76,  78,  81,  86,  88,  92,  98,  100, 106, 111, 113,
+          121, 126, 128, 137, 139, 140, 87,  83,  81,  79,  78,  77,  75,  80,
+          82,  85,  90,  91,  96,  101, 103, 110, 114, 117, 125, 129, 133, 142,
+          143, 145, 150, 90,  85,  83,  81,  80,  79,  78,  81,  83,  87,  89,
+          93,  98,  100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151,
+          156, 93,  88,  86,  84,  83,  82,  80,  82,  85,  89,  90,  96,  98,
+          102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163,
+          95,  90,  89,  86,  85,  85,  83,  83,  88,  89,  93,  97,  99,  105,
+          106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169,
+          98,  93,  92,  89,  88,  87,  86,  85,  89,  90,  96,  97,  102, 105,
+          109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170,
+          176, 101, 96,  95,  91,  91,  90,  89,  87,  90,  93,  97,  99,  104,
+          105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172,
+          176, 177, 184, 104, 99,  98,  94,  94,  92,  92,  90,  92,  96,  98,
+          102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163,
+          166, 177, 179, 184, 185, 191, 107, 101, 101, 97,  97,  95,  95,  93,
+          93,  99,  99,  105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149,
+          149, 161, 161, 172, 172, 185, 186, 191, 192, 199},
+         {32,  31,  31, 30, 31, 31, 30, 31, 31, 32, 33, 34,  35,  35,  39,
+          35,  36,  37, 37, 41, 43, 36, 38, 39, 40, 43, 45,  47,  41,  42,
+          42,  42,  45, 46, 47, 48, 44, 44, 44, 44, 46, 46,  47,  49,  50,
+          49,  47,  47, 46, 47, 47, 48, 50, 51, 53, 48, 47,  46,  45,  46,
+          46,  46,  49, 51, 53, 54, 48, 47, 46, 45, 46, 46,  46,  49,  51,
+          53,  54,  55, 49, 47, 46, 45, 45, 45, 45, 49, 51,  53,  55,  56,
+          58,  50,  48, 47, 46, 46, 46, 46, 50, 51, 54, 56,  57,  59,  61,
+          51,  48,  47, 46, 47, 46, 46, 50, 51, 54, 56, 57,  60,  62,  62,
+          52,  50,  48, 47, 47, 47, 47, 50, 52, 54, 57, 58,  61,  63,  64,
+          66,  54,  51, 50, 49, 49, 48, 48, 51, 53, 55, 58,  59,  62,  64,
+          65,  68,  70, 55, 52, 51, 50, 49, 49, 48, 52, 53,  55,  59,  60,
+          62,  65,  66, 68, 70, 71, 57, 54, 53, 52, 51, 50,  50,  53,  54,
+          56,  60,  61, 63, 66, 67, 70, 73, 73, 76, 59, 56,  54,  53,  53,
+          52,  51,  54, 56, 58, 61, 62, 65, 68, 69, 72, 74,  75,  78,  80,
+          60,  57,  55, 54, 53, 53, 52, 55, 56, 58, 61, 63,  65,  68,  69,
+          72,  75,  76, 79, 81, 82, 63, 60, 58, 57, 56, 55,  54,  57,  59,
+          60,  63,  65, 67, 70, 71, 75, 77, 78, 82, 84, 85,  89,  64,  61,
+          59,  58,  57, 56, 55, 58, 59, 61, 64, 65, 68, 71,  72,  75,  78,
+          79,  82,  85, 86, 89, 90, 65, 61, 60, 58, 57, 56,  55,  58,  59,
+          61,  64,  65, 68, 71, 72, 75, 78, 79, 83, 85, 86,  90,  91,  91,
+          67,  63,  61, 60, 59, 58, 57, 60, 61, 63, 65, 66,  69,  72,  73,
+          77,  79,  80, 84, 86, 88, 92, 93, 93, 95, 68, 64,  63,  61,  60,
+          59,  58,  60, 61, 63, 65, 67, 70, 71, 74, 76, 78,  81,  83,  86,
+          88,  89,  94, 94, 95, 97, 68, 65, 64, 62, 61, 60,  58,  59,  61,
+          64,  64,  68, 69, 71, 74, 75, 79, 80, 83, 86, 87,  91,  92,  95,
+          96,  97,  99, 69, 66, 65, 63, 62, 61, 59, 59, 62,  63,  65,  67,
+          69,  72,  72, 76, 78, 80, 83, 84, 88, 89, 92, 94,  97,  98,  99,
+          101, 70,  67, 66, 63, 63, 62, 61, 60, 63, 63, 66,  67,  69,  71,
+          73,  76,  77, 81, 82, 85, 86, 90, 91, 94, 96, 99,  100, 100, 103,
+          71,  67,  67, 64, 64, 63, 62, 61, 62, 64, 66, 67,  70,  71,  74,
+          74,  78,  79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104,
+          72,  68,  68, 65, 65, 64, 63, 61, 62, 65, 66, 68,  69,  71,  73,
+          75,  77,  79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104,
+          106, 73,  69, 69, 66, 66, 64, 64, 62, 62, 66, 66,  69,  69,  72,
+          73,  76,  77, 81, 81, 85, 85, 89, 90, 94, 94, 99,  99,  104, 104,
+          106, 106, 108}},
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          33,  31,  32,  32,  32,  33,  33,  32,  32,  32,  32,  33,  34,  35,
+          32,  33,  33,  33,  34,  34,  36,  36,  34,  34,  34,  33,  35,  35,
+          37,  38,  39,  35,  35,  34,  34,  36,  36,  38,  39,  42,  46,  36,
+          35,  35,  34,  36,  36,  38,  40,  42,  47,  48,  39,  38,  38,  37,
+          39,  39,  40,  42,  45,  49,  50,  54,  41,  40,  39,  38,  40,  40,
+          41,  43,  46,  50,  52,  55,  57,  44,  42,  42,  41,  42,  42,  42,
+          44,  47,  52,  54,  58,  60,  63,  47,  45,  45,  44,  44,  45,  45,
+          47,  50,  55,  56,  60,  62,  66,  69,  48,  46,  45,  44,  45,  45,
+          46,  47,  51,  55,  57,  61,  63,  67,  70,  71,  54,  51,  50,  49,
+          49,  50,  49,  51,  54,  59,  60,  65,  67,  71,  75,  76,  82,  56,
+          53,  52,  51,  51,  51,  51,  53,  56,  60,  61,  66,  69,  73,  77,
+          78,  84,  86,  59,  56,  55,  54,  54,  54,  53,  55,  58,  62,  64,
+          69,  71,  75,  79,  80,  87,  89,  92,  64,  61,  60,  58,  58,  58,
+          57,  59,  62,  66,  67,  72,  75,  79,  83,  84,  91,  93,  97,  102,
+          65,  62,  61,  59,  59,  59,  58,  60,  63,  67,  68,  73,  75,  79,
+          84,  85,  92,  94,  98,  103, 105, 71,  68,  67,  65,  64,  64,  63,
+          65,  68,  72,  73,  78,  80,  84,  89,  90,  97,  100, 103, 109, 111,
+          117, 74,  71,  69,  68,  67,  67,  65,  67,  70,  74,  75,  80,  83,
+          86,  91,  93,  100, 102, 106, 112, 114, 120, 123, 80,  76,  74,  72,
+          71,  71,  69,  71,  74,  78,  79,  84,  86,  90,  95,  96,  104, 106,
+          110, 116, 118, 125, 128, 134, 82,  78,  76,  74,  73,  73,  71,  73,
+          76,  79,  80,  86,  88,  92,  97,  98,  106, 108, 112, 118, 120, 127,
+          131, 136, 139, 83,  78,  77,  75,  74,  74,  72,  73,  76,  80,  81,
+          86,  89,  92,  97,  99,  106, 109, 113, 119, 121, 128, 131, 137, 139,
+          140, 87,  83,  81,  79,  78,  78,  75,  77,  80,  83,  85,  90,  92,
+          96,  100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150,
+          90,  85,  84,  81,  80,  80,  78,  78,  82,  84,  87,  91,  93,  98,
+          99,  106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156,
+          92,  88,  87,  84,  83,  82,  80,  80,  84,  85,  90,  91,  95,  98,
+          102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156,
+          162, 95,  90,  89,  86,  85,  84,  83,  82,  85,  87,  91,  92,  97,
+          98,  105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158,
+          161, 162, 168, 97,  92,  92,  88,  88,  86,  86,  84,  85,  90,  91,
+          95,  97,  101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150,
+          152, 162, 164, 168, 168, 174, 100, 95,  95,  90,  90,  89,  89,  86,
+          86,  92,  92,  97,  98,  104, 104, 111, 111, 119, 119, 128, 129, 137,
+          137, 147, 148, 157, 158, 169, 170, 174, 175, 181},
+         {32,  31,  31, 31, 31, 31, 30, 31, 31, 32, 33, 34, 34, 34,  37,
+          33,  34,  35, 35, 38, 39, 36, 38, 39, 40, 42, 43, 47, 38,  40,
+          40,  41,  43, 44, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48,  48,
+          47,  46,  46, 45, 46, 47, 47, 48, 50, 52, 49, 47, 47, 46,  47,
+          47,  48,  49, 50, 52, 53, 48, 47, 46, 45, 46, 46, 46, 48,  49,
+          52,  53,  54, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53,  55,
+          55,  49,  47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57,  58,
+          50,  48,  47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59,  61,
+          50,  48,  47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60,  61,
+          61,  52,  50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59,  61,
+          63,  63,  66, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55,  58,
+          59,  62,  64, 64, 67, 68, 54, 52, 51, 49, 49, 49, 48, 49,  52,
+          55,  55,  58, 60, 62, 64, 65, 68, 69, 71, 56, 54, 53, 51,  51,
+          51,  49,  51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73,  75,
+          57,  54,  53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63,  66,
+          67,  70,  71, 73, 76, 76, 60, 57, 56, 54, 53, 53, 52, 53,  55,
+          58,  58,  61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 61,  58,
+          57,  55,  55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69,  73,
+          74,  76,  79, 80, 83, 84, 63, 60, 59, 57, 56, 56, 54, 55,  57,
+          60,  60,  63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86,  89,
+          64,  61,  60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68,  70,
+          71,  75,  77, 79, 82, 82, 86, 87, 90, 91, 65, 61, 60, 58,  57,
+          57,  55,  56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79,  82,
+          83,  86,  88, 90, 91, 91, 67, 63, 62, 60, 59, 59, 57, 58,  60,
+          62,  63,  66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89,  92,
+          93,  93,  95, 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63,  65,
+          67,  70,  70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94,  95,
+          97,  68,  65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67,  69,
+          71,  73,  75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97,  99,
+          69,  65,  65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68,  72,
+          72,  76,  76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98,  100,
+          70,  66,  66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69,  71,
+          73,  75,  77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100,
+          102, 71,  67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67,  70,
+          70,  74,  74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100,
+          101, 101, 104}},
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          32,  31,  32,  32,  32,  33,  33,  32,  32,  32,  32,  33,  33,  34,
+          32,  32,  32,  32,  33,  34,  35,  35,  33,  33,  33,  33,  34,  35,
+          36,  36,  38,  34,  34,  34,  33,  34,  35,  36,  37,  39,  39,  36,
+          35,  35,  34,  35,  36,  37,  38,  42,  42,  48,  36,  35,  35,  34,
+          35,  36,  38,  38,  42,  43,  48,  49,  39,  38,  38,  37,  38,  39,
+          40,  40,  44,  45,  50,  51,  54,  41,  39,  39,  38,  39,  40,  40,
+          41,  45,  46,  51,  52,  55,  56,  44,  42,  42,  41,  41,  42,  42,
+          42,  46,  47,  54,  54,  58,  59,  63,  46,  44,  44,  42,  43,  44,
+          44,  44,  48,  49,  55,  55,  59,  61,  65,  67,  48,  46,  46,  44,
+          45,  45,  45,  46,  50,  51,  57,  57,  61,  63,  67,  69,  71,  52,
+          50,  49,  48,  48,  48,  48,  48,  52,  53,  59,  59,  64,  65,  70,
+          72,  74,  78,  54,  51,  51,  49,  49,  50,  49,  49,  53,  54,  60,
+          60,  65,  67,  71,  74,  76,  80,  82,  58,  56,  55,  53,  53,  53,
+          53,  53,  57,  58,  63,  64,  68,  70,  75,  77,  80,  84,  86,  91,
+          59,  56,  56,  54,  54,  54,  53,  53,  57,  58,  64,  64,  69,  70,
+          75,  78,  80,  85,  87,  91,  92,  65,  62,  61,  59,  59,  59,  58,
+          58,  62,  63,  68,  68,  73,  75,  79,  82,  85,  90,  92,  97,  98,
+          105, 66,  63,  63,  60,  60,  60,  59,  59,  63,  64,  69,  69,  74,
+          76,  80,  83,  86,  91,  93,  98,  99,  106, 107, 71,  68,  67,  65,
+          65,  64,  63,  63,  67,  68,  73,  73,  78,  80,  84,  87,  90,  95,
+          97,  103, 103, 111, 112, 117, 74,  71,  70,  68,  67,  67,  66,  65,
+          69,  70,  75,  75,  80,  82,  86,  89,  93,  97,  100, 105, 106, 114,
+          115, 120, 123, 80,  76,  75,  72,  72,  71,  70,  69,  73,  74,  79,
+          79,  84,  86,  90,  93,  96,  101, 104, 110, 110, 118, 119, 125, 128,
+          134, 81,  77,  77,  74,  73,  73,  71,  71,  74,  75,  80,  80,  85,
+          87,  91,  94,  98,  103, 105, 111, 112, 120, 121, 127, 130, 136, 137,
+          83,  78,  78,  75,  74,  74,  72,  72,  75,  76,  81,  81,  86,  88,
+          92,  95,  99,  104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140,
+          86,  82,  81,  78,  77,  77,  75,  74,  78,  79,  84,  84,  89,  91,
+          95,  98,  101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144,
+          147, 89,  84,  84,  80,  80,  79,  78,  77,  79,  81,  85,  86,  91,
+          92,  97,  98,  104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145,
+          148, 149, 153, 91,  86,  86,  82,  82,  81,  80,  79,  80,  84,  85,
+          88,  91,  94,  97,  100, 104, 107, 112, 115, 120, 123, 129, 132, 138,
+          140, 148, 150, 153, 154, 159, 93,  88,  88,  84,  84,  83,  83,  80,
+          81,  86,  86,  91,  91,  96,  97,  103, 103, 110, 110, 118, 119, 126,
+          126, 135, 136, 144, 144, 155, 155, 159, 159, 164},
+         {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 31, 32, 32, 33, 34, 33, 34,
+          35, 35, 37, 39, 35, 37, 37, 38, 39, 41, 44, 36, 38, 39, 40, 41, 43,
+          46, 47, 40, 41, 41, 42, 43, 44, 46, 47, 48, 41, 42, 42, 42, 43, 45,
+          46, 47, 48, 48, 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53, 49, 47,
+          47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 48, 47, 47, 45, 46, 46, 46,
+          46, 49, 49, 53, 53, 54, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53,
+          53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56,
+          58, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60,
+          50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61,
+          52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63,
+          65, 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62,
+          63, 65, 66, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59,
+          62, 63, 65, 67, 68, 70, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55,
+          56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 57, 54, 54, 52, 51, 51, 50,
+          50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 57, 55,
+          54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71,
+          73, 74, 77, 77, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61,
+          63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 61, 58, 57, 55, 55, 54,
+          53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80,
+          83, 84, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67,
+          69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 64, 61, 60, 58, 57, 57,
+          56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83,
+          86, 87, 89, 90, 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64,
+          65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 66, 63,
+          62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76,
+          79, 80, 84, 84, 87, 89, 91, 92, 93, 94, 67, 64, 63, 61, 60, 59, 58,
+          57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87,
+          89, 92, 93, 94, 94, 96, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62,
+          64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94,
+          96, 96, 98, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68,
+          68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97,
+          99}},
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          32,  31,  32,  32,  32,  32,  33,  31,  32,  32,  32,  32,  33,  33,
+          32,  32,  32,  32,  32,  34,  34,  35,  32,  32,  32,  32,  32,  34,
+          34,  35,  35,  34,  34,  34,  33,  33,  35,  35,  37,  37,  39,  34,
+          34,  34,  33,  33,  35,  35,  37,  37,  39,  39,  36,  35,  35,  34,
+          34,  36,  36,  38,  38,  42,  42,  48,  36,  35,  35,  34,  34,  36,
+          36,  38,  38,  42,  42,  48,  48,  39,  38,  38,  37,  37,  39,  39,
+          40,  40,  45,  45,  50,  50,  54,  39,  38,  38,  37,  37,  39,  39,
+          40,  40,  45,  45,  50,  50,  54,  54,  44,  42,  42,  41,  41,  42,
+          42,  42,  42,  47,  47,  54,  54,  58,  58,  63,  44,  42,  42,  41,
+          41,  42,  42,  42,  42,  47,  47,  54,  54,  58,  58,  63,  63,  48,
+          46,  46,  44,  44,  45,  45,  46,  46,  51,  51,  57,  57,  61,  61,
+          67,  67,  71,  48,  46,  46,  44,  44,  45,  45,  46,  46,  51,  51,
+          57,  57,  61,  61,  67,  67,  71,  71,  54,  51,  51,  49,  49,  50,
+          50,  49,  49,  54,  54,  60,  60,  65,  65,  71,  71,  76,  76,  82,
+          54,  51,  51,  49,  49,  50,  50,  49,  49,  54,  54,  60,  60,  65,
+          65,  71,  71,  76,  76,  82,  82,  59,  56,  56,  54,  54,  54,  54,
+          53,  53,  58,  58,  64,  64,  69,  69,  75,  75,  80,  80,  87,  87,
+          92,  59,  56,  56,  54,  54,  54,  54,  53,  53,  58,  58,  64,  64,
+          69,  69,  75,  75,  80,  80,  87,  87,  92,  92,  65,  62,  62,  59,
+          59,  59,  59,  58,  58,  63,  63,  68,  68,  73,  73,  79,  79,  85,
+          85,  92,  92,  98,  98,  105, 65,  62,  62,  59,  59,  59,  59,  58,
+          58,  63,  63,  68,  68,  73,  73,  79,  79,  85,  85,  92,  92,  98,
+          98,  105, 105, 71,  68,  68,  65,  65,  64,  64,  63,  63,  68,  68,
+          73,  73,  78,  78,  84,  84,  90,  90,  97,  97,  103, 103, 111, 111,
+          117, 71,  68,  68,  65,  65,  64,  64,  63,  63,  68,  68,  73,  73,
+          78,  78,  84,  84,  90,  90,  97,  97,  103, 103, 111, 111, 117, 117,
+          80,  76,  76,  72,  72,  71,  71,  69,  69,  74,  74,  79,  79,  84,
+          84,  90,  90,  96,  96,  104, 104, 110, 110, 118, 118, 125, 125, 134,
+          80,  76,  76,  72,  72,  71,  71,  69,  69,  74,  74,  79,  79,  84,
+          84,  90,  90,  96,  96,  104, 104, 110, 110, 118, 118, 125, 125, 134,
+          134, 83,  78,  78,  75,  75,  74,  74,  72,  72,  76,  76,  81,  81,
+          86,  86,  92,  92,  99,  99,  106, 106, 113, 113, 121, 121, 128, 128,
+          137, 137, 140, 83,  78,  78,  75,  75,  74,  74,  72,  72,  76,  76,
+          81,  81,  86,  86,  92,  92,  99,  99,  106, 106, 113, 113, 121, 121,
+          128, 128, 137, 137, 140, 140, 87,  83,  83,  79,  79,  77,  77,  75,
+          75,  80,  80,  84,  84,  90,  90,  96,  96,  102, 102, 109, 109, 116,
+          116, 124, 124, 132, 132, 141, 141, 144, 144, 149},
+         {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 30, 31, 31, 32, 32, 33, 34,
+          34, 35, 35, 39, 33, 34, 34, 35, 35, 39, 39, 36, 38, 38, 40, 40, 43,
+          43, 47, 36, 38, 38, 40, 40, 43, 43, 47, 47, 41, 42, 42, 42, 42, 45,
+          45, 47, 47, 48, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 49, 47,
+          47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 49, 47, 47, 46, 46, 47, 47,
+          48, 48, 50, 50, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49,
+          53, 53, 54, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
+          54, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58,
+          49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58,
+          50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60,
+          61, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60,
+          60, 61, 61, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57,
+          57, 61, 61, 63, 63, 66, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50,
+          54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 54, 52, 52, 49, 49, 49, 49,
+          48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 54, 52,
+          52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65,
+          68, 68, 71, 71, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56,
+          60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 57, 54, 54, 52, 52, 51,
+          51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73,
+          76, 76, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61,
+          65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 60, 57, 57, 54, 54, 53,
+          53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75,
+          79, 79, 82, 82, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60,
+          63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 63, 60,
+          60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71,
+          75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 65, 61, 61, 58, 58, 57, 57,
+          55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83,
+          83, 86, 86, 90, 90, 91, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58,
+          61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90,
+          90, 91, 91, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66,
+          66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93,
+          95}},
+        {{32,  31,  31,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          32,  31,  32,  32,  32,  32,  32,  31,  32,  32,  32,  32,  33,  33,
+          32,  32,  32,  32,  32,  33,  33,  34,  32,  32,  32,  32,  32,  33,
+          34,  34,  35,  32,  32,  32,  32,  33,  33,  34,  34,  35,  35,  34,
+          34,  34,  33,  33,  34,  35,  35,  37,  37,  39,  34,  34,  34,  33,
+          33,  34,  35,  35,  37,  37,  39,  39,  35,  35,  35,  34,  34,  35,
+          36,  36,  38,  38,  42,  42,  46,  36,  35,  35,  34,  34,  35,  36,
+          37,  38,  38,  42,  42,  47,  48,  38,  37,  37,  36,  36,  37,  38,
+          38,  39,  40,  44,  44,  48,  50,  51,  39,  38,  38,  38,  37,  38,
+          39,  39,  40,  41,  45,  45,  49,  50,  52,  54,  41,  40,  40,  39,
+          38,  39,  40,  40,  41,  41,  46,  46,  50,  52,  54,  55,  57,  44,
+          42,  42,  41,  41,  41,  42,  42,  42,  43,  47,  47,  52,  54,  56,
+          58,  60,  63,  45,  43,  43,  42,  41,  42,  42,  43,  43,  43,  48,
+          48,  53,  54,  57,  58,  60,  64,  65,  48,  46,  46,  45,  44,  45,
+          45,  45,  46,  46,  51,  51,  55,  57,  59,  61,  63,  67,  68,  71,
+          48,  46,  46,  45,  44,  45,  45,  45,  46,  46,  51,  51,  55,  57,
+          59,  61,  63,  67,  68,  71,  71,  53,  51,  51,  49,  49,  49,  49,
+          49,  49,  49,  54,  54,  58,  59,  62,  64,  67,  71,  72,  75,  75,
+          81,  54,  52,  51,  50,  49,  49,  50,  49,  49,  50,  54,  54,  59,
+          60,  63,  65,  67,  71,  72,  76,  76,  81,  82,  57,  55,  55,  53,
+          52,  52,  52,  52,  52,  52,  57,  57,  61,  62,  65,  67,  70,  74,
+          75,  79,  79,  85,  85,  89,  59,  56,  56,  54,  54,  54,  54,  54,
+          53,  54,  58,  58,  62,  64,  67,  69,  71,  75,  76,  80,  80,  86,
+          87,  90,  92,  62,  59,  59,  57,  56,  56,  56,  56,  55,  56,  60,
+          60,  64,  66,  69,  71,  73,  77,  78,  83,  83,  89,  89,  93,  95,
+          98,  65,  62,  62,  60,  59,  59,  59,  59,  58,  58,  63,  63,  67,
+          68,  71,  73,  75,  79,  81,  85,  85,  91,  92,  96,  98,  101, 105,
+          67,  64,  64,  62,  61,  61,  60,  60,  59,  60,  64,  64,  68,  69,
+          72,  74,  77,  81,  82,  87,  87,  93,  94,  98,  99,  103, 106, 108,
+          71,  68,  68,  66,  65,  64,  64,  64,  63,  63,  68,  68,  72,  73,
+          76,  78,  80,  84,  85,  90,  90,  97,  97,  102, 103, 107, 111, 113,
+          117, 72,  69,  69,  66,  65,  65,  65,  64,  63,  64,  68,  68,  72,
+          73,  76,  78,  81,  85,  86,  91,  91,  97,  98,  102, 104, 108, 111,
+          113, 118, 119, 80,  76,  76,  73,  72,  72,  71,  70,  69,  70,  74,
+          74,  78,  79,  82,  84,  86,  90,  91,  96,  96,  103, 104, 108, 110,
+          114, 118, 120, 125, 126, 134, 80,  76,  76,  73,  72,  72,  71,  70,
+          69,  70,  74,  74,  78,  79,  82,  84,  86,  90,  91,  96,  96,  103,
+          104, 108, 110, 114, 118, 120, 125, 126, 134, 134},
+         {32, 31, 31, 31, 31, 31, 30, 31, 31, 31, 30, 31, 31, 31, 32, 32, 32,
+          33, 33, 33, 35, 33, 34, 34, 35, 35, 37, 39, 34, 35, 35, 36, 36, 38,
+          40, 41, 36, 38, 38, 39, 40, 41, 43, 44, 47, 37, 38, 39, 40, 40, 42,
+          43, 44, 47, 47, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 41, 42,
+          42, 42, 42, 43, 45, 45, 47, 47, 48, 48, 47, 46, 46, 46, 45, 46, 47,
+          47, 47, 48, 50, 50, 52, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50,
+          50, 52, 53, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53,
+          53, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54,
+          49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55,
+          49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57,
+          58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56,
+          57, 59, 59, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54,
+          55, 56, 58, 60, 60, 61, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50,
+          50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 52, 50, 49, 48, 47, 47, 47,
+          47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 52, 50,
+          50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61,
+          63, 63, 66, 66, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54,
+          55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 54, 52, 52, 50, 49, 49,
+          49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68,
+          70, 71, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58,
+          59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 57, 54, 54, 52, 52, 51,
+          51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70,
+          72, 73, 75, 76, 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56,
+          57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 60, 57,
+          57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66,
+          68, 68, 72, 72, 74, 75, 77, 79, 80, 82, 60, 57, 57, 55, 54, 54, 54,
+          53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75,
+          76, 77, 79, 80, 82, 82, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57,
+          57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83,
+          85, 85, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60,
+          62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89,
+          89}},
+        {{32,  31,  31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32,  32,  32,
+          31,  32,  32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33,  31,  32,
+          32,  32,  32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33,  33,  34,
+          32,  32,  32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32,  32,  32,
+          32,  33,  34, 34, 35, 35, 33, 33, 33, 33, 33, 33, 34,  35,  35,
+          36,  36,  38, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37,  37,  39,
+          39,  34,  34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40,  41,  42,
+          36,  35,  35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42,  45,  48,
+          36,  35,  35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42,  45,  48,
+          48,  38,  38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43,  44,  46,
+          50,  50,  52, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40,  40,  44,
+          45,  47,  50, 50, 53, 54, 41, 40, 40, 39, 38, 38, 40,  40,  40,
+          41,  41,  45, 46, 48, 52, 52, 54, 55, 57, 44, 42, 42,  42,  41,
+          41,  42,  42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58,  60,  63,
+          44,  42,  42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47,  50,  54,
+          54,  57,  58, 60, 63, 63, 47, 46, 45, 45, 44, 44, 44,  45,  45,
+          45,  45,  49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69,  48,  47,
+          46,  45,  44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57,  57,  60,
+          61,  63,  67, 67, 70, 71, 50, 49, 48, 47, 46, 46, 47,  47,  47,
+          47,  47,  51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72,  73,  75,
+          54,  52,  51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54,  56,  60,
+          60,  64,  65, 67, 71, 71, 75, 76, 78, 82, 54, 52, 51,  50,  49,
+          49,  49,  50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65,  67,  71,
+          71,  75,  76, 78, 82, 82, 58, 56, 55, 54, 53, 53, 53,  53,  53,
+          52,  52,  56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78,  79,  82,
+          86,  86,  90, 59, 57, 56, 55, 54, 54, 54, 54, 54, 53,  53,  57,
+          58,  60,  64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87,  87,  91,
+          92,  61,  59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59,  60,  62,
+          65,  65,  69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93,  94,  97,
+          65,  63,  62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63,  65,  68,
+          68,  72,  73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98,  101, 105,
+          65,  63,  62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63,  65,  68,
+          68,  72,  73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98,  101, 105,
+          105, 70,  67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66,  67,  69,
+          72,  72,  76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105,
+          109, 109, 114},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 32, 30, 31,
+          31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 37, 33, 34, 34, 35, 35, 35,
+          38, 39, 34, 36, 36, 36, 37, 37, 40, 40, 42, 36, 38, 38, 39, 40, 40,
+          42, 43, 45, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 40, 41,
+          41, 41, 42, 42, 44, 44, 45, 47, 47, 48, 41, 42, 42, 42, 42, 42, 44,
+          45, 46, 47, 47, 48, 48, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47,
+          49, 49, 50, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51,
+          53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53,
+          48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54,
+          48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54,
+          54, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53,
+          54, 55, 55, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51,
+          53, 53, 55, 55, 57, 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45,
+          48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 50, 48, 48, 47, 46, 46, 46,
+          46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 50, 49,
+          48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 58,
+          60, 60, 61, 61, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50,
+          51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 52, 50, 50, 49, 47, 47,
+          47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63,
+          65, 66, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54,
+          54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 54, 52, 51, 50, 49, 49,
+          49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65,
+          66, 68, 68, 70, 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52,
+          53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 55, 53,
+          53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60,
+          63, 63, 65, 66, 67, 69, 69, 71, 72, 73, 57, 55, 54, 53, 52, 52, 51,
+          51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68,
+          70, 70, 73, 73, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50,
+          52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73,
+          74, 76, 76, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56,
+          58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78,
+          80}},
+        {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 32, 32, 32, 32, 31, 32,
+          32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+          32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+          33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 32, 32,
+          32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 32, 33, 33, 33, 33, 33, 33,
+          34, 34, 35, 36, 36, 36, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37,
+          37, 38, 39, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39,
+          39, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43,
+          36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+          36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+          48, 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47,
+          50, 50, 51, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45,
+          45, 47, 50, 50, 52, 54, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40,
+          40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 42, 41, 41, 41, 40, 40, 40,
+          41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 44, 43,
+          42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56,
+          58, 58, 61, 63, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45,
+          48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 47, 46, 45, 45, 44, 44,
+          44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66,
+          66, 69, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51,
+          53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 49, 48, 47, 47, 46, 45,
+          45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68,
+          68, 71, 72, 73, 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51,
+          54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 54, 52,
+          51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63,
+          65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 55, 53, 53, 52, 51, 50, 50,
+          51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73,
+          76, 77, 78, 83, 83, 85, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53,
+          53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86,
+          87, 88, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58,
+          58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92,
+          92},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 30, 31,
+          31, 31, 31, 32, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35,
+          35, 38, 33, 34, 34, 34, 35, 35, 36, 38, 39, 34, 35, 35, 36, 36, 36,
+          37, 40, 40, 41, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 36, 38,
+          38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 38, 39, 40, 40, 41, 41, 41,
+          43, 44, 45, 47, 47, 47, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47,
+          47, 48, 48, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48,
+          48, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50,
+          49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+          49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+          53, 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51,
+          53, 53, 53, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49,
+          49, 51, 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46,
+          46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 49, 47, 47, 47, 45, 45, 45,
+          45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 49, 47,
+          47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55,
+          55, 55, 57, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+          49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 50, 49, 48, 48, 46, 46,
+          46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59,
+          59, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50,
+          52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 51, 49, 48, 48, 47, 46,
+          46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60,
+          60, 61, 62, 62, 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48,
+          50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 52, 50,
+          50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56,
+          57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 53, 51, 50, 50, 48, 48, 48,
+          48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62,
+          63, 64, 64, 67, 67, 68, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48,
+          48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68,
+          68, 69, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52,
+          52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71,
+          71}},
+        {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+          32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+          32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32,
+          32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+          35, 35, 35, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
+          36, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39,
+          34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39,
+          34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40,
+          41, 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42,
+          42, 43, 46, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38,
+          40, 42, 42, 44, 47, 48, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36,
+          37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 38, 37, 37, 37, 36, 36, 36,
+          36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 39, 39,
+          38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49,
+          50, 50, 52, 54, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40,
+          40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 41, 40, 40, 40, 39, 38,
+          38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55,
+          55, 57, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44,
+          47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 44, 43, 42, 42, 42, 41,
+          41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58,
+          58, 60, 63, 63, 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43,
+          43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 47, 46,
+          45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55,
+          56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 48, 47, 46, 46, 45, 44, 44,
+          45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61,
+          63, 67, 67, 68, 70, 71, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45,
+          45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68,
+          70, 71, 71, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48,
+          50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74,
+          77},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+          31, 31, 31, 32, 30, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32,
+          32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 37, 33, 34, 34, 34, 35, 35,
+          35, 36, 38, 39, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 35, 36,
+          37, 37, 37, 38, 38, 38, 41, 41, 41, 44, 36, 37, 38, 38, 39, 40, 40,
+          40, 42, 43, 43, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+          46, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
+          47, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48,
+          41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48,
+          43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49,
+          49, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50,
+          50, 50, 52, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48,
+          49, 50, 50, 51, 52, 53, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47,
+          47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46,
+          46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 48, 47,
+          47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52,
+          53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46,
+          46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 49, 47, 47, 47, 46, 45,
+          45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55,
+          55, 55, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+          49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 49, 47, 47, 47, 46, 45,
+          45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55,
+          55, 57, 58, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+          45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 50, 49,
+          48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53,
+          54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 50, 49, 48, 48, 47, 46, 46,
+          46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56,
+          58, 60, 60, 60, 61, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46,
+          46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60,
+          61, 61, 61, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46,
+          48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63,
+          64}},
+        {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+          32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+          35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35,
+          33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+          37, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37,
+          37, 38, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36,
+          37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35,
+          35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 35, 34, 34, 34, 34, 34, 34,
+          34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 36, 35,
+          35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42,
+          42, 42, 45, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36,
+          37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 36, 35, 35, 35, 35, 35,
+          34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48,
+          48, 48, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39,
+          39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 39, 39, 38, 38, 38, 38,
+          37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50,
+          50, 50, 52, 54, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39,
+          40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 39, 39,
+          38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45,
+          45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 41, 41, 40, 40, 40, 39, 39,
+          39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52,
+          52, 54, 56, 56, 56, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42,
+          42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58,
+          58, 60, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42,
+          42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63,
+          63},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 32, 30, 31, 31, 31, 31, 31,
+          32, 32, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+          33, 33, 33, 35, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 33, 34,
+          34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 33, 34, 34, 34, 34, 35, 35,
+          35, 35, 37, 39, 39, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41,
+          41, 41, 43, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45,
+          47, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
+          36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47,
+          39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47,
+          47, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47,
+          47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46,
+          47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45,
+          45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44,
+          44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 49, 48,
+          47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50,
+          50, 50, 51, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47,
+          47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 49, 48, 47, 47, 47, 47,
+          46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53,
+          53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47,
+          47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46,
+          45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53,
+          53, 53, 53, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46,
+          46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 48, 48,
+          47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49,
+          49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 49, 48, 47, 47, 47, 46, 45,
+          45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53,
+          53, 54, 55, 55, 55, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45,
+          45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55,
+          55, 57, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+          45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58,
+          58}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 32, 32, 31, 31,
+          31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+          32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+          33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+          35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+          34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+          34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 33, 33, 33, 33, 33,
+          33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 33, 33,
+          33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36,
+          36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35,
+          35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34,
+          34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39,
+          39, 39, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35,
+          35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34,
+          34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40,
+          41, 41, 41, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36,
+          36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 36, 35,
+          35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38,
+          38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 36, 35, 35, 35, 35, 35, 35,
+          34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42,
+          42, 42, 45, 47, 48, 48, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34,
+          35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47,
+          48, 48, 48, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37,
+          37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49,
+          50},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+          31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 32, 30, 30, 31, 31, 31, 31,
+          31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32,
+          32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 33, 33, 33, 34, 34, 34, 34,
+          34, 34, 34, 34, 36, 37, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
+          37, 38, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39,
+          39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39,
+          34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42,
+          36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44,
+          46, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+          45, 46, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43,
+          43, 43, 45, 46, 47, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40,
+          41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40,
+          41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 40, 41,
+          41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47,
+          47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44,
+          45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42,
+          42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48,
+          48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45,
+          45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 44, 44, 44, 44, 44, 44,
+          44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49,
+          49, 49, 49, 50, 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46,
+          47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 49, 48,
+          48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48,
+          48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 49, 48, 48, 47, 47, 47, 47,
+          46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50,
+          50, 50, 51, 52, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46,
+          46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52,
+          53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47,
+          47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53,
+          53}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+          31, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+          32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+          33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+          35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+          35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+          33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 32, 32,
+          33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+          34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 33, 33, 33, 33, 33, 33, 33,
+          33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+          36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33,
+          33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37,
+          38, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33,
+          34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39,
+          39},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 30, 30,
+          31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31,
+          31, 31, 31, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+          32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35,
+          33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37,
+          33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38,
+          39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37,
+          38, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+          36, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+          35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 34, 35, 35, 35, 35, 35, 35,
+          36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 35, 36,
+          36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41,
+          41, 41, 42, 44, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39,
+          39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 36, 37, 37, 38, 38, 38,
+          38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+          47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40,
+          41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 36, 37, 37, 38, 38, 38,
+          38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+          47, 47, 47, 47, 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40,
+          40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 38, 39,
+          39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44,
+          44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 40, 40, 40, 41, 41, 41, 41,
+          41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47,
+          47, 47, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+          42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47,
+          48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+          43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48,
+          48}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 32, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 32,
+          32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+          32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+          33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+          33},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+          30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+          32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+          33, 33, 34, 34, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+          33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 33, 33, 33, 33, 33, 34,
+          34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36,
+          37, 37, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+          35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 33, 33, 34, 34, 34, 34,
+          34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37,
+          37, 38, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35,
+          35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 33, 33,
+          34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35,
+          35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34,
+          34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37,
+          38, 39, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+          35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39,
+          39, 39, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36,
+          36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40,
+          40}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+          32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+          30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+          32}}};
diff --git a/src/quantizer_test.cc b/src/quantizer_test.cc
new file mode 100644 (file)
index 0000000..0c27027
--- /dev/null
@@ -0,0 +1,220 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/quantizer.h"
+
+#include <cstdint>
+
+#include "gtest/gtest.h"
+#include "src/obu_parser.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(QuantizerTest, GetQIndex) {
+  const int kBaseQIndex = 40;
+  const int kDelta = 10;
+  const int kOutOfRangeIndex = 200;
+  Segmentation segmentation = {};
+
+  EXPECT_EQ(GetQIndex(segmentation, 0, kBaseQIndex), kBaseQIndex);
+  EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex),
+            kBaseQIndex);
+
+  segmentation.enabled = true;
+  EXPECT_EQ(GetQIndex(segmentation, 0, kBaseQIndex), kBaseQIndex);
+  EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex),
+            kBaseQIndex);
+
+  segmentation.feature_enabled[1][kSegmentFeatureQuantizer] = true;
+  segmentation.feature_data[1][kSegmentFeatureQuantizer] = kDelta;
+  EXPECT_EQ(GetQIndex(segmentation, 1, kBaseQIndex), kBaseQIndex + kDelta);
+  EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex),
+            kBaseQIndex);
+
+  segmentation.enabled = false;
+  EXPECT_EQ(GetQIndex(segmentation, 1, kBaseQIndex), kBaseQIndex);
+  EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex),
+            kBaseQIndex);
+}
+
+TEST(QuantizerTest, GetDcValue) {
+  QuantizerParameters params = {};
+  params.delta_dc[kPlaneY] = 1;
+  params.delta_dc[kPlaneU] = 2;
+  params.delta_dc[kPlaneV] = 3;
+
+  // Test lookups of Dc_Qlookup[0][0], Dc_Qlookup[0][11], Dc_Qlookup[0][12],
+  // and Dc_Qlookup[0][255] in the spec, including the clipping of qindex.
+  {
+    Quantizer quantizer(8, &params);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -2), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -1), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 10), 16);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 11), 17);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 254), 1336);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 255), 1336);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -3), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -2), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 9), 16);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 10), 17);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 253), 1336);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 254), 1336);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -4), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -3), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 8), 16);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 9), 17);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 252), 1336);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 1336);
+  }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  // Test lookups of Dc_Qlookup[1][0], Dc_Qlookup[1][11], Dc_Qlookup[1][12],
+  // and Dc_Qlookup[1][255] in the spec, including the clipping of qindex.
+  {
+    Quantizer quantizer(10, &params);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -2), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -1), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 10), 34);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 11), 37);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 254), 5347);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 255), 5347);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -3), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -2), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 9), 34);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 10), 37);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 253), 5347);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 254), 5347);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -4), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -3), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 8), 34);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 9), 37);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 254), 5347);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 5347);
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+  // Test lookups of Dc_Qlookup[2][0], Dc_Qlookup[2][11], Dc_Qlookup[2][12],
+  // and Dc_Qlookup[2][255] in the spec, including the clipping of qindex.
+  {
+    Quantizer quantizer(12, &params);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -2), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -1), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 10), 103);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 11), 115);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 254), 21387);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 255), 21387);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -3), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -2), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 9), 103);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 10), 115);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 253), 21387);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 254), 21387);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -4), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -3), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 8), 103);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 9), 115);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 254), 21387);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 21387);
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+}
+
+TEST(QuantizerTest, GetAcValue) {
+  QuantizerParameters params = {};
+  params.delta_ac[kPlaneU] = 1;
+  params.delta_ac[kPlaneV] = 2;
+
+  // Test lookups of Ac_Qlookup[0][0], Ac_Qlookup[0][11], Ac_Qlookup[0][12],
+  // and Ac_Qlookup[0][255] in the spec, including the clipping of qindex.
+  {
+    Quantizer quantizer(8, &params);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, -1), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 0), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 11), 18);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 12), 19);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 255), 1828);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 256), 1828);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -2), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -1), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 10), 18);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 11), 19);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 254), 1828);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 255), 1828);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -3), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -2), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 9), 18);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 10), 19);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 253), 1828);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 1828);
+  }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  // Test lookups of Ac_Qlookup[1][0], Ac_Qlookup[1][11], Ac_Qlookup[1][12],
+  // and Ac_Qlookup[1][255] in the spec, including the clipping of qindex.
+  {
+    Quantizer quantizer(10, &params);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, -1), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 0), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 11), 37);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 12), 40);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 255), 7312);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 256), 7312);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -2), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -1), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 10), 37);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 11), 40);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 254), 7312);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 255), 7312);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -3), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -2), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 9), 37);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 10), 40);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 253), 7312);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 7312);
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+  // Test lookups of Ac_Qlookup[2][0], Ac_Qlookup[2][11], Ac_Qlookup[2][12],
+  // and Ac_Qlookup[2][255] in the spec, including the clipping of qindex.
+  {
+    Quantizer quantizer(12, &params);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, -1), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 0), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 11), 112);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 12), 126);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 255), 29247);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 256), 29247);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -2), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -1), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 10), 112);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 11), 126);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 254), 29247);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 255), 29247);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -3), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -2), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 9), 112);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 10), 126);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 253), 29247);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 29247);
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+}
+
+}  // namespace
+}  // namespace libgav1
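
For reference, the q-index derivation exercised by QuantizerTest.GetQIndex above reduces to: when segmentation is enabled and the quantizer feature is set for a valid segment id, the per-segment delta is added to the base q-index and the result is clamped to [0, 255]; otherwise the base index is returned unchanged. A minimal standalone sketch (hypothetical helper that flattens the Segmentation struct, assuming the spec's Clip3 semantics; the real implementation lives in src/quantizer.cc):

    // Sketch of the lookup exercised by QuantizerTest.GetQIndex.
    int GetQIndexSketch(bool segmentation_enabled, bool feature_enabled,
                        int feature_delta, int base_qindex) {
      if (segmentation_enabled && feature_enabled) {
        const int qindex = base_qindex + feature_delta;
        // Clip3(0, 255, qindex) in spec terms.
        return (qindex < 0) ? 0 : ((qindex > 255) ? 255 : qindex);
      }
      return base_qindex;
    }

With the test's values, base 40 and delta 10 give 50, which is already in range, so no clipping occurs; an out-of-range segment id or a disabled quantizer feature falls through to the base index, which is what the kOutOfRangeIndex checks assert.
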
diff --git a/src/reconstruction.cc b/src/reconstruction.cc
new file mode 100644 (file)
index 0000000..bf48137
--- /dev/null
@@ -0,0 +1,190 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/reconstruction.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace {
+
+// Maps TransformType to dsp::Transform1d for the row transforms.
+constexpr dsp::Transform1d kRowTransform[kNumTransformTypes] = {
+    dsp::kTransform1dDct,      dsp::kTransform1dAdst,
+    dsp::kTransform1dDct,      dsp::kTransform1dAdst,
+    dsp::kTransform1dAdst,     dsp::kTransform1dDct,
+    dsp::kTransform1dAdst,     dsp::kTransform1dAdst,
+    dsp::kTransform1dAdst,     dsp::kTransform1dIdentity,
+    dsp::kTransform1dIdentity, dsp::kTransform1dDct,
+    dsp::kTransform1dIdentity, dsp::kTransform1dAdst,
+    dsp::kTransform1dIdentity, dsp::kTransform1dAdst};
+
+// Maps TransformType to dsp::Transform1d for the column transforms.
+constexpr dsp::Transform1d kColumnTransform[kNumTransformTypes] = {
+    dsp::kTransform1dDct,  dsp::kTransform1dDct,
+    dsp::kTransform1dAdst, dsp::kTransform1dAdst,
+    dsp::kTransform1dDct,  dsp::kTransform1dAdst,
+    dsp::kTransform1dAdst, dsp::kTransform1dAdst,
+    dsp::kTransform1dAdst, dsp::kTransform1dIdentity,
+    dsp::kTransform1dDct,  dsp::kTransform1dIdentity,
+    dsp::kTransform1dAdst, dsp::kTransform1dIdentity,
+    dsp::kTransform1dAdst, dsp::kTransform1dIdentity};
+
+dsp::Transform1dSize GetTransform1dSize(int size_log2) {
+  return static_cast<dsp::Transform1dSize>(size_log2 - 2);
+}
+
+// Returns the number of rows to process based on |non_zero_coeff_count|. The
+// transform loops process either 4 or a multiple of 8 rows. Use the
+// TransformClass derived from |tx_type| to determine the scan order.
+template <int tx_width>
+int GetNumRows(TransformType tx_type, int tx_height, int non_zero_coeff_count) {
+  const TransformClass tx_class = GetTransformClass(tx_type);
+
+  switch (tx_class) {
+    case kTransformClass2D:
+      if (tx_width == 4) {
+        if (non_zero_coeff_count <= 13) return 4;
+        if (non_zero_coeff_count <= 29) return 8;
+      }
+      if (tx_width == 8) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if ((non_zero_coeff_count <= 14) & (tx_height > 8)) return 4;
+        if (non_zero_coeff_count <= 43) return 8;
+        if ((non_zero_coeff_count <= 107) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 171) & (tx_height > 16)) return 24;
+      }
+      if (tx_width == 16) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if ((non_zero_coeff_count <= 14) & (tx_height > 16)) return 4;
+        if (non_zero_coeff_count <= 36) return 8;
+        if ((non_zero_coeff_count <= 44) & (tx_height > 16)) return 8;
+        if ((non_zero_coeff_count <= 151) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 279) & (tx_height > 16)) return 24;
+      }
+      if (tx_width == 32) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if (non_zero_coeff_count <= 36) return 8;
+        if ((non_zero_coeff_count <= 136) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 300) & (tx_height > 16)) return 24;
+      }
+      break;
+
+    case kTransformClassHorizontal:
+      if (non_zero_coeff_count <= 4) return 4;
+      if (non_zero_coeff_count <= 8) return 8;
+      if ((non_zero_coeff_count <= 16) & (tx_height > 16)) return 16;
+      if ((non_zero_coeff_count <= 24) & (tx_height > 16)) return 24;
+      break;
+
+    default:
+      assert(tx_class == kTransformClassVertical);
+      if (tx_width == 4) {
+        if (non_zero_coeff_count <= 16) return 4;
+        if (non_zero_coeff_count <= 32) return 8;
+      }
+      if (tx_width == 8) {
+        if (non_zero_coeff_count <= 32) return 4;
+        if (non_zero_coeff_count <= 64) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 8x8: 63, 8x16: 127.
+        if (non_zero_coeff_count <= 128) return 16;
+        if (non_zero_coeff_count <= 192) return 24;
+      }
+      if (tx_width == 16) {
+        if (non_zero_coeff_count <= 64) return 4;
+        if (non_zero_coeff_count <= 128) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 16x8: 127, 16x16: 255.
+        if (non_zero_coeff_count <= 256) return 16;
+        if (non_zero_coeff_count <= 384) return 24;
+      }
+      if (tx_width == 32) {
+        if (non_zero_coeff_count <= 128) return 4;
+        if (non_zero_coeff_count <= 256) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 32x8: 255, 32x16: 511.
+        if (non_zero_coeff_count <= 512) return 16;
+        if (non_zero_coeff_count <= 768) return 24;
+      }
+      break;
+  }
+  return (tx_width >= 16) ? std::min(tx_height, 32) : tx_height;
+}
+
+}  // namespace
+
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                 TransformSize tx_size, bool lossless, Residual* const buffer,
+                 int start_x, int start_y, Array2DView<Pixel>* frame,
+                 int non_zero_coeff_count) {
+  static_assert(sizeof(Residual) == 2 || sizeof(Residual) == 4, "");
+  const int tx_width_log2 = kTransformWidthLog2[tx_size];
+  const int tx_height_log2 = kTransformHeightLog2[tx_size];
+
+  int tx_height = (non_zero_coeff_count == 1) ? 1 : kTransformHeight[tx_size];
+  if (tx_height > 4) {
+    static constexpr int (*kGetNumRows[])(TransformType tx_type, int tx_height,
+                                          int non_zero_coeff_count) = {
+        &GetNumRows<4>, &GetNumRows<8>, &GetNumRows<16>, &GetNumRows<32>,
+        &GetNumRows<32>};
+    tx_height = kGetNumRows[tx_width_log2 - 2](tx_type, tx_height,
+                                               non_zero_coeff_count);
+  }
+  assert(tx_height <= 32);
+
+  // Row transform.
+  const dsp::Transform1dSize row_transform_size =
+      GetTransform1dSize(tx_width_log2);
+  const dsp::Transform1d row_transform =
+      lossless ? dsp::kTransform1dWht : kRowTransform[tx_type];
+  const dsp::InverseTransformAddFunc row_transform_func =
+      dsp.inverse_transforms[row_transform][row_transform_size][dsp::kRow];
+  assert(row_transform_func != nullptr);
+
+  row_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+                     frame);
+
+  // Column transform.
+  const dsp::Transform1dSize column_transform_size =
+      GetTransform1dSize(tx_height_log2);
+  const dsp::Transform1d column_transform =
+      lossless ? dsp::kTransform1dWht : kColumnTransform[tx_type];
+  const dsp::InverseTransformAddFunc column_transform_func =
+      dsp.inverse_transforms[column_transform][column_transform_size]
+                            [dsp::kColumn];
+  assert(column_transform_func != nullptr);
+
+  column_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+                        frame);
+}
+
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                          TransformSize tx_size, bool lossless, int16_t* buffer,
+                          int start_x, int start_y, Array2DView<uint8_t>* frame,
+                          int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                          TransformSize tx_size, bool lossless, int32_t* buffer,
+                          int start_x, int start_y,
+                          Array2DView<uint16_t>* frame,
+                          int non_zero_coeff_count);
+#endif
+
+}  // namespace libgav1
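
Reconstruct() above is a two-pass driver: a 1-D row transform over a trimmed number of rows, then a 1-D column transform over the full block, with both kernels picked from the kRowTransform/kColumnTransform maps (or the WHT when the block is lossless). To make the row trimming concrete, here is a standalone sketch of the kTransformClass2D branch for tx_width == 16, restated from GetNumRows<16> above so the worked values can be checked (hypothetical function name, not a separate library API):

    #include <algorithm>
    #include <cassert>

    // Mirrors the tx_width == 16, kTransformClass2D branch of GetNumRows,
    // including the (tx_width >= 16) fallback of min(tx_height, 32).
    int NumRows16For2dClass(int tx_height, int non_zero_coeff_count) {
      const int nzc = non_zero_coeff_count;
      if (nzc <= 10) return 4;
      if (nzc <= 14 && tx_height > 16) return 4;
      if (nzc <= 36) return 8;
      if (nzc <= 44 && tx_height > 16) return 8;
      if (nzc <= 151 && tx_height > 16) return 16;
      if (nzc <= 279 && tx_height > 16) return 24;
      return std::min(tx_height, 32);
    }

    int main() {
      assert(NumRows16For2dClass(32, 40) == 8);    // sparse: trim to 8 rows
      assert(NumRows16For2dClass(32, 300) == 32);  // dense: full 32 rows
      return 0;
    }

The trimming pays off because the scan order bounds how far down the block a given number of nonzero coefficients can reach, so the row pass can skip rows that are guaranteed to be all zero.
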
diff --git a/src/reconstruction.h b/src/reconstruction.h
new file mode 100644 (file)
index 0000000..6d5b115
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RECONSTRUCTION_H_
+#define LIBGAV1_SRC_RECONSTRUCTION_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Steps 2 and 3 of section 7.12.3 (contains the implementation of section
+// 7.13.3).
+// Apply the inverse transforms and add the residual to the frame for the
+// transform block size |tx_size| starting at position |start_x| and |start_y|.
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                 TransformSize tx_size, bool lossless, Residual* buffer,
+                 int start_x, int start_y, Array2DView<Pixel>* frame,
+                 int non_zero_coeff_count);
+
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                                 TransformSize tx_size, bool lossless,
+                                 int16_t* buffer, int start_x, int start_y,
+                                 Array2DView<uint8_t>* frame,
+                                 int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                                 TransformSize tx_size, bool lossless,
+                                 int32_t* buffer, int start_x, int start_y,
+                                 Array2DView<uint16_t>* frame,
+                                 int non_zero_coeff_count);
+#endif
+
+}  // namespace libgav1
+#endif  // LIBGAV1_SRC_RECONSTRUCTION_H_
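
A note on the pattern above: reconstruction.h declares the Reconstruct template and pins its two supported instantiations with extern template, so every translation unit links against the definitions compiled once in reconstruction.cc instead of re-instantiating the template. A generic sketch of that idiom with hypothetical names (not libgav1 API):

    // widget.h
    template <typename T> void Frob(T* data, int n);
    extern template void Frob(short* data, int n);  // defined in widget.cc

    // widget.cc
    template <typename T> void Frob(T* data, int n) {
      for (int i = 0; i < n; ++i) data[i] = static_cast<T>(data[i] + 1);
    }
    template void Frob(short* data, int n);  // the one shared instantiation

This keeps object size down and, as here, lets the high-bitdepth instantiation be omitted entirely when LIBGAV1_MAX_BITDEPTH < 10.
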
diff --git a/src/reconstruction_test.cc b/src/reconstruction_test.cc
new file mode 100644 (file)
index 0000000..4d09ada
--- /dev/null
@@ -0,0 +1,293 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/reconstruction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "absl/strings/match.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+// Import the scan tables in the anonymous namespace.
+#include "src/scan_tables.inc"
+
+constexpr int kTestTransformSize = 4;
+constexpr int8_t kTestBitdepth = 8;
+
+using testing::ElementsAreArray;
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
+class ReconstructionTest : public testing::TestWithParam<int> {
+ public:
+  ReconstructionTest() = default;
+  ReconstructionTest(const ReconstructionTest&) = delete;
+  ReconstructionTest& operator=(const ReconstructionTest&) = delete;
+  ~ReconstructionTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(kTestBitdepth);
+    dsp::InverseTransformInit_C();
+    dsp_ = dsp::GetDspTable(kTestBitdepth);
+    ASSERT_NE(dsp_, nullptr);
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    if (test_info->value_param() != nullptr) {
+      const char* const test_case = test_info->test_suite_name();
+      if (absl::StartsWith(test_case, "C/")) {
+      } else if (absl::StartsWith(test_case, "SSE41/")) {
+        if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+        dsp::InverseTransformInit_SSE4_1();
+      } else if (absl::StartsWith(test_case, "NEON/")) {
+        dsp::InverseTransformInit_NEON();
+      } else {
+        FAIL() << "Unrecognized architecture prefix in test case name: "
+               << test_case;
+      }
+    }
+    InitBuffers();
+  }
+
+  void InitBuffers(int width = kTestTransformSize,
+                   int height = kTestTransformSize) {
+    const int size = width * height;
+    buffer_.clear();
+    buffer_.resize(size);
+    residual_buffer_.clear();
+    residual_buffer_.resize(size);
+    for (int i = 0; i < size; ++i) {
+      buffer_[i] = residual_buffer_[i] = i % 256;
+    }
+    frame_buffer_.Reset(height, width, buffer_.data());
+  }
+
+  template <int bitdepth>
+  void TestWht();
+
+  std::vector<uint8_t> buffer_;
+  std::vector<int16_t> residual_buffer_;
+  // |frame_buffer_| is just a 2D array view into |buffer_|.
+  Array2DView<uint8_t> frame_buffer_;
+  const dsp::Dsp* dsp_;
+};
+
+template <int bitdepth>
+void ReconstructionTest::TestWht() {
+  static_assert(bitdepth == kBitdepth8 || bitdepth == kBitdepth10, "");
+  for (const auto transform :
+       dsp_->inverse_transforms[dsp::kTransform1dWht][dsp::kTransform1dSize4]) {
+    if (transform == nullptr) {
+      GTEST_SKIP() << "No function available for dsp::kTransform1dWht";
+    }
+  }
+  constexpr int max = 16 << bitdepth;
+  constexpr int min = -max;
+  static constexpr int16_t residual_inputs[][16]{
+      {64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+      {69, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+      {0, 0, 0, 0, 0, max - 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+      {0, 0, 0, 0, 0, min - 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+      // Note these are unrealistic inputs, but serve to test each position in
+      // the array and match extremes in some commercial test vectors.
+      {max, max, max, max, max, max, max, max, max, max, max, max, max, max,
+       max, max},
+      {min, min, min, min, min, min, min, min, min, min, min, min, min, min,
+       min, min}};
+  // Before the Reconstruct() call, the frame buffer is filled with all 127.
+  // After the Reconstruct() call, the frame buffer is expected to have the
+  // following values.
+  static constexpr uint8_t frame_outputs[][16]{
+      {131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131,
+       131, 131},
+      {132, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131,
+       131, 131},
+      {255, 255, 0, 0, 255, 255, 0, 0, 0, 0, 255, 255, 0, 0, 255, 255},
+      {0, 0, 255, 255, 0, 0, 255, 255, 255, 255, 0, 0, 255, 255, 0, 0},
+      {255, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+       127, 127},
+      {0, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+       127},
+  };
+
+  const TransformSize tx_size = kTransformSize4x4;
+  const TransformType tx_type = kTransformTypeDctDct;
+  const int tx_width = kTransformWidth[tx_size];
+  const int tx_height = kTransformHeight[tx_size];
+  const uint16_t* const scan = kScan[GetTransformClass(tx_type)][tx_size];
+
+  InitBuffers(tx_width, tx_height);
+
+  const int num_tests = sizeof(residual_inputs) / sizeof(residual_inputs[0]);
+  for (int i = 0; i < num_tests; ++i) {
+    int16_t eob;  // Also known as non_zero_coeff_count.
+    for (eob = 15; eob >= 0; --eob) {
+      if (residual_inputs[i][scan[eob]] != 0) break;
+    }
+    ++eob;
+    memcpy(residual_buffer_.data(), residual_inputs[i],
+           sizeof(residual_inputs[i]));
+    memset(buffer_.data(), 127, sizeof(frame_outputs[i]));
+    Reconstruct(*dsp_, tx_type, tx_size, /*lossless=*/true,
+                residual_buffer_.data(), 0, 0, &frame_buffer_, eob);
+
+    EXPECT_TRUE(test_utils::CompareBlocks(buffer_.data(), frame_outputs[i],
+                                          tx_width, tx_height, tx_width,
+                                          tx_width, false, true))
+        << "Mismatch WHT test case " << i;
+  }
+}
+
+TEST_P(ReconstructionTest, ReconstructionSimple) {
+  for (const auto transform :
+       dsp_->inverse_transforms[dsp::kTransform1dIdentity]
+                               [dsp::kTransform1dSize4]) {
+    if (transform == nullptr) GTEST_SKIP();
+  }
+  Reconstruct(*dsp_, kTransformTypeIdentityIdentity, kTransformSize4x4, false,
+              residual_buffer_.data(), 0, 0, &frame_buffer_, 16);
+  // clang-format off
+  static constexpr uint8_t expected_output_buffer[] = {
+      0, 1, 2, 3,
+      5, 6, 7, 8,
+      9, 10, 11, 12,
+      14, 15, 16, 17
+  };
+  // clang-format on
+  EXPECT_THAT(buffer_, ElementsAreArray(expected_output_buffer));
+}
+
+TEST_P(ReconstructionTest, ReconstructionFlipY) {
+  for (const auto transform :
+       dsp_->inverse_transforms[dsp::kTransform1dIdentity]
+                               [dsp::kTransform1dSize4]) {
+    if (transform == nullptr) GTEST_SKIP();
+  }
+  Reconstruct(*dsp_, kTransformTypeIdentityFlipadst, kTransformSize4x4, false,
+              residual_buffer_.data(), 0, 0, &frame_buffer_, 16);
+  // clang-format off
+  static constexpr uint8_t expected_buffer[] = {
+      0, 1, 2, 3,
+      4, 5, 6, 7,
+      7, 8, 9, 10,
+      14, 15, 16, 17
+  };
+  // clang-format on
+  EXPECT_THAT(buffer_, ElementsAreArray(expected_buffer));
+}
+
+TEST_P(ReconstructionTest, ReconstructionFlipX) {
+  for (const auto transform :
+       dsp_->inverse_transforms[dsp::kTransform1dIdentity]
+                               [dsp::kTransform1dSize4]) {
+    if (transform == nullptr) GTEST_SKIP();
+  }
+  Reconstruct(*dsp_, kTransformTypeFlipadstIdentity, kTransformSize4x4, false,
+              residual_buffer_.data(), 0, 0, &frame_buffer_, 16);
+  // clang-format off
+  static constexpr uint8_t expected_buffer[] = {
+      0, 1, 2, 3,
+      4, 5, 6, 8,
+      8, 10, 10, 13,
+      12, 14, 14, 18
+  };
+  // clang-format on
+  EXPECT_THAT(buffer_, ElementsAreArray(expected_buffer));
+}
+
+TEST_P(ReconstructionTest, ReconstructionFlipXAndFlipY) {
+  for (const auto transform :
+       dsp_->inverse_transforms[dsp::kTransform1dIdentity]
+                               [dsp::kTransform1dSize4]) {
+    if (transform == nullptr) GTEST_SKIP();
+  }
+  Reconstruct(*dsp_, kTransformTypeFlipadstFlipadst, kTransformSize4x4, false,
+              residual_buffer_.data(), 0, 0, &frame_buffer_, 16);
+  // clang-format off
+  static constexpr uint8_t expected_buffer[] = {
+      0, 1, 2, 3,
+      4, 5, 6, 8,
+      8, 8, 10, 9,
+      12, 14, 14, 19
+  };
+  // clang-format on
+  EXPECT_THAT(buffer_, ElementsAreArray(expected_buffer));
+}
+
+TEST_P(ReconstructionTest, ReconstructionNonZeroStart) {
+  uint8_t buffer[64] = {};
+  Array2DView<uint8_t> frame_buffer(8, 8, buffer);
+  int k = 0;
+  for (int i = 0; i < kTestTransformSize; ++i) {
+    for (int j = 0; j < kTestTransformSize; ++j) {
+      frame_buffer[i + 4][j + 4] = k++;
+    }
+  }
+  for (const auto transform :
+       dsp_->inverse_transforms[dsp::kTransform1dIdentity]
+                               [dsp::kTransform1dSize4]) {
+    if (transform == nullptr) GTEST_SKIP();
+  }
+  Reconstruct(*dsp_, kTransformTypeIdentityIdentity, kTransformSize4x4, false,
+              residual_buffer_.data(), 4, 4, &frame_buffer, 64);
+  // clang-format off
+  static constexpr uint8_t expected_buffer[] = {
+      0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 1, 2, 3,
+      0, 0, 0, 0, 5, 6, 7, 8,
+      0, 0, 0, 0, 9, 10, 11, 12,
+      0, 0, 0, 0, 14, 15, 16, 17
+  };
+  // clang-format on
+  EXPECT_THAT(buffer, ElementsAreArray(expected_buffer));
+}
+
+TEST_P(ReconstructionTest, Wht8bit) { TestWht<kBitdepth8>(); }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+TEST_P(ReconstructionTest, Wht10bit) { TestWht<kBitdepth10>(); }
+#endif
+
+INSTANTIATE_TEST_SUITE_P(C, ReconstructionTest, testing::Values(0));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ReconstructionTest, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ReconstructionTest, testing::Values(0));
+#endif
+
+}  // namespace
+}  // namespace libgav1
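
The eob (end-of-block) computation in TestWht above is worth calling out: eob, also called non_zero_coeff_count, is one past the last scan position holding a nonzero coefficient, where scan[] maps scan position to raster position. A standalone restatement of that loop (hypothetical helper name):

    #include <cstdint>

    // Returns one past the last nonzero scan position; 0 for an all-zero
    // block. Equivalent to the backwards loop in TestWht.
    int ComputeEob(const int16_t residual[16], const uint16_t scan[16]) {
      int eob = 15;
      while (eob >= 0 && residual[scan[eob]] == 0) --eob;
      return eob + 1;
    }

For the first WHT input above (a lone DC coefficient of 64), the only nonzero value sits at scan position 0, so eob is 1 and Reconstruct() takes its single-coefficient path, treating the block as one row.
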
diff --git a/src/residual_buffer_pool.cc b/src/residual_buffer_pool.cc
new file mode 100644 (file)
index 0000000..44a842c
--- /dev/null
@@ -0,0 +1,143 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/residual_buffer_pool.h"
+
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <utility>
+
+namespace libgav1 {
+namespace {
+
+// The maximum queue size is derived using the following formula:
+//   ((sb_size * sb_size) / 16) + (2 * (((sb_size / x) * (sb_size / y)) / 16)).
+// Where:
+//   sb_size is the superblock size (64 or 128).
+//   16 is 4*4 which is kMinTransformWidth * kMinTransformHeight.
+//   x is subsampling_x + 1.
+//   y is subsampling_y + 1.
+// The first component is for the Y plane and the second component is for the U
+// and V planes.
+// For example, for 128x128 superblocks with 422 subsampling the size is:
+//   ((128 * 128) / 16) + (2 * (((128 / 2) * (128 / 1)) / 16)) = 2048.
+//
+// First dimension: use_128x128_superblock.
+// Second dimension: subsampling_x.
+// Third dimension: subsampling_y.
+constexpr int kMaxQueueSize[2][2][2] = {
+    // 64x64 superblocks.
+    {
+        {768, 512},
+        {512, 384},
+    },
+    // 128x128 superblocks.
+    {
+        {3072, 2048},
+        {2048, 1536},
+    },
+};
+
+}  // namespace
+
+ResidualBufferStack::~ResidualBufferStack() {
+  while (top_ != nullptr) {
+    ResidualBuffer* top = top_;
+    top_ = top_->next_;
+    delete top;
+  }
+}
+
+void ResidualBufferStack::Push(std::unique_ptr<ResidualBuffer> buffer) {
+  buffer->next_ = top_;
+  top_ = buffer.release();
+  ++num_buffers_;
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferStack::Pop() {
+  std::unique_ptr<ResidualBuffer> top;
+  if (top_ != nullptr) {
+    top.reset(top_);
+    top_ = top_->next_;
+    top->next_ = nullptr;
+    --num_buffers_;
+  }
+  return top;
+}
+
+void ResidualBufferStack::Swap(ResidualBufferStack* other) {
+  std::swap(top_, other->top_);
+  std::swap(num_buffers_, other->num_buffers_);
+}
+
+ResidualBufferPool::ResidualBufferPool(bool use_128x128_superblock,
+                                       int subsampling_x, int subsampling_y,
+                                       size_t residual_size)
+    : buffer_size_(GetResidualBufferSize(
+          use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+          subsampling_x, subsampling_y, residual_size)),
+      queue_size_(kMaxQueueSize[static_cast<int>(use_128x128_superblock)]
+                               [subsampling_x][subsampling_y]) {}
+
+void ResidualBufferPool::Reset(bool use_128x128_superblock, int subsampling_x,
+                               int subsampling_y, size_t residual_size) {
+  const size_t buffer_size = GetResidualBufferSize(
+      use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+      subsampling_x, subsampling_y, residual_size);
+  const int queue_size = kMaxQueueSize[static_cast<int>(use_128x128_superblock)]
+                                      [subsampling_x][subsampling_y];
+  if (buffer_size == buffer_size_ && queue_size == queue_size_) {
+    // The existing buffers (if any) are still valid, so don't do anything.
+    return;
+  }
+  buffer_size_ = buffer_size;
+  queue_size_ = queue_size;
+  // The existing buffers (if any) are no longer valid since the buffer size or
+  // the queue size has changed. Clear the stack.
+  ResidualBufferStack buffers;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    // Move the buffers in the stack to the local variable |buffers| and clear
+    // the stack.
+    buffers.Swap(&buffers_);
+    // Release mutex_ before freeing the buffers.
+  }
+  // As the local variable |buffers| goes out of scope, its destructor frees
+  // the buffers that were in the stack.
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferPool::Get() {
+  std::unique_ptr<ResidualBuffer> buffer = nullptr;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    buffer = buffers_.Pop();
+  }
+  if (buffer == nullptr) {
+    buffer = ResidualBuffer::Create(buffer_size_, queue_size_);
+  }
+  return buffer;
+}
+
+void ResidualBufferPool::Release(std::unique_ptr<ResidualBuffer> buffer) {
+  buffer->transform_parameters()->Clear();
+  buffer->partition_tree_order()->Clear();
+  std::lock_guard<std::mutex> lock(mutex_);
+  buffers_.Push(std::move(buffer));
+}
+
+size_t ResidualBufferPool::Size() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return buffers_.Size();
+}
+
+}  // namespace libgav1
diff --git a/src/residual_buffer_pool.h b/src/residual_buffer_pool.h
new file mode 100644 (file)
index 0000000..75924db
--- /dev/null
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+#define LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/queue.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// This class is used for parsing and decoding a superblock. Members of this
+// class are populated in the "parse" step and consumed in the "decode" step.
+class ResidualBuffer : public Allocable {
+ public:
+  static std::unique_ptr<ResidualBuffer> Create(size_t buffer_size,
+                                                int queue_size) {
+    std::unique_ptr<ResidualBuffer> buffer(new (std::nothrow) ResidualBuffer);
+    if (buffer != nullptr) {
+      buffer->buffer_ = MakeAlignedUniquePtr<uint8_t>(32, buffer_size);
+      if (buffer->buffer_ == nullptr ||
+          !buffer->transform_parameters_.Init(queue_size) ||
+          !buffer->partition_tree_order_.Init(queue_size)) {
+        buffer = nullptr;
+      }
+    }
+    return buffer;
+  }
+
+  // Move only.
+  ResidualBuffer(ResidualBuffer&& other) = default;
+  ResidualBuffer& operator=(ResidualBuffer&& other) = default;
+
+  // Buffer used to store the residual values.
+  uint8_t* buffer() { return buffer_.get(); }
+  // Queue used to store the transform parameters.
+  Queue<TransformParameters>* transform_parameters() {
+    return &transform_parameters_;
+  }
+  // Queue used to store the block ordering in the partition tree of the
+  // superblocks.
+  Queue<PartitionTreeNode>* partition_tree_order() {
+    return &partition_tree_order_;
+  }
+
+ private:
+  friend class ResidualBufferStack;
+
+  ResidualBuffer() = default;
+
+  AlignedUniquePtr<uint8_t> buffer_;
+  Queue<TransformParameters> transform_parameters_;
+  Queue<PartitionTreeNode> partition_tree_order_;
+  // Used by ResidualBufferStack to form a chain of ResidualBuffers.
+  ResidualBuffer* next_ = nullptr;
+};
+
+// A LIFO stack of ResidualBuffers. Owns the buffers in the stack.
+class ResidualBufferStack {
+ public:
+  ResidualBufferStack() = default;
+
+  // Not copyable or movable.
+  ResidualBufferStack(const ResidualBufferStack&) = delete;
+  ResidualBufferStack& operator=(const ResidualBufferStack&) = delete;
+
+  ~ResidualBufferStack();
+
+  // Pushes |buffer| to the top of the stack.
+  void Push(std::unique_ptr<ResidualBuffer> buffer);
+
+  // If the stack is non-empty, returns the buffer at the top of the stack and
+  // removes it from the stack. If the stack is empty, returns nullptr.
+  std::unique_ptr<ResidualBuffer> Pop();
+
+  // Swaps the contents of this stack and |other|.
+  void Swap(ResidualBufferStack* other);
+
+  // Returns the number of buffers in the stack.
+  size_t Size() const { return num_buffers_; }
+
+ private:
+  // A singly-linked list of ResidualBuffers, chained together using the next_
+  // field of ResidualBuffer.
+  ResidualBuffer* top_ = nullptr;
+  size_t num_buffers_ = 0;
+};
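+
+// A minimal LIFO sketch (for illustration only; not part of the library):
+//
+//   ResidualBufferStack stack;
+//   stack.Push(ResidualBuffer::Create(/*buffer_size=*/1024,
+//                                     /*queue_size=*/256));
+//   std::unique_ptr<ResidualBuffer> top = stack.Pop();  // Most recent Push().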
+
+// Utility class used to manage the residual buffers (and the transform
+// parameters) used for multi-threaded decoding. This class uses a stack to
+// store the buffers for better cache locality, since buffers used more
+// recently are more likely to be in the cache. All functions in this class
+// are thread-safe.
+class ResidualBufferPool : public Allocable {
+ public:
+  ResidualBufferPool(bool use_128x128_superblock, int subsampling_x,
+                     int subsampling_y, size_t residual_size);
+
+  // Recomputes |buffer_size_| and |queue_size_|, and invalidates the existing
+  // buffers if necessary.
+  void Reset(bool use_128x128_superblock, int subsampling_x, int subsampling_y,
+             size_t residual_size);
+  // Gets a residual buffer. The buffer is guaranteed to be large enough to
+  // store the residual values for one superblock whose parameters match those
+  // passed to the constructor or to the last call to Reset(). If there are
+  // free buffers in the stack, one of them is returned; otherwise a new buffer
+  // is allocated.
+  std::unique_ptr<ResidualBuffer> Get();
+  // Returns |buffer| to the pool (by pushing it onto the stack). Subsequent
+  // calls to Get() may reuse this buffer.
+  void Release(std::unique_ptr<ResidualBuffer> buffer);
+
+  // Used only in the tests. Returns the number of buffers in the stack.
+  size_t Size() const;
+
+ private:
+  mutable std::mutex mutex_;
+  ResidualBufferStack buffers_ LIBGAV1_GUARDED_BY(mutex_);
+  size_t buffer_size_;
+  int queue_size_;
+};
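+
+// A minimal usage sketch of the pool (for illustration only; not part of the
+// library), assuming 4:2:0 subsampling and 64x64 superblocks:
+//
+//   ResidualBufferPool pool(/*use_128x128_superblock=*/false,
+//                           /*subsampling_x=*/1, /*subsampling_y=*/1,
+//                           /*residual_size=*/sizeof(int16_t));
+//   std::unique_ptr<ResidualBuffer> buffer = pool.Get();
+//   if (buffer != nullptr) {
+//     // Parse residuals into buffer->buffer() and push entries into
+//     // *buffer->transform_parameters(), then return the buffer with
+//     // pool.Release(std::move(buffer)) once the decode step is done.
+//   }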
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
diff --git a/src/residual_buffer_pool_test.cc b/src/residual_buffer_pool_test.cc
new file mode 100644 (file)
index 0000000..84bc747
--- /dev/null
@@ -0,0 +1,201 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/residual_buffer_pool.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+#include "src/utils/queue.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(ResidualBufferTest, TestUsage) {
+  ResidualBufferPool pool(true, 1, 1, sizeof(int16_t));
+  EXPECT_EQ(pool.Size(), 0);
+  // Get one buffer.
+  std::unique_ptr<ResidualBuffer> buffer1 = pool.Get();
+  uint8_t* const buffer1_ptr = buffer1->buffer();
+  ASSERT_NE(buffer1_ptr, nullptr);
+  // Get another buffer (while holding on to the first one).
+  std::unique_ptr<ResidualBuffer> buffer2 = pool.Get();
+  uint8_t* const buffer2_ptr = buffer2->buffer();
+  ASSERT_NE(buffer2_ptr, nullptr);
+  EXPECT_NE(buffer1_ptr, buffer2_ptr);
+  // Return the second buffer.
+  pool.Release(std::move(buffer2));
+  EXPECT_EQ(pool.Size(), 1);
+  // Get another buffer (this one should be the same as buffer2).
+  std::unique_ptr<ResidualBuffer> buffer3 = pool.Get();
+  uint8_t* const buffer3_ptr = buffer3->buffer();
+  ASSERT_NE(buffer3_ptr, nullptr);
+  EXPECT_EQ(buffer3_ptr, buffer2_ptr);
+  EXPECT_EQ(pool.Size(), 0);
+  // Get another buffer (this one will be a new buffer).
+  std::unique_ptr<ResidualBuffer> buffer4 = pool.Get();
+  uint8_t* const buffer4_ptr = buffer4->buffer();
+  ASSERT_NE(buffer4_ptr, nullptr);
+  EXPECT_NE(buffer4_ptr, buffer1_ptr);
+  EXPECT_NE(buffer4_ptr, buffer3_ptr);
+  EXPECT_EQ(pool.Size(), 0);
+  // Return all the buffers.
+  pool.Release(std::move(buffer1));
+  EXPECT_EQ(pool.Size(), 1);
+  pool.Release(std::move(buffer3));
+  EXPECT_EQ(pool.Size(), 2);
+  pool.Release(std::move(buffer4));
+  EXPECT_EQ(pool.Size(), 3);
+  // Reset the pool with the same parameters.
+  pool.Reset(true, 1, 1, sizeof(int16_t));
+  EXPECT_EQ(pool.Size(), 3);
+  // Reset the pool with different parameters.
+  pool.Reset(true, 0, 1, sizeof(int32_t));
+  // The existing buffers should now have been invalidated.
+  EXPECT_EQ(pool.Size(), 0);
+  // Get and return a buffer.
+  std::unique_ptr<ResidualBuffer> buffer5 = pool.Get();
+  uint8_t* const buffer5_ptr = buffer5->buffer();
+  ASSERT_NE(buffer5_ptr, nullptr);
+  pool.Release(std::move(buffer5));
+  EXPECT_EQ(pool.Size(), 1);
+  // Reset the pool with a different value for use_128x128_superblock.
+  pool.Reset(false, 0, 1, sizeof(int32_t));
+  // The existing buffers should now have been invalidated.
+  EXPECT_EQ(pool.Size(), 0);
+}
+
+TEST(ResidualBufferTest, TestQueue) {
+  ResidualBufferPool pool(true, 1, 1, sizeof(int16_t));
+  EXPECT_EQ(pool.Size(), 0);
+  // Get one buffer.
+  std::unique_ptr<ResidualBuffer> buffer1 = pool.Get();
+  uint8_t* const buffer1_ptr = buffer1->buffer();
+  ASSERT_NE(buffer1_ptr, nullptr);
+  auto* queue1 = buffer1->transform_parameters();
+  queue1->Push(TransformParameters(kTransformTypeAdstAdst, 10));
+  EXPECT_EQ(queue1->Size(), 1);
+  EXPECT_EQ(queue1->Front().type, kTransformTypeAdstAdst);
+  EXPECT_EQ(queue1->Front().non_zero_coeff_count, 10);
+  queue1->Push(TransformParameters(kTransformTypeDctDct, 20));
+  EXPECT_EQ(queue1->Size(), 2);
+  EXPECT_EQ(queue1->Front().type, kTransformTypeAdstAdst);
+  EXPECT_EQ(queue1->Front().non_zero_coeff_count, 10);
+  queue1->Pop();
+  EXPECT_EQ(queue1->Size(), 1);
+  EXPECT_EQ(queue1->Front().type, kTransformTypeDctDct);
+  EXPECT_EQ(queue1->Front().non_zero_coeff_count, 20);
+  // Return the buffer.
+  pool.Release(std::move(buffer1));
+  EXPECT_EQ(pool.Size(), 1);
+  // Get another buffer (should be the same as buffer1).
+  std::unique_ptr<ResidualBuffer> buffer2 = pool.Get();
+  uint8_t* const buffer2_ptr = buffer2->buffer();
+  ASSERT_NE(buffer2_ptr, nullptr);
+  EXPECT_EQ(buffer1_ptr, buffer2_ptr);
+  // Releasing the buffer should've cleared the queue.
+  EXPECT_EQ(buffer2->transform_parameters()->Size(), 0);
+}
+
+TEST(ResidualBufferTest, TestStackPushPop) {
+  ResidualBufferStack buffers;
+  EXPECT_EQ(buffers.Size(), 0);
+  EXPECT_EQ(buffers.Pop(), nullptr);
+
+  std::unique_ptr<ResidualBuffer> buffer0 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer0_ptr = buffer0.get();
+  EXPECT_NE(buffer0_ptr, nullptr);
+  std::unique_ptr<ResidualBuffer> buffer1 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer1_ptr = buffer1.get();
+  EXPECT_NE(buffer1_ptr, nullptr);
+  std::unique_ptr<ResidualBuffer> buffer2 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer2_ptr = buffer2.get();
+  EXPECT_NE(buffer2_ptr, nullptr);
+
+  // Push two buffers onto the stack.
+  buffers.Push(std::move(buffer0));
+  EXPECT_EQ(buffers.Size(), 1);
+  buffers.Push(std::move(buffer1));
+  EXPECT_EQ(buffers.Size(), 2);
+
+  // Pop one buffer off the stack.
+  std::unique_ptr<ResidualBuffer> top = buffers.Pop();
+  EXPECT_EQ(buffers.Size(), 1);
+  EXPECT_EQ(top.get(), buffer1_ptr);
+
+  // Push one buffer onto the stack.
+  buffers.Push(std::move(buffer2));
+  EXPECT_EQ(buffers.Size(), 2);
+
+  // Pop two buffers off the stack.
+  top = buffers.Pop();
+  EXPECT_EQ(buffers.Size(), 1);
+  EXPECT_EQ(top.get(), buffer2_ptr);
+  top = buffers.Pop();
+  EXPECT_EQ(buffers.Size(), 0);
+  EXPECT_EQ(top.get(), buffer0_ptr);
+
+  // Try to pop a buffer off an empty stack.
+  top = buffers.Pop();
+  EXPECT_EQ(buffers.Size(), 0);
+  EXPECT_EQ(top, nullptr);
+}
+
+TEST(ResidualBufferTest, TestStackSwap) {
+  ResidualBufferStack buffers;
+  EXPECT_EQ(buffers.Size(), 0);
+  EXPECT_EQ(buffers.Pop(), nullptr);
+
+  std::unique_ptr<ResidualBuffer> buffer0 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer0_ptr = buffer0.get();
+  EXPECT_NE(buffer0_ptr, nullptr);
+  std::unique_ptr<ResidualBuffer> buffer1 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer1_ptr = buffer1.get();
+  EXPECT_NE(buffer1_ptr, nullptr);
+  std::unique_ptr<ResidualBuffer> buffer2 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer2_ptr = buffer2.get();
+  EXPECT_NE(buffer2_ptr, nullptr);
+
+  // Push three buffers onto the stack.
+  buffers.Push(std::move(buffer0));
+  EXPECT_EQ(buffers.Size(), 1);
+  buffers.Push(std::move(buffer1));
+  EXPECT_EQ(buffers.Size(), 2);
+  buffers.Push(std::move(buffer2));
+  EXPECT_EQ(buffers.Size(), 3);
+
+  // Swap the contents of the stacks.
+  ResidualBufferStack swapped;
+  swapped.Swap(&buffers);
+  EXPECT_EQ(buffers.Size(), 0);
+  EXPECT_EQ(swapped.Size(), 3);
+
+  // Pop three buffers off the swapped stack.
+  std::unique_ptr<ResidualBuffer> top = swapped.Pop();
+  EXPECT_EQ(swapped.Size(), 2);
+  EXPECT_EQ(top.get(), buffer2_ptr);
+  top = swapped.Pop();
+  EXPECT_EQ(swapped.Size(), 1);
+  EXPECT_EQ(top.get(), buffer1_ptr);
+  top = swapped.Pop();
+  EXPECT_EQ(swapped.Size(), 0);
+  EXPECT_EQ(top.get(), buffer0_ptr);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/scan_tables.inc b/src/scan_tables.inc
new file mode 100644 (file)
index 0000000..f7c9231
--- /dev/null
@@ -0,0 +1,440 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains all the scan order tables.
+
+constexpr uint16_t kDefaultScan4x4[16] = {0, 1,  4,  8,  5, 2,  3,  6,
+                                          9, 12, 13, 10, 7, 11, 14, 15};
+
+constexpr uint16_t kColumnScan4x4[16] = {0, 4, 8,  12, 1, 5, 9,  13,
+                                         2, 6, 10, 14, 3, 7, 11, 15};
+
+constexpr uint16_t kRowScan4x4[16] = {0, 1, 2,  3,  4,  5,  6,  7,
+                                      8, 9, 10, 11, 12, 13, 14, 15};
+
+constexpr uint16_t kDefaultScan4x8[32] = {
+    0,  1,  4,  2,  5,  8,  3,  6,  9,  12, 7,  10, 13, 16, 11, 14,
+    17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31};
+
+constexpr uint16_t kColumnScan4x8[32] = {
+    0, 4, 8,  12, 16, 20, 24, 28, 1, 5, 9,  13, 17, 21, 25, 29,
+    2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31};
+
+constexpr uint16_t kRowScan4x8[32] = {
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+constexpr uint16_t kDefaultScan8x4[32] = {
+    0,  8, 1,  16, 9,  2, 24, 17, 10, 3, 25, 18, 11, 4,  26, 19,
+    12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31};
+
+constexpr uint16_t kColumnScan8x4[32] = {
+    0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+    4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31};
+
+constexpr uint16_t kRowScan8x4[32] = {
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+constexpr uint16_t kDefaultScan8x8[64] = {
+    0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, 18, 11, 4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6,  7,  14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63};
+
+constexpr uint16_t kColumnScan8x8[64] = {
+    0, 8,  16, 24, 32, 40, 48, 56, 1, 9,  17, 25, 33, 41, 49, 57,
+    2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+    4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+    6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63};
+
+constexpr uint16_t kRowScan8x8[64] = {
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan8x16[128] = {
+    0,   1,   8,   2,   9,   16,  3,   10,  17,  24,  4,   11,  18,  25,  32,
+    5,   12,  19,  26,  33,  40,  6,   13,  20,  27,  34,  41,  48,  7,   14,
+    21,  28,  35,  42,  49,  56,  15,  22,  29,  36,  43,  50,  57,  64,  23,
+    30,  37,  44,  51,  58,  65,  72,  31,  38,  45,  52,  59,  66,  73,  80,
+    39,  46,  53,  60,  67,  74,  81,  88,  47,  54,  61,  68,  75,  82,  89,
+    96,  55,  62,  69,  76,  83,  90,  97,  104, 63,  70,  77,  84,  91,  98,
+    105, 112, 71,  78,  85,  92,  99,  106, 113, 120, 79,  86,  93,  100, 107,
+    114, 121, 87,  94,  101, 108, 115, 122, 95,  102, 109, 116, 123, 103, 110,
+    117, 124, 111, 118, 125, 119, 126, 127};
+
+constexpr uint16_t kColumnScan8x16[128] = {
+    0, 8,  16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96,  104, 112, 120,
+    1, 9,  17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97,  105, 113, 121,
+    2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98,  106, 114, 122,
+    3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99,  107, 115, 123,
+    4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+    5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+    6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+    7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127};
+
+constexpr uint16_t kRowScan8x16[128] = {
+    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,
+    15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
+    30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
+    45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+    60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
+    75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
+    90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
+    105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+    120, 121, 122, 123, 124, 125, 126, 127};
+
+constexpr uint16_t kDefaultScan16x8[128] = {
+    0,  16,  1,   32, 17,  2,   48,  33,  18, 3,  64,  49,  34,  19,  4,   80,
+    65, 50,  35,  20, 5,   96,  81,  66,  51, 36, 21,  6,   112, 97,  82,  67,
+    52, 37,  22,  7,  113, 98,  83,  68,  53, 38, 23,  8,   114, 99,  84,  69,
+    54, 39,  24,  9,  115, 100, 85,  70,  55, 40, 25,  10,  116, 101, 86,  71,
+    56, 41,  26,  11, 117, 102, 87,  72,  57, 42, 27,  12,  118, 103, 88,  73,
+    58, 43,  28,  13, 119, 104, 89,  74,  59, 44, 29,  14,  120, 105, 90,  75,
+    60, 45,  30,  15, 121, 106, 91,  76,  61, 46, 31,  122, 107, 92,  77,  62,
+    47, 123, 108, 93, 78,  63,  124, 109, 94, 79, 125, 110, 95,  126, 111, 127};
+
+constexpr uint16_t kColumnScan16x8[128] = {
+    0,  16, 32, 48, 64, 80, 96,  112, 1,  17, 33, 49, 65, 81, 97,  113,
+    2,  18, 34, 50, 66, 82, 98,  114, 3,  19, 35, 51, 67, 83, 99,  115,
+    4,  20, 36, 52, 68, 84, 100, 116, 5,  21, 37, 53, 69, 85, 101, 117,
+    6,  22, 38, 54, 70, 86, 102, 118, 7,  23, 39, 55, 71, 87, 103, 119,
+    8,  24, 40, 56, 72, 88, 104, 120, 9,  25, 41, 57, 73, 89, 105, 121,
+    10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+    12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+    14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127};
+
+constexpr uint16_t kRowScan16x8[128] = {
+    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,
+    15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
+    30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
+    45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+    60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
+    75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
+    90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
+    105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+    120, 121, 122, 123, 124, 125, 126, 127};
+
+constexpr uint16_t kDefaultScan16x16[256] = {
+    0,   1,   16,  32,  17,  2,   3,   18,  33,  48,  64,  49,  34,  19,  4,
+    5,   20,  35,  50,  65,  80,  96,  81,  66,  51,  36,  21,  6,   7,   22,
+    37,  52,  67,  82,  97,  112, 128, 113, 98,  83,  68,  53,  38,  23,  8,
+    9,   24,  39,  54,  69,  84,  99,  114, 129, 144, 160, 145, 130, 115, 100,
+    85,  70,  55,  40,  25,  10,  11,  26,  41,  56,  71,  86,  101, 116, 131,
+    146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87,  72,  57,  42,  27,
+    12,  13,  28,  43,  58,  73,  88,  103, 118, 133, 148, 163, 178, 193, 208,
+    224, 209, 194, 179, 164, 149, 134, 119, 104, 89,  74,  59,  44,  29,  14,
+    15,  30,  45,  60,  75,  90,  105, 120, 135, 150, 165, 180, 195, 210, 225,
+    240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91,  76,  61,  46,
+    31,  47,  62,  77,  92,  107, 122, 137, 152, 167, 182, 197, 212, 227, 242,
+    243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93,  78,  63,  79,  94,
+    109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185,
+    170, 155, 140, 125, 110, 95,  111, 126, 141, 156, 171, 186, 201, 216, 231,
+    246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203,
+    218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+    250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254,
+    255};
+
+constexpr uint16_t kColumnScan16x16[256] = {
+    0,  16, 32, 48, 64, 80, 96,  112, 128, 144, 160, 176, 192, 208, 224, 240,
+    1,  17, 33, 49, 65, 81, 97,  113, 129, 145, 161, 177, 193, 209, 225, 241,
+    2,  18, 34, 50, 66, 82, 98,  114, 130, 146, 162, 178, 194, 210, 226, 242,
+    3,  19, 35, 51, 67, 83, 99,  115, 131, 147, 163, 179, 195, 211, 227, 243,
+    4,  20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+    5,  21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+    6,  22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+    7,  23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+    8,  24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+    9,  25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+    10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+    11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+    12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+    13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+    14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+    15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255};
+
+constexpr uint16_t kRowScan16x16[256] = {
+    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,
+    15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
+    30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
+    45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+    60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
+    75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
+    90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
+    105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+    120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+    135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+    150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+    165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+    180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+    195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+    210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+    225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+    255};
+
+constexpr uint16_t kDefaultScan16x32[512] = {
+    0,   1,   16,  2,   17,  32,  3,   18,  33,  48,  4,   19,  34,  49,  64,
+    5,   20,  35,  50,  65,  80,  6,   21,  36,  51,  66,  81,  96,  7,   22,
+    37,  52,  67,  82,  97,  112, 8,   23,  38,  53,  68,  83,  98,  113, 128,
+    9,   24,  39,  54,  69,  84,  99,  114, 129, 144, 10,  25,  40,  55,  70,
+    85,  100, 115, 130, 145, 160, 11,  26,  41,  56,  71,  86,  101, 116, 131,
+    146, 161, 176, 12,  27,  42,  57,  72,  87,  102, 117, 132, 147, 162, 177,
+    192, 13,  28,  43,  58,  73,  88,  103, 118, 133, 148, 163, 178, 193, 208,
+    14,  29,  44,  59,  74,  89,  104, 119, 134, 149, 164, 179, 194, 209, 224,
+    15,  30,  45,  60,  75,  90,  105, 120, 135, 150, 165, 180, 195, 210, 225,
+    240, 31,  46,  61,  76,  91,  106, 121, 136, 151, 166, 181, 196, 211, 226,
+    241, 256, 47,  62,  77,  92,  107, 122, 137, 152, 167, 182, 197, 212, 227,
+    242, 257, 272, 63,  78,  93,  108, 123, 138, 153, 168, 183, 198, 213, 228,
+    243, 258, 273, 288, 79,  94,  109, 124, 139, 154, 169, 184, 199, 214, 229,
+    244, 259, 274, 289, 304, 95,  110, 125, 140, 155, 170, 185, 200, 215, 230,
+    245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+    246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232,
+    247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233,
+    248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234,
+    249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235,
+    250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236,
+    251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237,
+    252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238,
+    253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239,
+    254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464,
+    255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465,
+    480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466,
+    481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467,
+    482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483,
+    498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335,
+    350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396,
+    411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472,
+    487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+    459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476,
+    491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495,
+    510, 511};
+
+constexpr uint16_t kDefaultScan32x16[512] = {
+    0,   32,  1,   64,  33,  2,   96,  65,  34,  3,   128, 97,  66,  35,  4,
+    160, 129, 98,  67,  36,  5,   192, 161, 130, 99,  68,  37,  6,   224, 193,
+    162, 131, 100, 69,  38,  7,   256, 225, 194, 163, 132, 101, 70,  39,  8,
+    288, 257, 226, 195, 164, 133, 102, 71,  40,  9,   320, 289, 258, 227, 196,
+    165, 134, 103, 72,  41,  10,  352, 321, 290, 259, 228, 197, 166, 135, 104,
+    73,  42,  11,  384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74,  43,
+    12,  416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75,  44,  13,
+    448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76,  45,  14,
+    480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77,  46,
+    15,  481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78,
+    47,  16,  482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110,
+    79,  48,  17,  483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142,
+    111, 80,  49,  18,  484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174,
+    143, 112, 81,  50,  19,  485, 454, 423, 392, 361, 330, 299, 268, 237, 206,
+    175, 144, 113, 82,  51,  20,  486, 455, 424, 393, 362, 331, 300, 269, 238,
+    207, 176, 145, 114, 83,  52,  21,  487, 456, 425, 394, 363, 332, 301, 270,
+    239, 208, 177, 146, 115, 84,  53,  22,  488, 457, 426, 395, 364, 333, 302,
+    271, 240, 209, 178, 147, 116, 85,  54,  23,  489, 458, 427, 396, 365, 334,
+    303, 272, 241, 210, 179, 148, 117, 86,  55,  24,  490, 459, 428, 397, 366,
+    335, 304, 273, 242, 211, 180, 149, 118, 87,  56,  25,  491, 460, 429, 398,
+    367, 336, 305, 274, 243, 212, 181, 150, 119, 88,  57,  26,  492, 461, 430,
+    399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89,  58,  27,  493, 462,
+    431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90,  59,  28,  494,
+    463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91,  60,  29,
+    495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92,  61,
+    30,  496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93,
+    62,  31,  497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125,
+    94,  63,  498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126,
+    95,  499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500,
+    469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408,
+    377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285,
+    254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411,
+    380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413,
+    382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510,
+    479, 511};
+
+constexpr uint16_t kDefaultScan32x32[1024] = {
+    0,    1,    32,   64,   33,   2,   3,    34,   65,   96,   128,  97,  66,
+    35,   4,    5,    36,   67,   98,  129,  160,  192,  161,  130,  99,  68,
+    37,   6,    7,    38,   69,   100, 131,  162,  193,  224,  256,  225, 194,
+    163,  132,  101,  70,   39,   8,   9,    40,   71,   102,  133,  164, 195,
+    226,  257,  288,  320,  289,  258, 227,  196,  165,  134,  103,  72,  41,
+    10,   11,   42,   73,   104,  135, 166,  197,  228,  259,  290,  321, 352,
+    384,  353,  322,  291,  260,  229, 198,  167,  136,  105,  74,   43,  12,
+    13,   44,   75,   106,  137,  168, 199,  230,  261,  292,  323,  354, 385,
+    416,  448,  417,  386,  355,  324, 293,  262,  231,  200,  169,  138, 107,
+    76,   45,   14,   15,   46,   77,  108,  139,  170,  201,  232,  263, 294,
+    325,  356,  387,  418,  449,  480, 512,  481,  450,  419,  388,  357, 326,
+    295,  264,  233,  202,  171,  140, 109,  78,   47,   16,   17,   48,  79,
+    110,  141,  172,  203,  234,  265, 296,  327,  358,  389,  420,  451, 482,
+    513,  544,  576,  545,  514,  483, 452,  421,  390,  359,  328,  297, 266,
+    235,  204,  173,  142,  111,  80,  49,   18,   19,   50,   81,   112, 143,
+    174,  205,  236,  267,  298,  329, 360,  391,  422,  453,  484,  515, 546,
+    577,  608,  640,  609,  578,  547, 516,  485,  454,  423,  392,  361, 330,
+    299,  268,  237,  206,  175,  144, 113,  82,   51,   20,   21,   52,  83,
+    114,  145,  176,  207,  238,  269, 300,  331,  362,  393,  424,  455, 486,
+    517,  548,  579,  610,  641,  672, 704,  673,  642,  611,  580,  549, 518,
+    487,  456,  425,  394,  363,  332, 301,  270,  239,  208,  177,  146, 115,
+    84,   53,   22,   23,   54,   85,  116,  147,  178,  209,  240,  271, 302,
+    333,  364,  395,  426,  457,  488, 519,  550,  581,  612,  643,  674, 705,
+    736,  768,  737,  706,  675,  644, 613,  582,  551,  520,  489,  458, 427,
+    396,  365,  334,  303,  272,  241, 210,  179,  148,  117,  86,   55,  24,
+    25,   56,   87,   118,  149,  180, 211,  242,  273,  304,  335,  366, 397,
+    428,  459,  490,  521,  552,  583, 614,  645,  676,  707,  738,  769, 800,
+    832,  801,  770,  739,  708,  677, 646,  615,  584,  553,  522,  491, 460,
+    429,  398,  367,  336,  305,  274, 243,  212,  181,  150,  119,  88,  57,
+    26,   27,   58,   89,   120,  151, 182,  213,  244,  275,  306,  337, 368,
+    399,  430,  461,  492,  523,  554, 585,  616,  647,  678,  709,  740, 771,
+    802,  833,  864,  896,  865,  834, 803,  772,  741,  710,  679,  648, 617,
+    586,  555,  524,  493,  462,  431, 400,  369,  338,  307,  276,  245, 214,
+    183,  152,  121,  90,   59,   28,  29,   60,   91,   122,  153,  184, 215,
+    246,  277,  308,  339,  370,  401, 432,  463,  494,  525,  556,  587, 618,
+    649,  680,  711,  742,  773,  804, 835,  866,  897,  928,  960,  929, 898,
+    867,  836,  805,  774,  743,  712, 681,  650,  619,  588,  557,  526, 495,
+    464,  433,  402,  371,  340,  309, 278,  247,  216,  185,  154,  123, 92,
+    61,   30,   31,   62,   93,   124, 155,  186,  217,  248,  279,  310, 341,
+    372,  403,  434,  465,  496,  527, 558,  589,  620,  651,  682,  713, 744,
+    775,  806,  837,  868,  899,  930, 961,  992,  993,  962,  931,  900, 869,
+    838,  807,  776,  745,  714,  683, 652,  621,  590,  559,  528,  497, 466,
+    435,  404,  373,  342,  311,  280, 249,  218,  187,  156,  125,  94,  63,
+    95,   126,  157,  188,  219,  250, 281,  312,  343,  374,  405,  436, 467,
+    498,  529,  560,  591,  622,  653, 684,  715,  746,  777,  808,  839, 870,
+    901,  932,  963,  994,  995,  964, 933,  902,  871,  840,  809,  778, 747,
+    716,  685,  654,  623,  592,  561, 530,  499,  468,  437,  406,  375, 344,
+    313,  282,  251,  220,  189,  158, 127,  159,  190,  221,  252,  283, 314,
+    345,  376,  407,  438,  469,  500, 531,  562,  593,  624,  655,  686, 717,
+    748,  779,  810,  841,  872,  903, 934,  965,  996,  997,  966,  935, 904,
+    873,  842,  811,  780,  749,  718, 687,  656,  625,  594,  563,  532, 501,
+    470,  439,  408,  377,  346,  315, 284,  253,  222,  191,  223,  254, 285,
+    316,  347,  378,  409,  440,  471, 502,  533,  564,  595,  626,  657, 688,
+    719,  750,  781,  812,  843,  874, 905,  936,  967,  998,  999,  968, 937,
+    906,  875,  844,  813,  782,  751, 720,  689,  658,  627,  596,  565, 534,
+    503,  472,  441,  410,  379,  348, 317,  286,  255,  287,  318,  349, 380,
+    411,  442,  473,  504,  535,  566, 597,  628,  659,  690,  721,  752, 783,
+    814,  845,  876,  907,  938,  969, 1000, 1001, 970,  939,  908,  877, 846,
+    815,  784,  753,  722,  691,  660, 629,  598,  567,  536,  505,  474, 443,
+    412,  381,  350,  319,  351,  382, 413,  444,  475,  506,  537,  568, 599,
+    630,  661,  692,  723,  754,  785, 816,  847,  878,  909,  940,  971, 1002,
+    1003, 972,  941,  910,  879,  848, 817,  786,  755,  724,  693,  662, 631,
+    600,  569,  538,  507,  476,  445, 414,  383,  415,  446,  477,  508, 539,
+    570,  601,  632,  663,  694,  725, 756,  787,  818,  849,  880,  911, 942,
+    973,  1004, 1005, 974,  943,  912, 881,  850,  819,  788,  757,  726, 695,
+    664,  633,  602,  571,  540,  509, 478,  447,  479,  510,  541,  572, 603,
+    634,  665,  696,  727,  758,  789, 820,  851,  882,  913,  944,  975, 1006,
+    1007, 976,  945,  914,  883,  852, 821,  790,  759,  728,  697,  666, 635,
+    604,  573,  542,  511,  543,  574, 605,  636,  667,  698,  729,  760, 791,
+    822,  853,  884,  915,  946,  977, 1008, 1009, 978,  947,  916,  885, 854,
+    823,  792,  761,  730,  699,  668, 637,  606,  575,  607,  638,  669, 700,
+    731,  762,  793,  824,  855,  886, 917,  948,  979,  1010, 1011, 980, 949,
+    918,  887,  856,  825,  794,  763, 732,  701,  670,  639,  671,  702, 733,
+    764,  795,  826,  857,  888,  919, 950,  981,  1012, 1013, 982,  951, 920,
+    889,  858,  827,  796,  765,  734, 703,  735,  766,  797,  828,  859, 890,
+    921,  952,  983,  1014, 1015, 984, 953,  922,  891,  860,  829,  798, 767,
+    799,  830,  861,  892,  923,  954, 985,  1016, 1017, 986,  955,  924, 893,
+    862,  831,  863,  894,  925,  956, 987,  1018, 1019, 988,  957,  926, 895,
+    927,  958,  989,  1020, 1021, 990, 959,  991,  1022, 1023};
+
+constexpr uint16_t kDefaultScan4x16[64] = {
+    0,  1,  4,  2,  5,  8,  3,  6,  9,  12, 7,  10, 13, 16, 11, 14,
+    17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+    33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+    49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63};
+
+constexpr uint16_t kColumnScan4x16[64] = {
+    0, 4, 8,  12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+    1, 5, 9,  13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+    2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+    3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63};
+
+constexpr uint16_t kRowScan4x16[64] = {
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan16x4[64] = {
+    0,  16, 1,  32, 17, 2,  48, 33, 18, 3,  49, 34, 19, 4,  50, 35,
+    20, 5,  51, 36, 21, 6,  52, 37, 22, 7,  53, 38, 23, 8,  54, 39,
+    24, 9,  55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43,
+    28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63};
+
+constexpr uint16_t kColumnScan16x4[64] = {
+    0,  16, 32, 48, 1,  17, 33, 49, 2,  18, 34, 50, 3,  19, 35, 51,
+    4,  20, 36, 52, 5,  21, 37, 53, 6,  22, 38, 54, 7,  23, 39, 55,
+    8,  24, 40, 56, 9,  25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+    12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63};
+
+constexpr uint16_t kRowScan16x4[64] = {
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan8x32[256] = {
+    0,   1,   8,   2,   9,   16,  3,   10,  17,  24,  4,   11,  18,  25,  32,
+    5,   12,  19,  26,  33,  40,  6,   13,  20,  27,  34,  41,  48,  7,   14,
+    21,  28,  35,  42,  49,  56,  15,  22,  29,  36,  43,  50,  57,  64,  23,
+    30,  37,  44,  51,  58,  65,  72,  31,  38,  45,  52,  59,  66,  73,  80,
+    39,  46,  53,  60,  67,  74,  81,  88,  47,  54,  61,  68,  75,  82,  89,
+    96,  55,  62,  69,  76,  83,  90,  97,  104, 63,  70,  77,  84,  91,  98,
+    105, 112, 71,  78,  85,  92,  99,  106, 113, 120, 79,  86,  93,  100, 107,
+    114, 121, 128, 87,  94,  101, 108, 115, 122, 129, 136, 95,  102, 109, 116,
+    123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125,
+    132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134,
+    141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143,
+    150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200,
+    159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209,
+    216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218,
+    225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227,
+    234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243,
+    250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254,
+    255};
+
+constexpr uint16_t kDefaultScan32x8[256] = {
+    0,   32,  1,   64,  33,  2,   96,  65,  34,  3,   128, 97,  66,  35,  4,
+    160, 129, 98,  67,  36,  5,   192, 161, 130, 99,  68,  37,  6,   224, 193,
+    162, 131, 100, 69,  38,  7,   225, 194, 163, 132, 101, 70,  39,  8,   226,
+    195, 164, 133, 102, 71,  40,  9,   227, 196, 165, 134, 103, 72,  41,  10,
+    228, 197, 166, 135, 104, 73,  42,  11,  229, 198, 167, 136, 105, 74,  43,
+    12,  230, 199, 168, 137, 106, 75,  44,  13,  231, 200, 169, 138, 107, 76,
+    45,  14,  232, 201, 170, 139, 108, 77,  46,  15,  233, 202, 171, 140, 109,
+    78,  47,  16,  234, 203, 172, 141, 110, 79,  48,  17,  235, 204, 173, 142,
+    111, 80,  49,  18,  236, 205, 174, 143, 112, 81,  50,  19,  237, 206, 175,
+    144, 113, 82,  51,  20,  238, 207, 176, 145, 114, 83,  52,  21,  239, 208,
+    177, 146, 115, 84,  53,  22,  240, 209, 178, 147, 116, 85,  54,  23,  241,
+    210, 179, 148, 117, 86,  55,  24,  242, 211, 180, 149, 118, 87,  56,  25,
+    243, 212, 181, 150, 119, 88,  57,  26,  244, 213, 182, 151, 120, 89,  58,
+    27,  245, 214, 183, 152, 121, 90,  59,  28,  246, 215, 184, 153, 122, 91,
+    60,  29,  247, 216, 185, 154, 123, 92,  61,  30,  248, 217, 186, 155, 124,
+    93,  62,  31,  249, 218, 187, 156, 125, 94,  63,  250, 219, 188, 157, 126,
+    95,  251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223,
+    255};
+
+// Section 5.11.41 (implemented as a simple lookup by transform class and
+// transform size).
+const uint16_t* kScan[3][kNumTransformSizes] = {
+    // kTransformClass2D
+    {kDefaultScan4x4, kDefaultScan4x8, kDefaultScan4x16, kDefaultScan8x4,
+     kDefaultScan8x8, kDefaultScan8x16, kDefaultScan8x32, kDefaultScan16x4,
+     kDefaultScan16x8, kDefaultScan16x16, kDefaultScan16x32, kDefaultScan16x32,
+     kDefaultScan32x8, kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32,
+     kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32},
+    // kTransformClassHorizontal
+    {kColumnScan4x4, kColumnScan4x8, kColumnScan4x16, kColumnScan8x4,
+     kColumnScan8x8, kColumnScan8x16, kColumnScan16x4, kColumnScan16x4,
+     kColumnScan16x8, kColumnScan16x16, kColumnScan16x4, kDefaultScan16x32,
+     kColumnScan16x4, kColumnScan16x4, kColumnScan16x4, kDefaultScan32x32,
+     kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32},
+    // kTransformClassVertical
+    {kRowScan4x4, kRowScan4x8, kRowScan4x16, kRowScan8x4, kRowScan8x8,
+     kRowScan8x16, kRowScan16x4, kRowScan16x4, kRowScan16x8, kRowScan16x16,
+     kRowScan16x4, kDefaultScan16x32, kRowScan16x4, kRowScan16x4, kRowScan16x4,
+     kDefaultScan32x32, kDefaultScan32x16, kDefaultScan32x32,
+     kDefaultScan32x32}};
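+
+// An illustrative sketch (not part of the tables above): visiting a 4x4
+// block's coefficients in scan order via the kScan lookup.
+//
+//   const uint16_t* const scan = kScan[kTransformClass2D][kTransformSize4x4];
+//   for (int i = 0; i < 16; ++i) {
+//     const int position = scan[i];  // Raster index of the i-th coefficient.
+//   }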
diff --git a/src/scan_test.cc b/src/scan_test.cc
new file mode 100644 (file)
index 0000000..065ca03
--- /dev/null
@@ -0,0 +1,85 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <tuple>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/scan_tables.inc"
+
+class ScanOrderTest
+    : public testing::TestWithParam<std::tuple<TransformClass, TransformSize>> {
+ public:
+  ScanOrderTest() = default;
+  ScanOrderTest(const ScanOrderTest&) = delete;
+  ScanOrderTest& operator=(const ScanOrderTest&) = delete;
+  ~ScanOrderTest() override = default;
+
+ protected:
+  TransformClass tx_class_ = std::get<0>(GetParam());
+  TransformSize tx_size_ = std::get<1>(GetParam());
+};
+
+TEST_P(ScanOrderTest, AllIndicesAreScannedExactlyOnce) {
+  const int tx_width = kTransformWidth[tx_size_];
+  const int tx_height = kTransformHeight[tx_size_];
+  int num_indices;
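+  // 2D-class blocks and any block with a dimension of 64 scan the region
+  // clamped to 32x32; the remaining 1D-class blocks scan only the first 64
+  // coefficients when a dimension exceeds 16, mirroring the kScan table.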
+  if (tx_class_ == kTransformClass2D || std::max(tx_width, tx_height) == 64) {
+    const int clamped_tx_width = std::min(32, tx_width);
+    const int clamped_tx_height = std::min(32, tx_height);
+    num_indices = clamped_tx_width * clamped_tx_height;
+  } else {
+    num_indices =
+        (std::max(tx_width, tx_height) > 16) ? 64 : tx_width * tx_height;
+  }
+  const uint16_t* const scan = kScan[tx_class_][tx_size_];
+  ASSERT_NE(scan, nullptr);
+  // Ensure that all the indices are scanned exactly once.
+  std::vector<int> scanned;
+  scanned.resize(num_indices);
+  for (int i = 0; i < num_indices; ++i) {
+    scanned[scan[i]]++;
+  }
+  EXPECT_THAT(scanned, testing::Each(1));
+}
+
+constexpr TransformClass kTestTransformClasses[] = {
+    kTransformClass2D, kTransformClassVertical, kTransformClassHorizontal};
+
+constexpr TransformSize kTestTransformSizes[] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+    kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(
+    C, ScanOrderTest,
+    testing::Combine(testing::ValuesIn(kTestTransformClasses),
+                     testing::ValuesIn(kTestTransformSizes)));
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/status_code.cc b/src/status_code.cc
new file mode 100644 (file)
index 0000000..34def08
--- /dev/null
@@ -0,0 +1,57 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/status_code.h"
+
+extern "C" {
+
+const char* Libgav1GetErrorString(Libgav1StatusCode status) {
+  switch (status) {
+    case kLibgav1StatusOk:
+      return "Success.";
+    case kLibgav1StatusUnknownError:
+      return "Unknown error.";
+    case kLibgav1StatusInvalidArgument:
+      return "Invalid function argument.";
+    case kLibgav1StatusOutOfMemory:
+      return "Memory allocation failure.";
+    case kLibgav1StatusResourceExhausted:
+      return "Ran out of a resource (other than memory).";
+    case kLibgav1StatusNotInitialized:
+      return "The object is not initialized.";
+    case kLibgav1StatusAlready:
+      return "An operation that can only be performed once has already been "
+             "performed.";
+    case kLibgav1StatusUnimplemented:
+      return "Not implemented.";
+    case kLibgav1StatusInternalError:
+      return "Internal error in libgav1.";
+    case kLibgav1StatusBitstreamError:
+      return "The bitstream is not encoded correctly or violates a bitstream "
+             "conformance requirement.";
+    case kLibgav1StatusTryAgain:
+      return "The operation is not allowed at the moment. Try again later.";
+    case kLibgav1StatusNothingToDequeue:
+      return "There are no enqueued frames, so there is nothing to dequeue. "
+             "Try enqueuing a frame before trying to dequeue again.";
+    // This switch statement does not have a default case. This way the compiler
+    // will warn if we neglect to update this function after adding a new value
+    // to the Libgav1StatusCode enum type.
+    case kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_:
+      break;
+  }
+  return "Unrecognized status code.";
+}
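+
+// For example (illustration only):
+//   const char* status = Libgav1GetErrorString(kLibgav1StatusOk);
+//   // status now points to "Success.".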
+
+}  // extern "C"
diff --git a/src/symbol_decoder_context.cc b/src/symbol_decoder_context.cc
new file mode 100644 (file)
index 0000000..26a281e
--- /dev/null
@@ -0,0 +1,322 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/symbol_decoder_context.h"
+
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/symbol_decoder_context_cdfs.inc"
+
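+// Maps the base quantizer index to one of the four contexts used below to
+// select the default coefficient CDF tables.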
+uint8_t GetQuantizerContext(int base_quantizer_index) {
+  if (base_quantizer_index <= 20) return 0;
+  if (base_quantizer_index <= 60) return 1;
+  if (base_quantizer_index <= 120) return 2;
+  return 3;
+}
+
+// Reset*Counters() are helper functions to reset the CDF arrays where the
+// counters are not in the last element of the innermost dimension.
+
+void ResetPartitionCounters(SymbolDecoderContext* const context) {
+  int block_size_log2 = k4x4WidthLog2[kBlock8x8];
+  for (auto& d1 : context->partition_cdf) {
+    const int cdf_size =
+        SymbolDecoderContext::PartitionCdfSize(block_size_log2++);
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+  }
+}
+
+void ResetPaletteColorIndexCounters(SymbolDecoderContext* const context) {
+  for (auto& d1 : context->palette_color_index_cdf) {
+    int cdf_size = kMinPaletteSize;
+    for (auto& d2 : d1) {
+      for (auto& d3 : d2) {
+        d3[cdf_size] = 0;
+      }
+      ++cdf_size;
+    }
+  }
+}
+
+void ResetTxTypeCounters(SymbolDecoderContext* const context) {
+  int set_index = kTransformSetIntra1;
+  for (auto& d1 : context->intra_tx_type_cdf) {
+    const int cdf_size = kNumTransformTypesInSet[set_index++];
+    for (auto& d2 : d1) {
+      for (auto& d3 : d2) {
+        d3[cdf_size] = 0;
+      }
+    }
+  }
+  for (auto& d1 : context->inter_tx_type_cdf) {
+    const int cdf_size = kNumTransformTypesInSet[set_index++];
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+  }
+}
+
+void ResetTxDepthCounters(SymbolDecoderContext* const context) {
+  int delta = 1;
+  for (auto& d1 : context->tx_depth_cdf) {
+    const int cdf_size = kMaxTxDepthSymbolCount - delta;
+    delta = 0;
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+  }
+}
+
+void ResetUVModeCounters(SymbolDecoderContext* const context) {
+  int cdf_size = kIntraPredictionModesUV - 1;
+  for (auto& d1 : context->uv_mode_cdf) {
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+    ++cdf_size;
+  }
+}
+
+}  // namespace
+
+#define CDF_COPY(source, destination)                       \
+  static_assert(sizeof(source) == sizeof(destination), ""); \
+  memcpy(destination, source, sizeof(source))
+
+void SymbolDecoderContext::Initialize(int base_quantizer_index) {
+  CDF_COPY(kDefaultPartitionCdf, partition_cdf);
+  CDF_COPY(kDefaultSkipCdf, skip_cdf);
+  CDF_COPY(kDefaultSkipModeCdf, skip_mode_cdf);
+  CDF_COPY(kDefaultSegmentIdCdf, segment_id_cdf);
+  CDF_COPY(kDefaultUsePredictedSegmentIdCdf, use_predicted_segment_id_cdf);
+  CDF_COPY(kDefaultDeltaQCdf, delta_q_cdf);
+  CDF_COPY(kDefaultDeltaQCdf, delta_lf_cdf);
+  for (auto& delta_lf_multi_cdf_entry : delta_lf_multi_cdf) {
+    CDF_COPY(kDefaultDeltaQCdf, delta_lf_multi_cdf_entry);
+  }
+  CDF_COPY(kDefaultIntraBlockCopyCdf, intra_block_copy_cdf);
+  CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf);
+  CDF_COPY(kDefaultYModeCdf, y_mode_cdf);
+  CDF_COPY(kDefaultAngleDeltaCdf, angle_delta_cdf);
+  CDF_COPY(kDefaultUVModeCdf, uv_mode_cdf);
+  CDF_COPY(kDefaultCflAlphaSignsCdf, cfl_alpha_signs_cdf);
+  CDF_COPY(kDefaultCflAlphaCdf, cfl_alpha_cdf);
+  CDF_COPY(kDefaultUseFilterIntraCdf, use_filter_intra_cdf);
+  CDF_COPY(kDefaultFilterIntraModeCdf, filter_intra_mode_cdf);
+  CDF_COPY(kDefaultTxDepthCdf, tx_depth_cdf);
+  CDF_COPY(kDefaultTxSplitCdf, tx_split_cdf);
+  CDF_COPY(kDefaultInterTxTypeCdf, inter_tx_type_cdf);
+  CDF_COPY(kDefaultIntraTxTypeCdf, intra_tx_type_cdf);
+  CDF_COPY(kDefaultRestorationTypeCdf, restoration_type_cdf);
+  CDF_COPY(kDefaultUseWienerCdf, use_wiener_cdf);
+  CDF_COPY(kDefaultUseSgrProjCdf, use_sgrproj_cdf);
+  CDF_COPY(kDefaultHasPaletteYCdf, has_palette_y_cdf);
+  CDF_COPY(kDefaultPaletteYSizeCdf, palette_y_size_cdf);
+  CDF_COPY(kDefaultHasPaletteUVCdf, has_palette_uv_cdf);
+  CDF_COPY(kDefaultPaletteUVSizeCdf, palette_uv_size_cdf);
+  CDF_COPY(kDefaultPaletteColorIndexCdf, palette_color_index_cdf);
+  CDF_COPY(kDefaultIsInterCdf, is_inter_cdf);
+  CDF_COPY(kDefaultUseCompoundReferenceCdf, use_compound_reference_cdf);
+  CDF_COPY(kDefaultCompoundReferenceTypeCdf, compound_reference_type_cdf);
+  CDF_COPY(kDefaultCompoundReferenceCdf, compound_reference_cdf);
+  CDF_COPY(kDefaultCompoundBackwardReferenceCdf,
+           compound_backward_reference_cdf);
+  CDF_COPY(kDefaultSingleReferenceCdf, single_reference_cdf);
+  CDF_COPY(kDefaultCompoundPredictionModeCdf, compound_prediction_mode_cdf);
+  CDF_COPY(kDefaultNewMvCdf, new_mv_cdf);
+  CDF_COPY(kDefaultZeroMvCdf, zero_mv_cdf);
+  CDF_COPY(kDefaultReferenceMvCdf, reference_mv_cdf);
+  CDF_COPY(kDefaultRefMvIndexCdf, ref_mv_index_cdf);
+  CDF_COPY(kDefaultIsInterIntraCdf, is_inter_intra_cdf);
+  CDF_COPY(kDefaultInterIntraModeCdf, inter_intra_mode_cdf);
+  CDF_COPY(kDefaultIsWedgeInterIntraCdf, is_wedge_inter_intra_cdf);
+  CDF_COPY(kDefaultWedgeIndexCdf, wedge_index_cdf);
+  CDF_COPY(kDefaultUseObmcCdf, use_obmc_cdf);
+  CDF_COPY(kDefaultMotionModeCdf, motion_mode_cdf);
+  CDF_COPY(kDefaultIsExplicitCompoundTypeCdf, is_explicit_compound_type_cdf);
+  CDF_COPY(kDefaultIsCompoundTypeAverageCdf, is_compound_type_average_cdf);
+  CDF_COPY(kDefaultCompoundTypeCdf, compound_type_cdf);
+  CDF_COPY(kDefaultInterpolationFilterCdf, interpolation_filter_cdf);
+  for (int i = 0; i < kMvContexts; ++i) {
+    CDF_COPY(kDefaultMvJointCdf, mv_joint_cdf[i]);
+    for (int j = 0; j < kNumMvComponents; ++j) {
+      CDF_COPY(kDefaultMvSignCdf, mv_sign_cdf[i][j]);
+      CDF_COPY(kDefaultMvClassCdf, mv_class_cdf[i][j]);
+      CDF_COPY(kDefaultMvClass0BitCdf, mv_class0_bit_cdf[i][j]);
+      CDF_COPY(kDefaultMvClass0FractionCdf, mv_class0_fraction_cdf[i][j]);
+      CDF_COPY(kDefaultMvClass0HighPrecisionCdf,
+               mv_class0_high_precision_cdf[i][j]);
+      CDF_COPY(kDefaultMvBitCdf, mv_bit_cdf[i][j]);
+      CDF_COPY(kDefaultMvFractionCdf, mv_fraction_cdf[i][j]);
+      CDF_COPY(kDefaultMvHighPrecisionCdf, mv_high_precision_cdf[i][j]);
+    }
+  }
+  const int quantizer_context = GetQuantizerContext(base_quantizer_index);
+  CDF_COPY(kDefaultAllZeroCdf[quantizer_context], all_zero_cdf);
+  CDF_COPY(kDefaultEobPt16Cdf[quantizer_context], eob_pt_16_cdf);
+  CDF_COPY(kDefaultEobPt32Cdf[quantizer_context], eob_pt_32_cdf);
+  CDF_COPY(kDefaultEobPt64Cdf[quantizer_context], eob_pt_64_cdf);
+  CDF_COPY(kDefaultEobPt128Cdf[quantizer_context], eob_pt_128_cdf);
+  CDF_COPY(kDefaultEobPt256Cdf[quantizer_context], eob_pt_256_cdf);
+  CDF_COPY(kDefaultEobPt512Cdf[quantizer_context], eob_pt_512_cdf);
+  CDF_COPY(kDefaultEobPt1024Cdf[quantizer_context], eob_pt_1024_cdf);
+  CDF_COPY(kDefaultEobExtraCdf[quantizer_context], eob_extra_cdf);
+  CDF_COPY(kDefaultCoeffBaseEobCdf[quantizer_context], coeff_base_eob_cdf);
+  CDF_COPY(kDefaultCoeffBaseCdf[quantizer_context], coeff_base_cdf);
+  CDF_COPY(kDefaultCoeffBaseRangeCdf[quantizer_context], coeff_base_range_cdf);
+  CDF_COPY(kDefaultDcSignCdf[quantizer_context], dc_sign_cdf);
+}
+
+void SymbolDecoderContext::ResetIntraFrameYModeCdf() {
+  CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf);
+}
+
+#undef CDF_COPY
+
+// These macros set the last element in the innermost dimension of the array
+// to zero.
+#define RESET_COUNTER_1D(array)                              \
+  do {                                                       \
+    (array)[std::extent<decltype(array), 0>::value - 1] = 0; \
+  } while (false)
+
+#define RESET_COUNTER_2D(array)                           \
+  do {                                                    \
+    for (auto& d1 : (array)) {                            \
+      d1[std::extent<decltype(array), 1>::value - 1] = 0; \
+    }                                                     \
+  } while (false)
+
+#define RESET_COUNTER_3D(array)                             \
+  do {                                                      \
+    for (auto& d1 : (array)) {                              \
+      for (auto& d2 : d1) {                                 \
+        d2[std::extent<decltype(array), 2>::value - 1] = 0; \
+      }                                                     \
+    }                                                       \
+  } while (false)
+
+#define RESET_COUNTER_4D(array)                               \
+  do {                                                        \
+    for (auto& d1 : (array)) {                                \
+      for (auto& d2 : d1) {                                   \
+        for (auto& d3 : d2) {                                 \
+          d3[std::extent<decltype(array), 3>::value - 1] = 0; \
+        }                                                     \
+      }                                                       \
+    }                                                         \
+  } while (false)
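+
+// As an illustration (this is just the expansion of the macro above, not
+// additional behavior), RESET_COUNTER_2D(skip_cdf), with skip_cdf declared
+// as uint16_t skip_cdf[kSkipContexts][kBooleanFieldCdfSize], becomes:
+//   for (auto& d1 : skip_cdf) {
+//     d1[kBooleanFieldCdfSize - 1] = 0;  // std::extent<..., 1>::value == 3.
+//   }
+// Only the trailing symbol-counter slot of each innermost row is cleared;
+// the probability entries themselves are left untouched.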
+
+void SymbolDecoderContext::ResetCounters() {
+  ResetPartitionCounters(this);
+  RESET_COUNTER_2D(segment_id_cdf);
+  RESET_COUNTER_2D(use_predicted_segment_id_cdf);
+  RESET_COUNTER_2D(skip_cdf);
+  RESET_COUNTER_2D(skip_mode_cdf);
+  RESET_COUNTER_1D(delta_q_cdf);
+  RESET_COUNTER_1D(delta_lf_cdf);
+  RESET_COUNTER_2D(delta_lf_multi_cdf);
+  RESET_COUNTER_1D(intra_block_copy_cdf);
+  RESET_COUNTER_3D(intra_frame_y_mode_cdf);
+  RESET_COUNTER_2D(y_mode_cdf);
+  RESET_COUNTER_2D(angle_delta_cdf);
+  ResetUVModeCounters(this);
+  RESET_COUNTER_1D(cfl_alpha_signs_cdf);
+  RESET_COUNTER_2D(cfl_alpha_cdf);
+  RESET_COUNTER_2D(use_filter_intra_cdf);
+  RESET_COUNTER_1D(filter_intra_mode_cdf);
+  ResetTxDepthCounters(this);
+  RESET_COUNTER_2D(tx_split_cdf);
+  RESET_COUNTER_3D(all_zero_cdf);
+  ResetTxTypeCounters(this);
+  RESET_COUNTER_3D(eob_pt_16_cdf);
+  RESET_COUNTER_3D(eob_pt_32_cdf);
+  RESET_COUNTER_3D(eob_pt_64_cdf);
+  RESET_COUNTER_3D(eob_pt_128_cdf);
+  RESET_COUNTER_3D(eob_pt_256_cdf);
+  RESET_COUNTER_2D(eob_pt_512_cdf);
+  RESET_COUNTER_2D(eob_pt_1024_cdf);
+  RESET_COUNTER_4D(eob_extra_cdf);
+  RESET_COUNTER_4D(coeff_base_eob_cdf);
+  RESET_COUNTER_4D(coeff_base_cdf);
+  RESET_COUNTER_4D(coeff_base_range_cdf);
+  RESET_COUNTER_3D(dc_sign_cdf);
+  RESET_COUNTER_1D(restoration_type_cdf);
+  RESET_COUNTER_1D(use_wiener_cdf);
+  RESET_COUNTER_1D(use_sgrproj_cdf);
+  RESET_COUNTER_3D(has_palette_y_cdf);
+  RESET_COUNTER_2D(palette_y_size_cdf);
+  RESET_COUNTER_2D(has_palette_uv_cdf);
+  RESET_COUNTER_2D(palette_uv_size_cdf);
+  ResetPaletteColorIndexCounters(this);
+  RESET_COUNTER_2D(is_inter_cdf);
+  RESET_COUNTER_2D(use_compound_reference_cdf);
+  RESET_COUNTER_2D(compound_reference_type_cdf);
+  RESET_COUNTER_4D(compound_reference_cdf);
+  RESET_COUNTER_3D(compound_backward_reference_cdf);
+  RESET_COUNTER_3D(single_reference_cdf);
+  RESET_COUNTER_2D(compound_prediction_mode_cdf);
+  RESET_COUNTER_2D(new_mv_cdf);
+  RESET_COUNTER_2D(zero_mv_cdf);
+  RESET_COUNTER_2D(reference_mv_cdf);
+  RESET_COUNTER_2D(ref_mv_index_cdf);
+  RESET_COUNTER_2D(is_inter_intra_cdf);
+  RESET_COUNTER_2D(inter_intra_mode_cdf);
+  RESET_COUNTER_2D(is_wedge_inter_intra_cdf);
+  RESET_COUNTER_2D(wedge_index_cdf);
+  RESET_COUNTER_2D(use_obmc_cdf);
+  RESET_COUNTER_2D(motion_mode_cdf);
+  RESET_COUNTER_2D(is_explicit_compound_type_cdf);
+  RESET_COUNTER_2D(is_compound_type_average_cdf);
+  RESET_COUNTER_2D(compound_type_cdf);
+  RESET_COUNTER_2D(interpolation_filter_cdf);
+  RESET_COUNTER_2D(mv_joint_cdf);
+  RESET_COUNTER_3D(mv_sign_cdf);
+  RESET_COUNTER_3D(mv_class_cdf);
+  RESET_COUNTER_3D(mv_class0_bit_cdf);
+  RESET_COUNTER_4D(mv_class0_fraction_cdf);
+  RESET_COUNTER_3D(mv_class0_high_precision_cdf);
+  RESET_COUNTER_4D(mv_bit_cdf);
+  RESET_COUNTER_3D(mv_fraction_cdf);
+  RESET_COUNTER_3D(mv_high_precision_cdf);
+}
+
+#undef RESET_COUNTER_1D
+#undef RESET_COUNTER_2D
+#undef RESET_COUNTER_3D
+#undef RESET_COUNTER_4D
+
+int SymbolDecoderContext::PartitionCdfSize(int block_size_log2) {
+  assert(block_size_log2 > 0);
+  assert(block_size_log2 < 6);
+
+  switch (block_size_log2) {
+    case 1:
+      return kPartitionSplit + 1;
+    case 5:
+      return kPartitionVerticalWithRightSplit + 1;
+    default:
+      return kMaxPartitionTypes;
+  }
+}
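+
+// To make the mapping above concrete: block_size_log2 == 1 corresponds to
+// width-8 blocks, which only allow the None/Horizontal/Vertical/Split
+// partitions, so the CDF covers kPartitionSplit + 1 = 4 symbols;
+// block_size_log2 == 5 corresponds to width-128 blocks, which disallow the
+// 4:1 and 1:4 partitions, leaving kPartitionVerticalWithRightSplit + 1 = 8
+// symbols; all other widths use the full set of kMaxPartitionTypes (10)
+// partition types.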
+
+}  // namespace libgav1
diff --git a/src/symbol_decoder_context.h b/src/symbol_decoder_context.h
new file mode 100644
index 0000000..1bea76c
--- /dev/null
+++ b/src/symbol_decoder_context.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+#define LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum {
+  kPartitionContexts = 4,
+  kSegmentIdContexts = 3,
+  kUsePredictedSegmentIdContexts = 3,
+  kSkipContexts = 3,
+  kSkipModeContexts = 3,
+  kBooleanFieldCdfSize = 3,
+  kDeltaSymbolCount = 4,  // Used for both delta_q and delta_lf.
+  kIntraModeContexts = 5,
+  kYModeContexts = 4,
+  kAngleDeltaSymbolCount = 2 * kMaxAngleDelta + 1,
+  kCflAlphaSignsSymbolCount = 8,
+  kCflAlphaContexts = 6,
+  kCflAlphaSymbolCount = 16,
+  kTxDepthContexts = 3,
+  kMaxTxDepthSymbolCount = 3,
+  kTxSplitContexts = 21,
+  kCoefficientQuantizerContexts = 4,
+  kNumSquareTransformSizes = 5,
+  kAllZeroContexts = 13,
+  kNumExtendedTransformSizes = 4,
+  kEobPtContexts = 2,
+  kEobPt16SymbolCount = 5,
+  kEobPt32SymbolCount = 6,
+  kEobPt64SymbolCount = 7,
+  kEobPt128SymbolCount = 8,
+  kEobPt256SymbolCount = 9,
+  kEobPt512SymbolCount = 10,
+  kEobPt1024SymbolCount = 11,
+  kEobExtraContexts = 9,
+  kCoeffBaseEobContexts = 4,
+  kCoeffBaseEobSymbolCount = 3,
+  kCoeffBaseContexts = 42,
+  kCoeffBaseSymbolCount = 4,
+  kCoeffBaseRangeContexts = 21,
+  kCoeffBaseRangeSymbolCount = 4,
+  kDcSignContexts = 3,
+  kPaletteBlockSizeContexts = 7,
+  kPaletteYModeContexts = 3,
+  kPaletteUVModeContexts = 2,
+  kPaletteSizeSymbolCount = 7,
+  kPaletteColorIndexContexts = 5,
+  kPaletteColorIndexSymbolCount = 8,
+  kIsInterContexts = 4,
+  kUseCompoundReferenceContexts = 5,
+  kCompoundReferenceTypeContexts = 5,
+  kReferenceContexts = 3,
+  kCompoundPredictionModeContexts = 8,
+  kNewMvContexts = 6,
+  kZeroMvContexts = 2,
+  kReferenceMvContexts = 6,
+  kRefMvIndexContexts = 3,
+  kInterIntraContexts = 3,
+  kWedgeIndexSymbolCount = 16,
+  kIsExplicitCompoundTypeContexts = 6,
+  kIsCompoundTypeAverageContexts = 6,
+  kInterpolationFilterContexts = 16,
+  kMvContexts = 2,
+  kMvClassSymbolCount = 11,
+  kMvFractionSymbolCount = 4,
+  kMvBitSymbolCount = 10,
+  kNumMvComponents = 2,
+};  // anonymous enum
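+
+// Note on dimensions: each CDF array below has one more entry than the
+// number of symbols it codes. The extra trailing element is the symbol
+// counter that ResetCounters() clears, and the entry before it is always 0
+// in the default tables. kBooleanFieldCdfSize (3) is simply this convention
+// applied to two-symbol boolean fields.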
+
+struct SymbolDecoderContext {
+  SymbolDecoderContext() = default;
+  explicit SymbolDecoderContext(int base_quantizer_index) {
+    Initialize(base_quantizer_index);
+  }
+
+  void Initialize(int base_quantizer_index);
+
+  // Partition-related variables and functions.
+  static int PartitionCdfSize(int block_size_log2);
+
+  // Returns the CDF array index for inter_tx_type or intra_tx_type based on
+  // |tx_set|.
+  static int TxTypeIndex(TransformSet tx_set) {
+    assert(tx_set != kTransformSetDctOnly);
+    switch (tx_set) {
+      case kTransformSetInter1:
+      case kTransformSetIntra1:
+        return 0;
+      case kTransformSetInter2:
+      case kTransformSetIntra2:
+        return 1;
+      case kTransformSetInter3:
+        return 2;
+      default:
+        return -1;
+    }
+  }
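+
+  // For example (illustrative, not an upstream usage requirement): a reader
+  // of an inter transform type with tx_set == kTransformSetInter2 would
+  // index inter_tx_type_cdf[TxTypeIndex(tx_set)], i.e. index 1 of the 3
+  // leading entries of inter_tx_type_cdf declared below; the intra sets map
+  // to the 2 leading entries of intra_tx_type_cdf.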
+
+  // Resets the intra_frame_y_mode_cdf array to the default.
+  void ResetIntraFrameYModeCdf();
+
+  // Resets the symbol counters of all the CDF arrays to zero. The symbol
+  // counter is the last used element in the innermost dimension of each CDF
+  // array.
+  void ResetCounters();
+
+  // Note: kMaxAlignment allows aligned instructions to be used in the copies
+  // done in Initialize().
+  alignas(kMaxAlignment) uint16_t
+      partition_cdf[kBlockWidthCount][kPartitionContexts]
+                   [kMaxPartitionTypes + 1];
+  alignas(kMaxAlignment) uint16_t
+      segment_id_cdf[kSegmentIdContexts][kMaxSegments + 1];
+  alignas(kMaxAlignment) uint16_t
+      use_predicted_segment_id_cdf[kUsePredictedSegmentIdContexts]
+                                  [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t skip_cdf[kSkipContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      skip_mode_cdf[kSkipModeContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t delta_q_cdf[kDeltaSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t delta_lf_cdf[kDeltaSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      delta_lf_multi_cdf[kFrameLfCount][kDeltaSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t intra_block_copy_cdf[kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      intra_frame_y_mode_cdf[kIntraModeContexts][kIntraModeContexts]
+                            [kIntraPredictionModesY + 1];
+  alignas(kMaxAlignment) uint16_t
+      y_mode_cdf[kYModeContexts][kIntraPredictionModesY + 1];
+  alignas(kMaxAlignment) uint16_t
+      angle_delta_cdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      uv_mode_cdf[kBooleanSymbolCount][kIntraPredictionModesY]
+                 [kIntraPredictionModesUV + 1];
+  alignas(kMaxAlignment) uint16_t
+      cfl_alpha_signs_cdf[kCflAlphaSignsSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      cfl_alpha_cdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      use_filter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      filter_intra_mode_cdf[kNumFilterIntraPredictors + 1];
+  alignas(kMaxAlignment) uint16_t
+      tx_depth_cdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      tx_split_cdf[kTxSplitContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      all_zero_cdf[kNumSquareTransformSizes][kAllZeroContexts]
+                  [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      inter_tx_type_cdf[3][kNumExtendedTransformSizes][kNumTransformTypes + 1];
+  alignas(kMaxAlignment) uint16_t
+      intra_tx_type_cdf[2][kNumExtendedTransformSizes][kIntraPredictionModesY]
+                       [kNumTransformTypes + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_16_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt16SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_32_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt32SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_64_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt64SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_128_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt128SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_256_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt256SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_512_cdf[kNumPlaneTypes][kEobPt512SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_1024_cdf[kNumPlaneTypes][kEobPt1024SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_extra_cdf[kNumSquareTransformSizes][kNumPlaneTypes][kEobExtraContexts]
+                   [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      coeff_base_eob_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+                        [kCoeffBaseEobContexts][kCoeffBaseEobSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      coeff_base_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+                    [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      coeff_base_range_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+                          [kCoeffBaseRangeContexts]
+                          [kCoeffBaseRangeSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      dc_sign_cdf[kNumPlaneTypes][kDcSignContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      restoration_type_cdf[kRestorationTypeSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t use_wiener_cdf[kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t use_sgrproj_cdf[kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      has_palette_y_cdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
+                       [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      palette_y_size_cdf[kPaletteBlockSizeContexts]
+                        [kPaletteSizeSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      has_palette_uv_cdf[kPaletteUVModeContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      palette_uv_size_cdf[kPaletteBlockSizeContexts]
+                         [kPaletteSizeSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      palette_color_index_cdf[kNumPlaneTypes][kPaletteSizeSymbolCount]
+                             [kPaletteColorIndexContexts]
+                             [kPaletteColorIndexSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      is_inter_cdf[kIsInterContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      use_compound_reference_cdf[kUseCompoundReferenceContexts]
+                                [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      compound_reference_type_cdf[kCompoundReferenceTypeContexts]
+                                 [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      compound_reference_cdf[kNumCompoundReferenceTypes][kReferenceContexts][3]
+                            [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      compound_backward_reference_cdf[kReferenceContexts][2]
+                                     [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      single_reference_cdf[kReferenceContexts][6][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      compound_prediction_mode_cdf[kCompoundPredictionModeContexts]
+                                  [kNumCompoundInterPredictionModes + 1];
+  alignas(kMaxAlignment) uint16_t
+      new_mv_cdf[kNewMvContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      zero_mv_cdf[kZeroMvContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      reference_mv_cdf[kReferenceMvContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      ref_mv_index_cdf[kRefMvIndexContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      is_inter_intra_cdf[kInterIntraContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      inter_intra_mode_cdf[kInterIntraContexts][kNumInterIntraModes + 1];
+  alignas(kMaxAlignment) uint16_t
+      is_wedge_inter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      wedge_index_cdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      use_obmc_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      motion_mode_cdf[kMaxBlockSizes][kNumMotionModes + 1];
+  alignas(kMaxAlignment) uint16_t
+      is_explicit_compound_type_cdf[kIsExplicitCompoundTypeContexts]
+                                   [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      is_compound_type_average_cdf[kIsCompoundTypeAverageContexts]
+                                  [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      compound_type_cdf[kMaxBlockSizes]
+                       [kNumExplicitCompoundPredictionTypes + 1];
+  alignas(kMaxAlignment) uint16_t
+      interpolation_filter_cdf[kInterpolationFilterContexts]
+                              [kNumExplicitInterpolationFilters + 1];
+  alignas(kMaxAlignment) uint16_t
+      mv_joint_cdf[kMvContexts][kNumMvJointTypes + 1];
+  alignas(kMaxAlignment) uint16_t
+      mv_sign_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      mv_class_cdf[kMvContexts][kNumMvComponents][kMvClassSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      mv_class0_bit_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      mv_class0_fraction_cdf[kMvContexts][kNumMvComponents][kBooleanSymbolCount]
+                            [kMvFractionSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      mv_class0_high_precision_cdf[kMvContexts][kNumMvComponents]
+                                  [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      mv_bit_cdf[kMvContexts][kNumMvComponents][kMvBitSymbolCount]
+                [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t mv_fraction_cdf[kMvContexts][kNumMvComponents]
+                                                 [kMvFractionSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      mv_high_precision_cdf[kMvContexts][kNumMvComponents]
+                           [kBooleanFieldCdfSize];
+};
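+
+// Usage sketch (hypothetical caller code, not part of this header): a
+// decoder would typically build one context per frame from the frame's base
+// quantizer index, update the CDFs in place while decoding symbols, and
+// clear the counters before saving the context for use as a reference:
+//   SymbolDecoderContext context(base_quantizer_index);
+//   ...decode symbols...
+//   context.ResetCounters();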
+
+}  // namespace libgav1
+#endif  // LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
diff --git a/src/symbol_decoder_context_cdfs.inc b/src/symbol_decoder_context_cdfs.inc
new file mode 100644
index 0000000..509286f
--- /dev/null
+++ b/src/symbol_decoder_context_cdfs.inc
@@ -0,0 +1,2509 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the CDF constant
+// definitions from the symbol decoder context functions.
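+
+// Note on the representation (an observation from the tables, not upstream
+// documentation): each CDF is stored inverted, i.e. entry i holds 32768
+// minus the cumulative probability of symbols 0..i, so the values descend
+// and the entry for the final symbol is always 0. For example, the first
+// context of kDefaultSkipCdf below is {1097, 0, 0}: the first symbol has
+// probability (32768 - 1097) / 32768, about 96.7%, and the trailing 0 is
+// the symbol counter slot.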
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPartitionCdf
+    [kBlockWidthCount][kPartitionContexts][kMaxPartitionTypes + 1] = {
+        // width 8
+        {{13636, 7258, 2376, 0, 0},
+         {18840, 12913, 4228, 0, 0},
+         {20246, 9089, 4139, 0, 0},
+         {22872, 13985, 6915, 0, 0}},
+        // width 16
+        {{17171, 11839, 8197, 6062, 5104, 3947, 3167, 2197, 866, 0, 0},
+         {24843, 21725, 15983, 10298, 8797, 7725, 6117, 4067, 2934, 0, 0},
+         {27354, 19499, 17657, 12280, 10408, 8268, 7231, 6432, 651, 0, 0},
+         {30106, 26406, 24154, 11908, 9715, 7990, 6332, 4939, 1597, 0, 0}},
+        // width 32
+        {{14306, 11848, 9644, 5121, 4541, 3719, 3249, 2590, 1224, 0, 0},
+         {25079, 23708, 20712, 7776, 7108, 6586, 5817, 4727, 3716, 0, 0},
+         {26753, 23759, 22706, 8224, 7359, 6223, 5697, 5242, 721, 0, 0},
+         {31374, 30560, 29972, 4154, 3707, 3302, 2928, 2583, 869, 0, 0}},
+        // width 64
+        {{12631, 11221, 9690, 3202, 2931, 2507, 2244, 1876, 1044, 0, 0},
+         {26036, 25278, 23271, 4824, 4518, 4253, 3799, 3138, 2664, 0, 0},
+         {26823, 25105, 24420, 4085, 3651, 3019, 2704, 2470, 530, 0, 0},
+         {31898, 31556, 31281, 1570, 1374, 1194, 1025, 887, 436, 0, 0}},
+        // width 128
+        {{4869, 4549, 4239, 284, 229, 149, 129, 0, 0},
+         {26161, 25778, 24500, 708, 549, 430, 397, 0, 0},
+         {27339, 26092, 25646, 741, 541, 237, 186, 0, 0},
+         {32057, 31802, 31596, 320, 230, 151, 104, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultSegmentIdCdf[kSegmentIdContexts][kMaxSegments + 1] = {
+        {27146, 24875, 16675, 14535, 4959, 4395, 235, 0, 0},
+        {18494, 14538, 10211, 7833, 2788, 1917, 424, 0, 0},
+        {5241, 4281, 4045, 3878, 371, 121, 89, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultUsePredictedSegmentIdCdf[kUsePredictedSegmentIdContexts]
+                                    [kBooleanFieldCdfSize] = {{16384, 0, 0},
+                                                              {16384, 0, 0},
+                                                              {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultSkipCdf[kSkipContexts][kBooleanFieldCdfSize] = {
+        {1097, 0, 0}, {16253, 0, 0}, {28192, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultSkipModeCdf[kSkipModeContexts][kBooleanFieldCdfSize] = {
+        {147, 0, 0}, {12060, 0, 0}, {24641, 0, 0}};
+
+// This constant is also used for DeltaLf and DeltaLfMulti.
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultDeltaQCdf[kDeltaSymbolCount + 1] = {4608, 648, 91, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultIntraBlockCopyCdf[kBooleanFieldCdfSize] = {2237, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultIntraFrameYModeCdf[kIntraModeContexts][kIntraModeContexts]
+                              [kIntraPredictionModesY + 1] = {
+                                  {{17180, 15741, 13430, 12550, 12086, 11658,
+                                    10943, 9524, 8579, 4603, 3675, 2302, 0, 0},
+                                   {20752, 14702, 13252, 12465, 12049, 11324,
+                                    10880, 9736, 8334, 4110, 2596, 1359, 0, 0},
+                                   {22716, 21997, 10472, 9980, 9713, 9529, 8635,
+                                    7148, 6608, 3432, 2839, 1201, 0, 0},
+                                   {18677, 17362, 16326, 13960, 13632, 13222,
+                                    12770, 10672, 8022, 3183, 1810, 306, 0, 0},
+                                   {20646, 19503, 17165, 16267, 14159, 12735,
+                                    10377, 7185, 6331, 2507, 1695, 293, 0, 0}},
+                                  {{22745, 13183, 11920, 11328, 10936, 10008,
+                                    9679, 8745, 7387, 3754, 2286, 1332, 0, 0},
+                                   {26785, 8669, 8208, 7882, 7702, 6973, 6855,
+                                    6345, 5158, 2863, 1492, 974, 0, 0},
+                                   {25324, 19987, 12591, 12040, 11691, 11161,
+                                    10598, 9363, 8299, 4853, 3678, 2276, 0, 0},
+                                   {24231, 18079, 17336, 15681, 15360, 14596,
+                                    14360, 12943, 8119, 3615, 1672, 558, 0, 0},
+                                   {25225, 18537, 17272, 16573, 14863, 12051,
+                                    10784, 8252, 6767, 3093, 1787, 774, 0, 0}},
+                                  {{20155, 19177, 11385, 10764, 10456, 10191,
+                                    9367, 7713, 7039, 3230, 2463, 691, 0, 0},
+                                   {23081, 19298, 14262, 13538, 13164, 12621,
+                                    12073, 10706, 9549, 5025, 3557, 1861, 0, 0},
+                                   {26585, 26263, 6744, 6516, 6402, 6334, 5686,
+                                    4414, 4213, 2301, 1974, 682, 0, 0},
+                                   {22050, 21034, 17814, 15544, 15203, 14844,
+                                    14207, 11245, 8890, 3793, 2481, 516, 0, 0},
+                                   {23574, 22910, 16267, 15505, 14344, 13597,
+                                    11205, 6807, 6207, 2696, 2031, 305, 0, 0}},
+                                  {{20166, 18369, 17280, 14387, 13990, 13453,
+                                    13044, 11349, 7708, 3072, 1851, 359, 0, 0},
+                                   {24565, 18947, 18244, 15663, 15329, 14637,
+                                    14364, 13300, 7543, 3283, 1610, 426, 0, 0},
+                                   {24317, 23037, 17764, 15125, 14756, 14343,
+                                    13698, 11230, 8163, 3650, 2690, 750, 0, 0},
+                                   {25054, 23720, 23252, 16101, 15951, 15774,
+                                    15615, 14001, 6025, 2379, 1232, 240, 0, 0},
+                                   {23925, 22488, 21272, 17451, 16116, 14825,
+                                    13660, 10050, 6999, 2815, 1785, 283, 0, 0}},
+                                  {{20190, 19097, 16789, 15934, 13693, 11855,
+                                    9779, 7319, 6549, 2554, 1618, 291, 0, 0},
+                                   {23205, 19142, 17688, 16876, 15012, 11905,
+                                    10561, 8532, 7388, 3115, 1625, 491, 0, 0},
+                                   {24412, 23867, 15152, 14512, 13418, 12662,
+                                    10170, 6821, 6302, 2868, 2245, 507, 0, 0},
+                                   {21933, 20953, 19644, 16726, 15750, 14729,
+                                    13821, 10015, 8153, 3279, 1885, 286, 0, 0},
+                                   {25150, 24480, 22909, 22259, 17382, 14111,
+                                    9865, 3992, 3588, 1413, 966, 175, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultYModeCdf[kYModeContexts][kIntraPredictionModesY + 1] = {
+        {9967, 9279, 8475, 8012, 7167, 6645, 6162, 5350, 4823, 3540, 3083, 2419,
+         0, 0},
+        {14095, 12923, 10137, 9450, 8818, 8119, 7241, 5404, 4616, 3067, 2784,
+         1916, 0, 0},
+        {12998, 11789, 9372, 8829, 8527, 8114, 7632, 5695, 4938, 3408, 3038,
+         2109, 0, 0},
+        {12613, 11467, 9930, 9590, 9507, 9235, 9065, 7964, 7416, 6193, 5752,
+         4719, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultAngleDeltaCdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1] =
+        {{30588, 27736, 25201, 9992, 5779, 2551, 0, 0},
+         {30467, 27160, 23967, 9281, 5794, 2438, 0, 0},
+         {28988, 21750, 19069, 13414, 9685, 1482, 0, 0},
+         {28187, 21542, 17621, 15630, 10934, 4371, 0, 0},
+         {31031, 21841, 18259, 13180, 10023, 3945, 0, 0},
+         {30104, 22592, 20283, 15118, 11168, 2273, 0, 0},
+         {30528, 21672, 17315, 12427, 10207, 3851, 0, 0},
+         {29163, 22340, 20309, 15092, 11524, 2113, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultUVModeCdf[kBooleanSymbolCount][kIntraPredictionModesY]
+                     [kIntraPredictionModesUV + 1] = {
+                         // CFL not allowed.
+                         {{10137, 8616, 7390, 7107, 6782, 6248, 5713, 4845,
+                           4524, 2709, 1827, 807, 0, 0},
+                          {23255, 5887, 5795, 5722, 5650, 5104, 5029, 4944,
+                           4409, 3263, 2968, 972, 0, 0},
+                          {22923, 22853, 4105, 4064, 4011, 3988, 3570, 2946,
+                           2914, 2004, 991, 739, 0, 0},
+                          {19129, 18871, 18597, 7437, 7162, 7041, 6815, 5620,
+                           4191, 2156, 1413, 275, 0, 0},
+                          {23004, 22933, 22838, 22814, 7382, 5715, 4810, 4620,
+                           4525, 1667, 1024, 405, 0, 0},
+                          {20943, 19179, 19091, 19048, 17720, 3555, 3467, 3310,
+                           3057, 1607, 1327, 218, 0, 0},
+                          {18593, 18369, 16160, 15947, 15050, 14993, 4217, 2568,
+                           2523, 931, 426, 101, 0, 0},
+                          {19883, 19730, 17790, 17178, 17095, 17020, 16592,
+                           3640, 3501, 2125, 807, 307, 0, 0},
+                          {20742, 19107, 18894, 17463, 17278, 17042, 16773,
+                           16495, 4325, 2380, 2001, 352, 0, 0},
+                          {13716, 12928, 12189, 11852, 11618, 11301, 10883,
+                           10049, 9594, 3907, 2389, 593, 0, 0},
+                          {14141, 13119, 11794, 11549, 11276, 10952, 10569,
+                           9649, 9241, 5715, 1371, 620, 0, 0},
+                          {15742, 13764, 12771, 12429, 12182, 11665, 11419,
+                           10861, 10286, 6872, 6227, 949, 0, 0},
+                          {20644, 19009, 17809, 17776, 17761, 17717, 17690,
+                           17602, 17513, 17015, 16729, 16162, 0, 0}},
+                         // CFL allowed.
+                         {{22361, 21560, 19868, 19587, 18945, 18593, 17869,
+                           17112, 16782, 12682, 11773, 10313, 8556, 0, 0},
+                          {28236, 12988, 12711, 12553, 12340, 11697, 11569,
+                           11317, 10669, 8540, 8075, 5736, 3296, 0, 0},
+                          {27495, 27389, 12591, 12498, 12383, 12329, 11819,
+                           11073, 10994, 9630, 8512, 8065, 6089, 0, 0},
+                          {26028, 25601, 25106, 18616, 18232, 17983, 17734,
+                           16027, 14397, 11248, 10562, 9379, 8586, 0, 0},
+                          {27781, 27400, 26840, 26700, 13654, 12453, 10911,
+                           10515, 10357, 7857, 7388, 6741, 6392, 0, 0},
+                          {27398, 25879, 25521, 25375, 23270, 11654, 11366,
+                           11015, 10787, 7988, 7382, 6251, 5592, 0, 0},
+                          {27952, 27807, 25564, 25442, 24003, 23838, 12599,
+                           12086, 11965, 9580, 9005, 8313, 7828, 0, 0},
+                          {26160, 26028, 24239, 23719, 23511, 23412, 23033,
+                           13941, 13709, 10432, 9564, 8804, 7975, 0, 0},
+                          {26770, 25349, 24987, 23835, 23513, 23219, 23015,
+                           22351, 13870, 10274, 9629, 8004, 6779, 0, 0},
+                          {22108, 21470, 20218, 19811, 19446, 19144, 18728,
+                           17764, 17234, 12054, 10979, 9325, 7907, 0, 0},
+                          {22246, 21238, 20216, 19805, 19390, 18989, 18523,
+                           17533, 16866, 12666, 10072, 8994, 6930, 0, 0},
+                          {22669, 22077, 20129, 19719, 19382, 19103, 18643,
+                           17605, 17132, 13092, 12294, 9249, 7560, 0, 0},
+                          {29624, 27681, 25386, 25264, 25175, 25078, 24967,
+                           24704, 24536, 23520, 22893, 22247, 3720, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultCflAlphaSignsCdf[kCflAlphaSignsSymbolCount + 1] = {
+        31350, 30645, 19428, 14363, 5796, 4425, 474, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultCflAlphaCdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1] = {
+        {25131, 12049, 1367, 287, 111, 80, 76, 72, 68, 64, 60, 56, 52, 48, 44,
+         0, 0},
+        {18403, 9165, 4633, 1600, 601, 373, 281, 195, 148, 121, 100, 96, 92, 88,
+         84, 0, 0},
+        {21236, 10388, 4323, 1408, 419, 245, 184, 119, 95, 91, 87, 83, 79, 75,
+         71, 0, 0},
+        {5778, 1366, 486, 197, 76, 72, 68, 64, 60, 56, 52, 48, 44, 40, 36, 0,
+         0},
+        {15520, 6710, 3864, 2160, 1463, 891, 642, 447, 374, 304, 252, 208, 192,
+         175, 146, 0, 0},
+        {18030, 11090, 6989, 4867, 3744, 2466, 1788, 925, 624, 355, 248, 174,
+         146, 112, 108, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultUseFilterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+        {28147, 0, 0}, {26025, 0, 0}, {19998, 0, 0}, {26875, 0, 0},
+        {24902, 0, 0}, {20217, 0, 0}, {12539, 0, 0}, {22400, 0, 0},
+        {23374, 0, 0}, {20360, 0, 0}, {18467, 0, 0}, {16384, 0, 0},
+        {14667, 0, 0}, {20012, 0, 0}, {10425, 0, 0}, {16384, 0, 0},
+        {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+        {16384, 0, 0}, {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultFilterIntraModeCdf[kNumFilterIntraPredictors + 1] = {
+        23819, 19992, 15557, 3210, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultTxDepthCdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1] = {
+        {{12800, 0, 0}, {12800, 0, 0}, {8448, 0, 0}},
+        {{20496, 2596, 0, 0}, {20496, 2596, 0, 0}, {14091, 1920, 0, 0}},
+        {{19782, 17588, 0, 0}, {19782, 17588, 0, 0}, {8466, 7166, 0, 0}},
+        {{26986, 21293, 0, 0}, {26986, 21293, 0, 0}, {15965, 10009, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultTxSplitCdf[kTxSplitContexts][kBooleanFieldCdfSize] = {
+        {4187, 0, 0},  {8922, 0, 0},  {11921, 0, 0}, {8453, 0, 0},
+        {14572, 0, 0}, {20635, 0, 0}, {13977, 0, 0}, {21881, 0, 0},
+        {21763, 0, 0}, {5589, 0, 0},  {12764, 0, 0}, {21487, 0, 0},
+        {6219, 0, 0},  {13460, 0, 0}, {18544, 0, 0}, {4753, 0, 0},
+        {11222, 0, 0}, {18368, 0, 0}, {4603, 0, 0},  {10367, 0, 0},
+        {16680, 0, 0}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultAllZeroCdf[kCoefficientQuantizerContexts]
+                                 [kNumSquareTransformSizes][kAllZeroContexts]
+                                 [kBooleanFieldCdfSize] = {
+  {
+    {{919, 0, 0}, {26876, 0, 0}, {20656, 0, 0}, {10833, 0, 0}, {12479, 0, 0},
+     {5295, 0, 0}, {281, 0, 0}, {25114, 0, 0}, {13295, 0, 0}, {2784, 0, 0},
+     {22807, 0, 0}, {2526, 0, 0}, {651, 0, 0}},
+    {{1220, 0, 0}, {31219, 0, 0}, {22638, 0, 0}, {16112, 0, 0}, {14177, 0, 0},
+     {6460, 0, 0}, {231, 0, 0}, {27365, 0, 0}, {14672, 0, 0}, {2765, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+    {{2811, 0, 0}, {27377, 0, 0}, {14729, 0, 0}, {9202, 0, 0}, {10337, 0, 0},
+     {6946, 0, 0}, {571, 0, 0}, {28990, 0, 0}, {17432, 0, 0}, {3787, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+    {{14848, 0, 0}, {30950, 0, 0}, {25486, 0, 0}, {7495, 0, 0}, {21845, 0, 0},
+     {1214, 0, 0}, {144, 0, 0}, {31402, 0, 0}, {17140, 0, 0}, {2306, 0, 0},
+     {32622, 0, 0}, {27636, 0, 0}, {1111, 0, 0}},
+    {{26460, 0, 0}, {32651, 0, 0}, {31130, 0, 0}, {30607, 0, 0}, {16384, 0, 0},
+     {21845, 0, 0}, {2521, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+  },
+  {
+    {{2397, 0, 0}, {25198, 0, 0}, {19613, 0, 0}, {12017, 0, 0}, {11799, 0, 0},
+     {5701, 0, 0}, {755, 0, 0}, {27273, 0, 0}, {14826, 0, 0}, {4488, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+    {{986, 0, 0}, {30932, 0, 0}, {22079, 0, 0}, {15164, 0, 0}, {11146, 0, 0},
+     {5250, 0, 0}, {369, 0, 0}, {28349, 0, 0}, {16474, 0, 0}, {4423, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+    {{867, 0, 0}, {22457, 0, 0}, {14721, 0, 0}, {7962, 0, 0}, {9480, 0, 0},
+     {4854, 0, 0}, {472, 0, 0}, {28553, 0, 0}, {17012, 0, 0}, {4427, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+    {{6042, 0, 0}, {31723, 0, 0}, {21065, 0, 0}, {12178, 0, 0}, {14214, 0, 0},
+     {6798, 0, 0}, {830, 0, 0}, {27185, 0, 0}, {11455, 0, 0}, {3378, 0, 0},
+     {32127, 0, 0}, {10503, 0, 0}, {1316, 0, 0}},
+    {{6184, 0, 0}, {32580, 0, 0}, {23921, 0, 0}, {8249, 0, 0}, {9830, 0, 0},
+     {2185, 0, 0}, {160, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+  },
+  {
+    {{3154, 0, 0}, {23700, 0, 0}, {19844, 0, 0}, {13230, 0, 0}, {15031, 0, 0},
+     {8149, 0, 0}, {2126, 0, 0}, {28649, 0, 0}, {16742, 0, 0}, {7111, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+    {{811, 0, 0}, {29538, 0, 0}, {21615, 0, 0}, {14645, 0, 0}, {12625, 0, 0},
+     {6232, 0, 0}, {782, 0, 0}, {29718, 0, 0}, {18165, 0, 0}, {7613, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+    {{405, 0, 0}, {22076, 0, 0}, {13678, 0, 0}, {8411, 0, 0}, {8326, 0, 0},
+     {4456, 0, 0}, {599, 0, 0}, {29120, 0, 0}, {17078, 0, 0}, {5953, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+    {{2099, 0, 0}, {28936, 0, 0}, {21105, 0, 0}, {13879, 0, 0}, {12986, 0, 0},
+     {9455, 0, 0}, {1438, 0, 0}, {27644, 0, 0}, {14049, 0, 0}, {4300, 0, 0},
+     {29686, 0, 0}, {11786, 0, 0}, {3325, 0, 0}},
+    {{4195, 0, 0}, {29585, 0, 0}, {14966, 0, 0}, {6791, 0, 0}, {6091, 0, 0},
+     {4936, 0, 0}, {381, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+  },
+  {
+    {{5881, 0, 0}, {26039, 0, 0}, {22407, 0, 0}, {15326, 0, 0}, {17723, 0, 0},
+     {10290, 0, 0}, {3696, 0, 0}, {30055, 0, 0}, {20907, 0, 0}, {11995, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+    {{865, 0, 0}, {30724, 0, 0}, {25240, 0, 0}, {18150, 0, 0}, {16586, 0, 0},
+     {8600, 0, 0}, {1731, 0, 0}, {29982, 0, 0}, {21574, 0, 0}, {12613, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+    {{258, 0, 0}, {24338, 0, 0}, {15450, 0, 0}, {8614, 0, 0}, {9094, 0, 0},
+     {3979, 0, 0}, {629, 0, 0}, {29328, 0, 0}, {19651, 0, 0}, {10066, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+    {{1097, 0, 0}, {30712, 0, 0}, {21022, 0, 0}, {15916, 0, 0}, {14133, 0, 0},
+     {8053, 0, 0}, {1284, 0, 0}, {28112, 0, 0}, {16694, 0, 0}, {8064, 0, 0},
+     {30962, 0, 0}, {18123, 0, 0}, {7432, 0, 0}},
+    {{1229, 0, 0}, {24335, 0, 0}, {12192, 0, 0}, {4864, 0, 0}, {4916, 0, 0},
+     {2742, 0, 0}, {327, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+     {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+  }
+};
+/* clang-format on */
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultInterTxTypeCdf[3][kNumExtendedTransformSizes][kNumTransformTypes +
+                                                          1] = {
+        {{28310, 27208, 25073, 23059, 19438, 17979, 15231, 12502, 11264, 9920,
+          8834, 7294, 5041, 3853, 2137, 0, 0},
+         {31123, 30195, 27990, 27057, 24961, 24146, 22246, 17411, 15094, 12360,
+          10251, 7758, 5652, 3912, 2019, 0, 0},
+         {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+          10240, 8192, 6144, 4096, 2048, 0, 0},
+         {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+          10240, 8192, 6144, 4096, 2048, 0, 0}},
+        {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+         {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+         // Only 16x16 is used in this case.
+         {31998, 30347, 27543, 19861, 16949, 13841, 11207, 8679, 6173, 4242,
+          2239, 0},
+         {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
+        {{16384, 0, 0}, {28601, 0, 0}, {30770, 0, 0}, {32020, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultIntraTxTypeCdf
+    [2][kNumExtendedTransformSizes][kIntraPredictionModesY]
+    [kNumTransformTypes + 1] = {
+        {{{31233, 24733, 23307, 20017, 9301, 4943, 0, 0},
+          {32204, 29433, 23059, 21898, 14625, 4674, 0, 0},
+          {32096, 29521, 29092, 20786, 13353, 9641, 0, 0},
+          {27489, 18883, 17281, 14724, 9241, 2516, 0, 0},
+          {28345, 26694, 24783, 22352, 7075, 3470, 0, 0},
+          {31282, 28527, 23308, 22106, 16312, 5074, 0, 0},
+          {32329, 29930, 29246, 26031, 14710, 9014, 0, 0},
+          {31578, 28535, 27913, 21098, 12487, 8391, 0, 0},
+          {31723, 28456, 24121, 22609, 14124, 3433, 0, 0},
+          {32566, 29034, 28021, 25470, 15641, 8752, 0, 0},
+          {32321, 28456, 25949, 23884, 16758, 8910, 0, 0},
+          {32491, 28399, 27513, 23863, 16303, 10497, 0, 0},
+          {29359, 27332, 22169, 17169, 13081, 8728, 0, 0}},
+         {{30898, 19026, 18238, 16270, 8998, 5070, 0, 0},
+          {32442, 23972, 18136, 17689, 13496, 5282, 0, 0},
+          {32284, 25192, 25056, 18325, 13609, 10177, 0, 0},
+          {31642, 17428, 16873, 15745, 11872, 2489, 0, 0},
+          {32113, 27914, 27519, 26855, 10669, 5630, 0, 0},
+          {31469, 26310, 23883, 23478, 17917, 7271, 0, 0},
+          {32457, 27473, 27216, 25883, 16661, 10096, 0, 0},
+          {31885, 24709, 24498, 21510, 15479, 11219, 0, 0},
+          {32027, 25188, 23450, 22423, 16080, 3722, 0, 0},
+          {32658, 25362, 24853, 23573, 16727, 9439, 0, 0},
+          {32405, 24794, 23411, 22095, 17139, 8294, 0, 0},
+          {32615, 25121, 24656, 22832, 17461, 12772, 0, 0},
+          {29257, 26436, 21603, 17433, 13445, 9174, 0, 0}}},
+        {{{26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0}},
+         {{26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0},
+          {26214, 19661, 13107, 6554, 0, 0}},
+         {{31641, 19954, 9996, 5285, 0, 0},
+          {32623, 26007, 20788, 6101, 0, 0},
+          {32406, 26881, 21090, 16043, 0, 0},
+          {32383, 17555, 14181, 2075, 0, 0},
+          {32743, 29854, 9634, 4865, 0, 0},
+          {32708, 28298, 21019, 8777, 0, 0},
+          {32731, 29436, 18257, 11320, 0, 0},
+          {32611, 26448, 19732, 15329, 0, 0},
+          {32649, 26049, 19862, 3372, 0, 0},
+          {32721, 27231, 20192, 11269, 0, 0},
+          {32499, 26692, 21510, 9653, 0, 0},
+          {32685, 27153, 20767, 15540, 0, 0},
+          {30800, 27212, 20745, 14221, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultEobPt16Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+                      [kEobPtContexts][kEobPt16SymbolCount + 1] = {
+                          {{{31928, 31729, 30788, 27873, 0, 0},
+                            {32398, 32097, 30885, 28297, 0, 0}},
+                           {{29521, 27818, 23080, 18205, 0, 0},
+                            {30864, 29414, 25005, 18121, 0, 0}}},
+                          {{{30643, 30217, 27603, 23822, 0, 0},
+                            {32255, 32003, 30909, 26429, 0, 0}},
+                           {{25131, 23270, 18509, 13660, 0, 0},
+                            {30271, 28672, 23902, 15775, 0, 0}}},
+                          {{{28752, 27871, 23887, 17800, 0, 0},
+                            {32052, 31663, 30122, 22712, 0, 0}},
+                           {{21629, 19498, 14527, 9202, 0, 0},
+                            {29576, 27736, 22471, 13013, 0, 0}}},
+                          {{{26060, 23810, 18022, 10635, 0, 0},
+                            {31546, 30694, 27985, 17358, 0, 0}},
+                           {{13193, 11002, 6724, 3059, 0, 0},
+                            {25471, 22001, 13495, 4574, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultEobPt32Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+                      [kEobPtContexts][kEobPt32SymbolCount + 1] = {
+                          {{{32368, 32248, 31791, 30666, 26226, 0, 0},
+                            {32558, 32363, 31453, 29442, 25231, 0, 0}},
+                           {{30132, 28495, 25180, 20974, 12367, 0, 0},
+                            {30982, 29589, 25866, 21411, 13714, 0, 0}}},
+                          {{{31779, 31519, 30749, 28617, 21983, 0, 0},
+                            {32455, 32327, 31669, 29851, 24206, 0, 0}},
+                           {{24374, 22416, 18836, 13913, 6754, 0, 0},
+                            {30190, 28644, 24587, 19098, 8534, 0, 0}}},
+                          {{{30253, 29765, 28316, 24606, 16727, 0, 0},
+                            {32194, 31947, 30932, 27679, 19640, 0, 0}},
+                           {{19300, 16465, 12407, 7663, 3487, 0, 0},
+                            {29226, 27266, 22353, 16008, 7124, 0, 0}}},
+                          {{{28151, 27059, 24322, 19184, 9633, 0, 0},
+                            {31612, 31066, 29093, 23494, 12229, 0, 0}},
+                           {{10682, 8486, 5758, 2998, 1025, 0, 0},
+                            {25069, 21871, 11877, 5842, 1140, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultEobPt64Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+                      [kEobPtContexts][kEobPt64SymbolCount + 1] = {
+                          {{{32439, 32270, 31667, 30984, 29503, 25010, 0, 0},
+                            {32433, 32038, 31309, 27274, 24013, 19771, 0, 0}},
+                           {{29263, 27464, 22682, 18954, 15084, 9398, 0, 0},
+                            {31205, 30068, 27892, 21857, 18062, 10288, 0, 0}}},
+                          {{{31508, 31322, 30515, 29056, 26116, 19399, 0, 0},
+                            {32367, 32163, 31739, 30205, 26923, 20142, 0, 0}},
+                           {{24159, 22156, 18144, 14054, 10154, 3744, 0, 0},
+                            {30845, 29641, 26901, 23065, 18491, 5668, 0, 0}}},
+                          {{{30394, 29996, 28185, 25492, 20480, 13062, 0, 0},
+                            {32271, 31958, 31453, 29768, 25764, 17127, 0, 0}},
+                           {{17718, 15642, 11358, 7882, 4612, 2042, 0, 0},
+                            {28734, 26478, 22533, 17786, 11554, 4277, 0, 0}}},
+                          {{{26461, 25227, 20708, 16410, 10215, 4903, 0, 0},
+                            {31479, 30448, 28797, 24842, 18615, 8477, 0, 0}},
+                           {{8556, 7060, 4500, 2733, 1461, 719, 0, 0},
+                            {24042, 20390, 13359, 6318, 2730, 306, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt128Cdf
+    [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
+    [kEobPt128SymbolCount + 1] = {
+        {{{32549, 32286, 31628, 30677, 29088, 26740, 20182, 0, 0},
+          {32397, 32069, 31514, 27938, 23289, 20206, 15271, 0, 0}},
+         {{27523, 25312, 19888, 16916, 12735, 8836, 5160, 0, 0},
+          {30714, 29296, 26899, 18536, 14526, 12178, 6016, 0, 0}}},
+        {{{32083, 31835, 31280, 30054, 28002, 24206, 13514, 0, 0},
+          {32551, 32416, 32150, 30465, 27507, 22799, 15296, 0, 0}},
+         {{24723, 21568, 17271, 13173, 8820, 5360, 1830, 0, 0},
+          {30458, 28608, 25297, 17771, 14837, 12000, 2528, 0, 0}}},
+        {{{31402, 31030, 30241, 27752, 23413, 16971, 8125, 0, 0},
+          {32414, 32210, 31824, 30008, 25481, 18731, 10989, 0, 0}},
+         {{19141, 16522, 12595, 8339, 4820, 2353, 905, 0, 0},
+          {26493, 22879, 17999, 9604, 4780, 2275, 496, 0, 0}}},
+        {{{29296, 27883, 25279, 20287, 14251, 8232, 3133, 0, 0},
+          {31882, 31037, 29497, 24299, 17199, 10642, 4385, 0, 0}},
+         {{8455, 6706, 4383, 2661, 1551, 870, 423, 0, 0},
+          {23603, 19486, 11618, 2482, 874, 197, 56, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt256Cdf
+    [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
+    [kEobPt256SymbolCount + 1] = {
+        {{{32458, 32184, 30881, 29179, 26600, 24157, 21416, 17116, 0, 0},
+          {31770, 30918, 29770, 27164, 15427, 12880, 9869, 7185, 0, 0}},
+         {{30248, 29528, 26816, 23898, 20191, 15210, 12814, 8600, 0, 0},
+          {30565, 28638, 25333, 22029, 12116, 9087, 7159, 5507, 0, 0}}},
+        {{{31320, 30659, 28617, 26505, 23439, 19508, 14824, 9468, 0, 0},
+          {32369, 31749, 31019, 29730, 22324, 17222, 10029, 5474, 0, 0}},
+         {{26366, 24620, 20145, 17696, 14040, 9921, 6321, 3391, 0, 0},
+          {31094, 29516, 27034, 22609, 10371, 8966, 7947, 1828, 0, 0}}},
+        {{{29679, 28848, 26730, 23308, 18502, 12887, 7002, 3592, 0, 0},
+          {31684, 30410, 29280, 27646, 21285, 14665, 6745, 2969, 0, 0}},
+         {{21254, 18974, 15288, 12014, 8407, 5390, 3276, 1491, 0, 0},
+          {26197, 23158, 17252, 10942, 3676, 1939, 926, 60, 0, 0}}},
+        {{{27420, 25655, 20948, 16844, 10662, 5991, 2434, 1011, 0, 0},
+          {30315, 28294, 26461, 23991, 16294, 9793, 3768, 1221, 0, 0}},
+         {{9658, 8171, 5628, 3874, 2601, 1841, 1376, 674, 0, 0},
+          {22770, 15107, 7590, 4671, 1460, 730, 365, 73, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt512Cdf
+    [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPt512SymbolCount + 1] =
+        {{{32127, 31785, 29061, 27338, 22534, 17810, 13980, 9356, 6707, 0, 0},
+          {27673, 26322, 22772, 19414, 16751, 14782, 11849, 6639, 3628, 0, 0}},
+         {{31538, 30490, 27733, 24992, 20897, 17422, 13178, 8184, 4019, 0, 0},
+          {25503, 22789, 16949, 13518, 10988, 8922, 6290, 4372, 957, 0, 0}},
+         {{30144, 28832, 26288, 23082, 18789, 15042, 9501, 4358, 1690, 0, 0},
+          {20753, 17999, 13180, 10716, 8546, 6956, 5468, 3549, 654, 0, 0}},
+         {{26841, 24959, 21845, 18171, 13329, 8633, 4312, 1626, 708, 0, 0},
+          {11675, 9725, 7026, 5110, 3671, 3052, 2695, 1948, 812, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultEobPt1024Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+                        [kEobPt1024SymbolCount + 1] = {
+                            {{32375, 32347, 32017, 31145, 29608, 26416, 19423,
+                              14721, 10197, 6938, 0, 0},
+                             {30903, 30780, 29838, 28526, 22235, 16230, 11414,
+                              5513, 4222, 984, 0, 0}},
+                            {{32072, 31820, 29623, 27066, 23062, 19551, 14917,
+                              10912, 7076, 4734, 0, 0},
+                             {30096, 29177, 23438, 15684, 10043, 8484, 6241,
+                              4741, 4391, 1892, 0, 0}},
+                            {{29984, 28937, 25727, 22247, 17921, 13924, 9613,
+                              6086, 3539, 1723, 0, 0},
+                             {23191, 20302, 15029, 12018, 10707, 9553, 8167,
+                              7285, 6925, 712, 0, 0}},
+                            {{26070, 24434, 20807, 17006, 12582, 8906, 5334,
+                              3442, 1686, 718, 0, 0},
+                             {12199, 10342, 7199, 5909, 4715, 3855, 3282, 3044,
+                              2961, 198, 0, 0}}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobExtraCdf[kCoefficientQuantizerContexts]
+                                  [kNumSquareTransformSizes][kNumPlaneTypes]
+                                  [kEobExtraContexts][kBooleanFieldCdfSize] = {
+  {
+    {
+      {{15807, 0, 0}, {15545, 0, 0}, {25147, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{13699, 0, 0}, {10243, 0, 0}, {19391, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{12367, 0, 0}, {15743, 0, 0}, {19923, 0, 0}, {19895, 0, 0},
+       {18674, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{12087, 0, 0}, {12067, 0, 0}, {17518, 0, 0}, {17751, 0, 0},
+       {17840, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{8863, 0, 0}, {15574, 0, 0}, {16598, 0, 0}, {15073, 0, 0},
+       {18942, 0, 0}, {16958, 0, 0}, {20732, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{8809, 0, 0}, {11969, 0, 0}, {13747, 0, 0}, {16565, 0, 0},
+       {14882, 0, 0}, {18624, 0, 0}, {20758, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{5369, 0, 0}, {16441, 0, 0}, {14697, 0, 0}, {13184, 0, 0},
+       {12047, 0, 0}, {14336, 0, 0}, {13208, 0, 0}, {22618, 0, 0},
+       {23963, 0, 0}},
+      {{7836, 0, 0}, {11935, 0, 0}, {20741, 0, 0}, {16098, 0, 0},
+       {12854, 0, 0}, {17662, 0, 0}, {15106, 0, 0}, {18985, 0, 0},
+       {4012, 0, 0}}
+    },
+    {
+      {{9362, 0, 0}, {10923, 0, 0}, {14336, 0, 0}, {16384, 0, 0},
+       {15672, 0, 0}, {20207, 0, 0}, {15448, 0, 0}, {10373, 0, 0},
+       {11398, 0, 0}},
+      {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    }
+  },
+  {
+    {
+      {{15297, 0, 0}, {12545, 0, 0}, {21411, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{12433, 0, 0}, {11101, 0, 0}, {17950, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{12338, 0, 0}, {12106, 0, 0}, {17401, 0, 0}, {15798, 0, 0},
+       {18111, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{10651, 0, 0}, {10740, 0, 0}, {14118, 0, 0}, {16726, 0, 0},
+       {16883, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{10359, 0, 0}, {11756, 0, 0}, {17118, 0, 0}, {15373, 0, 0},
+       {17299, 0, 0}, {12563, 0, 0}, {13257, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{8548, 0, 0}, {10288, 0, 0}, {15031, 0, 0}, {13852, 0, 0},
+       {13500, 0, 0}, {14356, 0, 0}, {13924, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{6777, 0, 0}, {12454, 0, 0}, {15037, 0, 0}, {13090, 0, 0},
+       {14119, 0, 0}, {15461, 0, 0}, {10970, 0, 0}, {15219, 0, 0},
+       {17138, 0, 0}},
+      {{6183, 0, 0}, {11299, 0, 0}, {12336, 0, 0}, {15033, 0, 0},
+       {13488, 0, 0}, {17533, 0, 0}, {12471, 0, 0}, {10297, 0, 0},
+       {3771, 0, 0}}
+    },
+    {
+      {{6163, 0, 0}, {21464, 0, 0}, {16042, 0, 0}, {16208, 0, 0},
+       {11902, 0, 0}, {9244, 0, 0}, {12890, 0, 0}, {19299, 0, 0},
+       {9684, 0, 0}},
+      {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    }
+  },
+  {
+    {
+      {{13785, 0, 0}, {12256, 0, 0}, {17883, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{12678, 0, 0}, {13324, 0, 0}, {15482, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{13629, 0, 0}, {11281, 0, 0}, {13809, 0, 0}, {11858, 0, 0},
+       {13679, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{12232, 0, 0}, {12104, 0, 0}, {12143, 0, 0}, {13645, 0, 0},
+       {17906, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{12935, 0, 0}, {11266, 0, 0}, {15283, 0, 0}, {12501, 0, 0},
+       {14415, 0, 0}, {9439, 0, 0}, {11290, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{10727, 0, 0}, {9334, 0, 0}, {12767, 0, 0}, {12214, 0, 0},
+       {11817, 0, 0}, {12623, 0, 0}, {17206, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{9456, 0, 0}, {11161, 0, 0}, {16242, 0, 0}, {13811, 0, 0},
+       {14734, 0, 0}, {13834, 0, 0}, {8521, 0, 0}, {15847, 0, 0},
+       {15688, 0, 0}},
+      {{6189, 0, 0}, {7858, 0, 0}, {14131, 0, 0}, {12968, 0, 0},
+       {12380, 0, 0}, {22881, 0, 0}, {17126, 0, 0}, {2570, 0, 0},
+       {8047, 0, 0}}
+    },
+    {
+      {{5770, 0, 0}, {16031, 0, 0}, {14930, 0, 0}, {13846, 0, 0},
+       {13253, 0, 0}, {14132, 0, 0}, {15435, 0, 0}, {16992, 0, 0},
+       {10110, 0, 0}},
+      {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    }
+  },
+  {
+    {
+      {{12591, 0, 0}, {11979, 0, 0}, {12506, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{11352, 0, 0}, {11913, 0, 0}, {9358, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{12530, 0, 0}, {11711, 0, 0}, {13609, 0, 0}, {10431, 0, 0},
+       {12609, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{12643, 0, 0}, {12209, 0, 0}, {11061, 0, 0}, {10472, 0, 0},
+       {15435, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{12827, 0, 0}, {12241, 0, 0}, {11298, 0, 0}, {10281, 0, 0},
+       {13210, 0, 0}, {10414, 0, 0}, {12437, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}},
+      {{10016, 0, 0}, {7762, 0, 0}, {10693, 0, 0}, {11192, 0, 0},
+       {15028, 0, 0}, {11078, 0, 0}, {13557, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    },
+    {
+      {{11326, 0, 0}, {10410, 0, 0}, {14265, 0, 0}, {12477, 0, 0},
+       {12823, 0, 0}, {11474, 0, 0}, {11590, 0, 0}, {13368, 0, 0},
+       {22212, 0, 0}},
+      {{8120, 0, 0}, {7819, 0, 0}, {12060, 0, 0}, {8863, 0, 0},
+       {12267, 0, 0}, {23210, 0, 0}, {23345, 0, 0}, {2403, 0, 0},
+       {13515, 0, 0}}
+    },
+    {
+      {{6704, 0, 0}, {10670, 0, 0}, {13155, 0, 0}, {12243, 0, 0},
+       {15173, 0, 0}, {16150, 0, 0}, {12271, 0, 0}, {13779, 0, 0},
+       {17255, 0, 0}},
+      {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+       {16384, 0, 0}}
+    }
+  }
+};
+
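+// Note on the table encoding (a hedged reading, assuming the common libgav1
+// inverse-CDF convention rather than anything stated in this file): each
+// innermost row stores "32768 - cumulative probability" in Q15, so the
+// values decrease monotonically to a final 0, and the one extra trailing
+// element is presumably the adaptation counter, initialized to 0. On that
+// reading, {21845, 10923, 0, 0} is the uniform 3-symbol CDF, and such rows
+// pad contexts that a given quantizer context or transform size can never
+// reach.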
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseEobCdf[kCoefficientQuantizerContexts]
+                                      [kNumSquareTransformSizes][kNumPlaneTypes]
+                                      [kCoeffBaseEobContexts]
+                                      [kCoeffBaseEobSymbolCount + 1] = {
+  {
+    {
+      {{14931, 3713, 0, 0}, {3168, 1322, 0, 0}, {1924, 890, 0, 0},
+       {7842, 3820, 0, 0}},
+      {{11403, 2742, 0, 0}, {2256, 345, 0, 0}, {1110, 147, 0, 0},
+       {3138, 887, 0, 0}}
+    },
+    {
+      {{27051, 6291, 0, 0}, {2277, 1065, 0, 0}, {1218, 610, 0, 0},
+       {3120, 1277, 0, 0}},
+      {{20160, 4948, 0, 0}, {2088, 543, 0, 0}, {1959, 433, 0, 0},
+       {1469, 345, 0, 0}}
+    },
+    {
+      {{30982, 20156, 0, 0}, {2105, 1143, 0, 0}, {429, 300, 0, 0},
+       {1620, 935, 0, 0}},
+      {{13911, 8903, 0, 0}, {1340, 340, 0, 0}, {1024, 395, 0, 0},
+       {993, 242, 0, 0}}
+    },
+    {
+      {{30981, 30236, 0, 0}, {1936, 1106, 0, 0}, {944, 86, 0, 0},
+       {635, 199, 0, 0}},
+      {{19017, 10533, 0, 0}, {679, 359, 0, 0}, {5684, 4848, 0, 0},
+       {3477, 174, 0, 0}}
+    },
+    {
+      {{31043, 29319, 0, 0}, {1666, 833, 0, 0}, {311, 155, 0, 0},
+       {356, 119, 0, 0}},
+      {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+       {21845, 10923, 0, 0}}
+    }
+  },
+  {
+    {
+      {{15208, 2880, 0, 0}, {3097, 1219, 0, 0}, {1761, 712, 0, 0},
+       {5482, 2762, 0, 0}},
+      {{6174, 1556, 0, 0}, {1560, 186, 0, 0}, {933, 131, 0, 0},
+       {2173, 562, 0, 0}}
+    },
+    {
+      {{17529, 2836, 0, 0}, {1453, 673, 0, 0}, {638, 334, 0, 0},
+       {1904, 772, 0, 0}},
+      {{6489, 1800, 0, 0}, {1626, 273, 0, 0}, {1055, 228, 0, 0},
+       {839, 174, 0, 0}}
+    },
+    {
+      {{30124, 7570, 0, 0}, {730, 317, 0, 0}, {129, 73, 0, 0},
+       {602, 250, 0, 0}},
+      {{15581, 5100, 0, 0}, {1054, 218, 0, 0}, {485, 90, 0, 0},
+       {838, 205, 0, 0}}
+    },
+    {
+      {{31724, 30511, 0, 0}, {2013, 845, 0, 0}, {560, 75, 0, 0},
+       {524, 153, 0, 0}},
+      {{11451, 6561, 0, 0}, {3635, 1900, 0, 0}, {3457, 1537, 0, 0},
+       {3111, 1681, 0, 0}}
+    },
+    {
+      {{32290, 30934, 0, 0}, {1763, 781, 0, 0}, {451, 44, 0, 0},
+       {1903, 120, 0, 0}},
+      {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+       {21845, 10923, 0, 0}}
+    }
+  },
+  {
+    {
+      {{12676, 1994, 0, 0}, {2073, 748, 0, 0}, {1637, 665, 0, 0},
+       {4102, 1898, 0, 0}},
+      {{5510, 1673, 0, 0}, {964, 145, 0, 0}, {1005, 240, 0, 0},
+       {1330, 262, 0, 0}}
+    },
+    {
+      {{14719, 2279, 0, 0}, {1062, 482, 0, 0}, {605, 295, 0, 0},
+       {1218, 584, 0, 0}},
+      {{5652, 1926, 0, 0}, {797, 170, 0, 0}, {680, 192, 0, 0},
+       {701, 104, 0, 0}}
+    },
+    {
+      {{19914, 3675, 0, 0}, {496, 210, 0, 0}, {101, 39, 0, 0},
+       {462, 183, 0, 0}},
+      {{7292, 2402, 0, 0}, {599, 81, 0, 0}, {289, 79, 0, 0},
+       {1095, 134, 0, 0}}
+    },
+    {
+      {{29959, 13467, 0, 0}, {563, 146, 0, 0}, {430, 38, 0, 0},
+       {982, 152, 0, 0}},
+      {{10031, 3663, 0, 0}, {1958, 406, 0, 0}, {2754, 141, 0, 0},
+       {2240, 194, 0, 0}}
+    },
+    {
+      {{31833, 29386, 0, 0}, {1979, 859, 0, 0}, {302, 12, 0, 0},
+       {1908, 255, 0, 0}},
+      {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+       {21845, 10923, 0, 0}}
+    }
+  },
+  {
+    {
+      {{10271, 1570, 0, 0}, {1053, 273, 0, 0}, {1162, 431, 0, 0},
+       {2380, 778, 0, 0}},
+      {{4891, 1184, 0, 0}, {598, 40, 0, 0}, {613, 80, 0, 0},
+       {549, 66, 0, 0}}
+    },
+    {
+      {{11311, 1725, 0, 0}, {817, 285, 0, 0}, {615, 206, 0, 0},
+       {1295, 553, 0, 0}},
+      {{5210, 1617, 0, 0}, {748, 128, 0, 0}, {671, 193, 0, 0},
+       {526, 49, 0, 0}}
+    },
+    {
+      {{12788, 2177, 0, 0}, {549, 171, 0, 0}, {187, 62, 0, 0},
+       {965, 481, 0, 0}},
+      {{6295, 2261, 0, 0}, {337, 45, 0, 0}, {572, 157, 0, 0},
+       {1180, 240, 0, 0}}
+    },
+    {
+      {{8121, 2305, 0, 0}, {356, 73, 0, 0}, {300, 48, 0, 0},
+       {1499, 245, 0, 0}},
+      {{4286, 1263, 0, 0}, {616, 67, 0, 0}, {1036, 170, 0, 0},
+       {1001, 56, 0, 0}}
+    },
+    {
+      {{20410, 7791, 0, 0}, {1437, 383, 0, 0}, {134, 12, 0, 0},
+       {2357, 220, 0, 0}},
+      {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+       {21845, 10923, 0, 0}}
+    }
+  }
+};
+/* clang-format on */
+
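+// Worked example under the same hedged reading: in the first row below,
+// {28734, 23838, 20041, 0, 0}, P(symbol == 0) = (32768 - 28734) / 32768
+// ~= 12.3%, P(symbol <= 1) = (32768 - 23838) / 32768 ~= 27.3%, and the
+// stored 0 makes P(symbol <= 3) = 1. Rows of {24576, 16384, 8192, 0, 0}
+// would then be the flat 4-symbol default, each symbol at probability 1/4.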
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseCdf
+    [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
+    [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1] = {
+        {{{{28734, 23838, 20041, 0, 0}, {14686, 3027, 891, 0, 0},
+           {20172, 6644, 2275, 0, 0},   {23322, 11650, 5763, 0, 0},
+           {26460, 17627, 11489, 0, 0}, {30305, 26411, 22985, 0, 0},
+           {12101, 2222, 839, 0, 0},    {19725, 6645, 2634, 0, 0},
+           {24617, 14011, 7990, 0, 0},  {27513, 19929, 14136, 0, 0},
+           {29948, 25562, 21607, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {17032, 5215, 2164, 0, 0},
+           {21558, 8974, 3981, 0, 0},   {26821, 18894, 13067, 0, 0},
+           {28553, 23445, 18877, 0, 0}, {29935, 26306, 22709, 0, 0},
+           {13163, 2375, 1186, 0, 0},   {19245, 6516, 2520, 0, 0},
+           {24322, 14146, 8256, 0, 0},  {28950, 22425, 16794, 0, 0},
+           {31287, 28651, 25972, 0, 0}, {10119, 1466, 578, 0, 0},
+           {17939, 5641, 2319, 0, 0},   {24455, 15066, 9464, 0, 0},
+           {29746, 24467, 19982, 0, 0}, {31232, 28356, 25584, 0, 0},
+           {10414, 2994, 1396, 0, 0},   {18045, 7296, 3554, 0, 0},
+           {26095, 19023, 14106, 0, 0}, {30700, 27002, 23446, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{26466, 16324, 11007, 0, 0}, {9728, 1230, 293, 0, 0},
+           {17572, 4316, 1272, 0, 0},   {22748, 9822, 4254, 0, 0},
+           {26235, 15906, 9267, 0, 0},  {29230, 22952, 17692, 0, 0},
+           {8324, 893, 243, 0, 0},      {16887, 3844, 1133, 0, 0},
+           {22846, 9895, 4302, 0, 0},   {26241, 15802, 9077, 0, 0},
+           {28654, 21465, 15548, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {12567, 1998, 559, 0, 0},
+           {18014, 4697, 1510, 0, 0},   {24390, 12582, 6251, 0, 0},
+           {26852, 17469, 10790, 0, 0}, {28500, 21185, 14867, 0, 0},
+           {8407, 743, 187, 0, 0},      {14095, 2663, 825, 0, 0},
+           {22572, 10524, 5192, 0, 0},  {27273, 18419, 12351, 0, 0},
+           {30092, 25353, 21270, 0, 0}, {8090, 810, 183, 0, 0},
+           {14139, 2862, 937, 0, 0},    {23404, 12044, 6453, 0, 0},
+           {28127, 20450, 14674, 0, 0}, {30010, 25381, 21189, 0, 0},
+           {7335, 926, 299, 0, 0},      {13973, 3479, 1357, 0, 0},
+           {25124, 15184, 9176, 0, 0},  {29360, 23754, 17721, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{28232, 22696, 18767, 0, 0}, {7309, 1352, 562, 0, 0},
+           {16163, 4720, 1950, 0, 0},   {21760, 9911, 5049, 0, 0},
+           {25853, 16500, 10453, 0, 0}, {30143, 25956, 22231, 0, 0},
+           {8511, 980, 269, 0, 0},      {15888, 3314, 889, 0, 0},
+           {20810, 7714, 2990, 0, 0},   {24852, 14050, 7684, 0, 0},
+           {29385, 23991, 19322, 0, 0}, {10048, 1165, 375, 0, 0},
+           {17808, 4643, 1433, 0, 0},   {23037, 10558, 4840, 0, 0},
+           {26464, 16936, 10491, 0, 0}, {29858, 24950, 20602, 0, 0},
+           {12393, 2141, 637, 0, 0},    {18864, 5484, 1881, 0, 0},
+           {23400, 11210, 5624, 0, 0},  {26831, 17802, 11649, 0, 0},
+           {30101, 25543, 21449, 0, 0}, {8798, 1298, 390, 0, 0},
+           {15595, 3034, 750, 0, 0},    {19973, 7327, 2803, 0, 0},
+           {23787, 13088, 6875, 0, 0},  {28040, 21396, 15866, 0, 0},
+           {8481, 971, 329, 0, 0},      {16065, 3623, 1072, 0, 0},
+           {21935, 9214, 4043, 0, 0},   {26300, 16202, 9711, 0, 0},
+           {30353, 26206, 22490, 0, 0}, {6158, 373, 109, 0, 0},
+           {14178, 2270, 651, 0, 0},    {20348, 7012, 2818, 0, 0},
+           {25129, 14022, 8058, 0, 0},  {29767, 24682, 20421, 0, 0},
+           {7692, 704, 188, 0, 0},      {14822, 2640, 740, 0, 0},
+           {20744, 7783, 3390, 0, 0},   {25251, 14378, 8464, 0, 0},
+           {29525, 23987, 19437, 0, 0}, {24576, 16384, 8192, 0, 0}},
+          {{26731, 15997, 10811, 0, 0}, {7994, 1064, 342, 0, 0},
+           {15938, 4179, 1712, 0, 0},   {22166, 9940, 5008, 0, 0},
+           {26035, 15939, 9697, 0, 0},  {29518, 23854, 19212, 0, 0},
+           {7186, 548, 100, 0, 0},      {14109, 2426, 545, 0, 0},
+           {20222, 6619, 2253, 0, 0},   {24348, 12317, 5967, 0, 0},
+           {28132, 20348, 14424, 0, 0}, {5187, 406, 129, 0, 0},
+           {13781, 2685, 790, 0, 0},    {21441, 8520, 3684, 0, 0},
+           {25504, 15049, 8648, 0, 0},  {28773, 22000, 16599, 0, 0},
+           {6875, 937, 281, 0, 0},      {16191, 4181, 1389, 0, 0},
+           {22579, 10020, 4586, 0, 0},  {25936, 15674, 9212, 0, 0},
+           {29060, 22658, 17434, 0, 0}, {6864, 486, 112, 0, 0},
+           {13047, 1976, 492, 0, 0},    {19949, 6525, 2357, 0, 0},
+           {24196, 12154, 5877, 0, 0},  {27404, 18709, 12301, 0, 0},
+           {6188, 330, 91, 0, 0},       {11916, 1543, 428, 0, 0},
+           {20333, 7068, 2801, 0, 0},   {24077, 11943, 5792, 0, 0},
+           {28322, 20559, 15499, 0, 0}, {5418, 339, 72, 0, 0},
+           {11396, 1791, 496, 0, 0},    {20095, 7498, 2915, 0, 0},
+           {23560, 11843, 6128, 0, 0},  {27750, 19417, 14036, 0, 0},
+           {5417, 289, 55, 0, 0},       {11370, 1559, 381, 0, 0},
+           {20606, 7721, 2926, 0, 0},   {24872, 14077, 7449, 0, 0},
+           {28098, 19886, 13887, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+         {{{27281, 22308, 19060, 0, 0}, {11171, 4465, 2094, 0, 0},
+           {21731, 10815, 6292, 0, 0},  {24621, 14806, 9816, 0, 0},
+           {27526, 19707, 14236, 0, 0}, {30879, 27560, 24586, 0, 0},
+           {5994, 635, 178, 0, 0},      {14924, 3204, 1001, 0, 0},
+           {21078, 8330, 3597, 0, 0},   {25226, 14553, 8309, 0, 0},
+           {29775, 24718, 20449, 0, 0}, {4745, 440, 177, 0, 0},
+           {14117, 2642, 814, 0, 0},    {20604, 7622, 3179, 0, 0},
+           {25006, 14238, 7997, 0, 0},  {29276, 23585, 18848, 0, 0},
+           {5177, 760, 277, 0, 0},      {15619, 3915, 1258, 0, 0},
+           {21283, 8765, 3908, 0, 0},   {25071, 14682, 8558, 0, 0},
+           {29693, 24769, 20550, 0, 0}, {4500, 286, 114, 0, 0},
+           {13137, 1717, 364, 0, 0},    {18908, 5508, 1748, 0, 0},
+           {23163, 11155, 5174, 0, 0},  {27892, 20606, 14860, 0, 0},
+           {5520, 452, 192, 0, 0},      {13813, 2311, 693, 0, 0},
+           {20944, 8771, 3973, 0, 0},   {25422, 14572, 8121, 0, 0},
+           {29365, 23521, 18657, 0, 0}, {3057, 113, 33, 0, 0},
+           {11599, 1374, 351, 0, 0},    {19281, 5570, 1811, 0, 0},
+           {23940, 11085, 5154, 0, 0},  {28498, 21317, 15730, 0, 0},
+           {4060, 190, 37, 0, 0},       {12648, 1527, 286, 0, 0},
+           {19076, 5218, 1447, 0, 0},   {23350, 10254, 4329, 0, 0},
+           {27769, 19485, 13306, 0, 0}, {24576, 16384, 8192, 0, 0}},
+          {{27095, 18466, 13057, 0, 0}, {6517, 2067, 934, 0, 0},
+           {19986, 8985, 4965, 0, 0},   {23641, 12111, 6960, 0, 0},
+           {26400, 16560, 11306, 0, 0}, {30303, 25591, 21946, 0, 0},
+           {2807, 205, 49, 0, 0},       {14450, 2877, 819, 0, 0},
+           {21407, 8254, 3411, 0, 0},   {24868, 13165, 7161, 0, 0},
+           {28766, 22178, 17222, 0, 0}, {3131, 458, 173, 0, 0},
+           {14472, 2855, 959, 0, 0},    {22624, 11253, 5897, 0, 0},
+           {27410, 18446, 12374, 0, 0}, {29701, 24406, 19422, 0, 0},
+           {4116, 298, 92, 0, 0},       {15230, 1997, 559, 0, 0},
+           {18844, 5886, 2274, 0, 0},   {22272, 9931, 4899, 0, 0},
+           {25532, 16372, 11147, 0, 0}, {2025, 81, 22, 0, 0},
+           {9762, 1092, 279, 0, 0},     {18274, 4940, 1648, 0, 0},
+           {22594, 9967, 4416, 0, 0},   {26526, 17487, 11725, 0, 0},
+           {6951, 525, 48, 0, 0},       {14150, 1401, 443, 0, 0},
+           {18771, 4450, 890, 0, 0},    {20513, 6234, 1385, 0, 0},
+           {23207, 11180, 4318, 0, 0},  {4580, 133, 44, 0, 0},
+           {10708, 403, 40, 0, 0},      {14666, 2078, 240, 0, 0},
+           {18572, 3904, 769, 0, 0},    {20506, 6976, 1903, 0, 0},
+           {8592, 659, 140, 0, 0},      {14488, 3087, 805, 0, 0},
+           {22563, 9065, 3104, 0, 0},   {24879, 12743, 5092, 0, 0},
+           {26708, 16025, 8798, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{27627, 25672, 24508, 0, 0}, {5582, 3746, 2979, 0, 0},
+           {26100, 20200, 17086, 0, 0}, {30596, 26587, 24130, 0, 0},
+           {31642, 29389, 28237, 0, 0}, {32325, 31407, 30514, 0, 0},
+           {6685, 1615, 332, 0, 0},     {19282, 8165, 4285, 0, 0},
+           {26260, 17928, 12858, 0, 0}, {29382, 23968, 19482, 0, 0},
+           {31238, 28446, 25714, 0, 0}, {3129, 688, 220, 0, 0},
+           {16871, 5216, 2478, 0, 0},   {24180, 12721, 7385, 0, 0},
+           {27879, 19429, 13499, 0, 0}, {30528, 25897, 22270, 0, 0},
+           {4603, 571, 251, 0, 0},      {12033, 2341, 1200, 0, 0},
+           {18443, 8097, 5076, 0, 0},   {27649, 20214, 14963, 0, 0},
+           {30958, 27327, 24507, 0, 0}, {1556, 44, 20, 0, 0},
+           {9416, 1002, 223, 0, 0},     {18099, 5198, 1709, 0, 0},
+           {24276, 11874, 5496, 0, 0},  {29124, 22574, 17564, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{30307, 25755, 23397, 0, 0}, {8019, 3168, 1782, 0, 0},
+           {23302, 13731, 10351, 0, 0}, {29184, 23488, 18368, 0, 0},
+           {31263, 28839, 27335, 0, 0}, {32091, 31268, 30032, 0, 0},
+           {8781, 2066, 651, 0, 0},     {19214, 8197, 3505, 0, 0},
+           {26557, 18212, 11613, 0, 0}, {29633, 21796, 17143, 0, 0},
+           {30333, 25641, 21341, 0, 0}, {1468, 236, 218, 0, 0},
+           {18011, 2403, 814, 0, 0},    {28363, 21156, 14215, 0, 0},
+           {32188, 28636, 25446, 0, 0}, {31073, 22599, 18644, 0, 0},
+           {2760, 486, 177, 0, 0},      {13524, 2660, 1020, 0, 0},
+           {21588, 8610, 3213, 0, 0},   {27118, 17796, 13559, 0, 0},
+           {30654, 27659, 24312, 0, 0}, {912, 52, 20, 0, 0},
+           {9756, 1104, 196, 0, 0},     {19074, 6112, 2132, 0, 0},
+           {24626, 13260, 6675, 0, 0},  {28515, 21813, 16044, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{32167, 31785, 31457, 0, 0}, {14043, 9362, 4681, 0, 0},
+           {27307, 24576, 21845, 0, 0}, {28987, 17644, 11343, 0, 0},
+           {30181, 25007, 20696, 0, 0}, {32662, 32310, 31958, 0, 0},
+           {10486, 3058, 874, 0, 0},    {24260, 11842, 6784, 0, 0},
+           {29042, 20055, 14685, 0, 0}, {31148, 25656, 21875, 0, 0},
+           {32039, 30532, 29273, 0, 0}, {2605, 294, 84, 0, 0},
+           {14464, 2304, 768, 0, 0},    {21325, 6242, 3121, 0, 0},
+           {26761, 17476, 11469, 0, 0}, {30534, 26065, 23831, 0, 0},
+           {1814, 591, 197, 0, 0},      {15405, 3206, 1692, 0, 0},
+           {23082, 10304, 5358, 0, 0},  {24576, 16384, 11378, 0, 0},
+           {31013, 24722, 21504, 0, 0}, {1600, 34, 20, 0, 0},
+           {10282, 1327, 297, 0, 0},    {19935, 7141, 3030, 0, 0},
+           {25788, 15389, 9646, 0, 0},  {29657, 23881, 19289, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+        {{{{26727, 20914, 16841, 0, 0}, {12442, 1863, 517, 0, 0},
+           {18604, 5937, 2043, 0, 0},   {23008, 12121, 6183, 0, 0},
+           {26352, 17815, 11549, 0, 0}, {29802, 25617, 21877, 0, 0},
+           {9201, 1394, 514, 0, 0},     {17790, 5352, 1822, 0, 0},
+           {23334, 12543, 6514, 0, 0},  {26110, 18210, 12233, 0, 0},
+           {28852, 24091, 19779, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {14680, 3223, 1181, 0, 0},
+           {19706, 6925, 2695, 0, 0},   {23828, 15941, 10517, 0, 0},
+           {25114, 19548, 14795, 0, 0}, {27035, 22452, 18312, 0, 0},
+           {9889, 1380, 654, 0, 0},     {17553, 4775, 1813, 0, 0},
+           {23371, 13323, 7790, 0, 0},  {29326, 22955, 17424, 0, 0},
+           {31400, 28832, 26236, 0, 0}, {7274, 735, 362, 0, 0},
+           {15996, 4805, 2050, 0, 0},   {23349, 14603, 9508, 0, 0},
+           {30091, 25267, 20971, 0, 0}, {31252, 28424, 25598, 0, 0},
+           {6212, 1314, 667, 0, 0},     {15640, 5733, 2660, 0, 0},
+           {24444, 17424, 12519, 0, 0}, {30865, 27072, 23299, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{24313, 13765, 8400, 0, 0},  {9205, 747, 164, 0, 0},
+           {16531, 3322, 833, 0, 0},    {22044, 8769, 3410, 0, 0},
+           {26043, 15240, 8352, 0, 0},  {28841, 21841, 15943, 0, 0},
+           {6455, 480, 134, 0, 0},      {15338, 2673, 673, 0, 0},
+           {21652, 8162, 3089, 0, 0},   {25573, 14384, 7499, 0, 0},
+           {28042, 19916, 13453, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {9946, 1120, 285, 0, 0},
+           {16044, 3135, 839, 0, 0},    {22507, 9735, 4043, 0, 0},
+           {25739, 14928, 8240, 0, 0},  {27901, 18882, 11266, 0, 0},
+           {7470, 876, 277, 0, 0},      {14959, 3438, 1256, 0, 0},
+           {23100, 11439, 6189, 0, 0},  {27994, 19812, 13792, 0, 0},
+           {30446, 25738, 21228, 0, 0}, {7296, 848, 225, 0, 0},
+           {14811, 3381, 1136, 0, 0},   {23572, 12175, 6368, 0, 0},
+           {28088, 20063, 13566, 0, 0}, {29851, 24312, 19332, 0, 0},
+           {6297, 709, 194, 0, 0},      {14310, 2985, 859, 0, 0},
+           {24368, 13304, 6812, 0, 0},  {28956, 21795, 15562, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{25989, 19025, 15090, 0, 0}, {7962, 971, 311, 0, 0},
+           {15152, 3721, 1396, 0, 0},   {21705, 9593, 4765, 0, 0},
+           {26247, 16658, 10444, 0, 0}, {30004, 25264, 21114, 0, 0},
+           {7502, 401, 131, 0, 0},      {13714, 2215, 593, 0, 0},
+           {20629, 7556, 2961, 0, 0},   {25457, 14606, 8064, 0, 0},
+           {29371, 23604, 18694, 0, 0}, {6780, 560, 246, 0, 0},
+           {16515, 3856, 1242, 0, 0},   {23617, 11381, 5396, 0, 0},
+           {27080, 17853, 11272, 0, 0}, {30051, 25141, 20764, 0, 0},
+           {9624, 913, 325, 0, 0},      {16698, 4277, 1443, 0, 0},
+           {24066, 12301, 6251, 0, 0},  {27525, 18812, 12401, 0, 0},
+           {30147, 25433, 21201, 0, 0}, {6132, 428, 138, 0, 0},
+           {12778, 1718, 427, 0, 0},    {19525, 6663, 2453, 0, 0},
+           {24180, 13247, 6850, 0, 0},  {28051, 21183, 15464, 0, 0},
+           {6924, 476, 186, 0, 0},      {13678, 2133, 671, 0, 0},
+           {20805, 8222, 3829, 0, 0},   {26550, 16681, 10414, 0, 0},
+           {30428, 26160, 22342, 0, 0}, {4722, 192, 74, 0, 0},
+           {11590, 1455, 472, 0, 0},    {19282, 6584, 2898, 0, 0},
+           {25619, 14897, 9045, 0, 0},  {29935, 24810, 20509, 0, 0},
+           {5058, 240, 82, 0, 0},       {12094, 1692, 500, 0, 0},
+           {20355, 7813, 3525, 0, 0},   {26092, 15841, 9671, 0, 0},
+           {29802, 24435, 19849, 0, 0}, {24576, 16384, 8192, 0, 0}},
+          {{24129, 13429, 8339, 0, 0},  {8364, 931, 243, 0, 0},
+           {15771, 3343, 984, 0, 0},    {21515, 8534, 3619, 0, 0},
+           {26017, 15374, 8740, 0, 0},  {29278, 22938, 17577, 0, 0},
+           {6485, 297, 54, 0, 0},       {13169, 1600, 326, 0, 0},
+           {19622, 5814, 1875, 0, 0},   {24554, 12180, 5878, 0, 0},
+           {28069, 19687, 13468, 0, 0}, {4556, 310, 99, 0, 0},
+           {14174, 2452, 668, 0, 0},    {21549, 8360, 3534, 0, 0},
+           {25903, 15112, 8619, 0, 0},  {29090, 22406, 16762, 0, 0},
+           {6943, 632, 152, 0, 0},      {15455, 2915, 747, 0, 0},
+           {21571, 8297, 3296, 0, 0},   {25821, 14987, 8363, 0, 0},
+           {29000, 22108, 16507, 0, 0}, {5416, 268, 62, 0, 0},
+           {11918, 1300, 299, 0, 0},    {18747, 5061, 1635, 0, 0},
+           {23804, 11020, 4930, 0, 0},  {27331, 18103, 11581, 0, 0},
+           {6464, 276, 70, 0, 0},       {12359, 1388, 383, 0, 0},
+           {19086, 5546, 2136, 0, 0},   {23794, 11532, 6083, 0, 0},
+           {28534, 21103, 15834, 0, 0}, {6495, 411, 57, 0, 0},
+           {12096, 1526, 327, 0, 0},    {18596, 5514, 1866, 0, 0},
+           {22898, 10870, 5493, 0, 0},  {27604, 19262, 13498, 0, 0},
+           {6043, 309, 40, 0, 0},       {11777, 1326, 241, 0, 0},
+           {19697, 6334, 1957, 0, 0},   {24584, 12678, 6026, 0, 0},
+           {27965, 19513, 12873, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+         {{{25213, 17826, 14267, 0, 0}, {8358, 1590, 481, 0, 0},
+           {18374, 6030, 2515, 0, 0},   {24355, 13214, 7573, 0, 0},
+           {28002, 19844, 13983, 0, 0}, {30739, 26962, 23561, 0, 0},
+           {5992, 404, 105, 0, 0},      {14036, 2801, 837, 0, 0},
+           {21763, 8982, 3916, 0, 0},   {26302, 15859, 9258, 0, 0},
+           {29724, 24130, 19349, 0, 0}, {3560, 186, 64, 0, 0},
+           {12700, 1911, 560, 0, 0},    {20765, 7683, 3173, 0, 0},
+           {25821, 15018, 8579, 0, 0},  {29523, 23665, 18761, 0, 0},
+           {5409, 303, 99, 0, 0},       {13347, 2154, 594, 0, 0},
+           {20853, 7758, 3189, 0, 0},   {25818, 15092, 8694, 0, 0},
+           {29761, 24295, 19672, 0, 0}, {3766, 92, 33, 0, 0},
+           {10666, 919, 192, 0, 0},     {18360, 4759, 1363, 0, 0},
+           {23741, 11089, 4837, 0, 0},  {28074, 20090, 14020, 0, 0},
+           {4552, 240, 86, 0, 0},       {11919, 1504, 450, 0, 0},
+           {20012, 6953, 3017, 0, 0},   {25203, 13967, 7845, 0, 0},
+           {29259, 23235, 18291, 0, 0}, {2635, 81, 29, 0, 0},
+           {9705, 858, 253, 0, 0},      {18180, 4717, 1636, 0, 0},
+           {23683, 11119, 5311, 0, 0},  {28507, 21114, 15504, 0, 0},
+           {3250, 77, 20, 0, 0},        {10317, 809, 155, 0, 0},
+           {17904, 4046, 1068, 0, 0},   {23073, 9804, 4052, 0, 0},
+           {27836, 19410, 13266, 0, 0}, {24576, 16384, 8192, 0, 0}},
+          {{26303, 15810, 11080, 0, 0}, {7569, 1254, 408, 0, 0},
+           {17994, 5619, 2161, 0, 0},   {23511, 11330, 5796, 0, 0},
+           {27045, 17585, 10886, 0, 0}, {29618, 23889, 19037, 0, 0},
+           {5779, 506, 86, 0, 0},       {15372, 2831, 683, 0, 0},
+           {21381, 7867, 2984, 0, 0},   {25479, 13947, 7220, 0, 0},
+           {29034, 22191, 16682, 0, 0}, {3040, 267, 73, 0, 0},
+           {15337, 3067, 865, 0, 0},    {22847, 9942, 4468, 0, 0},
+           {26872, 17334, 10700, 0, 0}, {29338, 23122, 18011, 0, 0},
+           {4154, 257, 63, 0, 0},       {13404, 2130, 505, 0, 0},
+           {19639, 6514, 2366, 0, 0},   {24014, 12284, 6328, 0, 0},
+           {28390, 21161, 15658, 0, 0}, {2476, 97, 24, 0, 0},
+           {10988, 1165, 267, 0, 0},    {18454, 4939, 1477, 0, 0},
+           {23157, 10441, 4505, 0, 0},  {27878, 19681, 13703, 0, 0},
+           {6906, 201, 35, 0, 0},       {11974, 718, 201, 0, 0},
+           {15525, 2143, 514, 0, 0},    {19485, 5140, 1294, 0, 0},
+           {23099, 10236, 3850, 0, 0},  {5333, 71, 20, 0, 0},
+           {7846, 378, 54, 0, 0},       {11319, 1264, 232, 0, 0},
+           {16376, 3039, 936, 0, 0},    {21076, 7884, 3692, 0, 0},
+           {8575, 478, 33, 0, 0},       {13859, 1664, 205, 0, 0},
+           {20532, 5927, 1365, 0, 0},   {24597, 10928, 3686, 0, 0},
+           {25544, 15488, 7493, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{29690, 25929, 22878, 0, 0}, {18931, 12318, 8289, 0, 0},
+           {26854, 18546, 13440, 0, 0}, {28902, 22501, 18006, 0, 0},
+           {30156, 25560, 21726, 0, 0}, {31701, 29777, 27992, 0, 0},
+           {6951, 1122, 239, 0, 0},     {19060, 6430, 2383, 0, 0},
+           {25440, 14183, 7898, 0, 0},  {28077, 19688, 13492, 0, 0},
+           {30943, 27515, 24416, 0, 0}, {3382, 453, 144, 0, 0},
+           {15608, 3767, 1408, 0, 0},   {23166, 10906, 5372, 0, 0},
+           {26853, 16996, 10620, 0, 0}, {29982, 24989, 20721, 0, 0},
+           {3522, 318, 105, 0, 0},      {14072, 2839, 950, 0, 0},
+           {22258, 9399, 4208, 0, 0},   {26539, 16269, 9643, 0, 0},
+           {30160, 25320, 21063, 0, 0}, {2015, 58, 20, 0, 0},
+           {11130, 1281, 265, 0, 0},    {19831, 5914, 1898, 0, 0},
+           {24586, 12172, 5798, 0, 0},  {29131, 22499, 17271, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{27524, 20618, 15862, 0, 0}, {12282, 5910, 3067, 0, 0},
+           {25012, 14451, 9033, 0, 0},  {29316, 23512, 19622, 0, 0},
+           {30748, 27562, 24539, 0, 0}, {30967, 27775, 24865, 0, 0},
+           {5717, 910, 237, 0, 0},      {16780, 5237, 2149, 0, 0},
+           {23580, 11284, 6049, 0, 0},  {26495, 15582, 8968, 0, 0},
+           {29660, 23413, 18004, 0, 0}, {1692, 248, 88, 0, 0},
+           {14649, 2731, 918, 0, 0},    {22524, 9799, 5296, 0, 0},
+           {28076, 18691, 13495, 0, 0}, {29074, 21091, 15212, 0, 0},
+           {2708, 187, 48, 0, 0},       {11757, 1993, 648, 0, 0},
+           {20837, 7948, 3479, 0, 0},   {25649, 15106, 8412, 0, 0},
+           {28935, 22062, 16464, 0, 0}, {814, 37, 20, 0, 0},
+           {8855, 1044, 279, 0, 0},     {17248, 4708, 1482, 0, 0},
+           {21251, 9760, 4197, 0, 0},   {26575, 18260, 12139, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{31733, 29961, 28612, 0, 0}, {19606, 14630, 11829, 0, 0},
+           {30072, 26135, 24013, 0, 0}, {31395, 28607, 25915, 0, 0},
+           {31669, 30022, 28052, 0, 0}, {32428, 31747, 31169, 0, 0},
+           {9942, 2349, 633, 0, 0},     {22373, 11006, 5826, 0, 0},
+           {28042, 20361, 15407, 0, 0}, {30321, 25688, 22175, 0, 0},
+           {31541, 29051, 26757, 0, 0}, {4612, 1344, 834, 0, 0},
+           {15853, 5014, 2395, 0, 0},   {23620, 11778, 6337, 0, 0},
+           {26818, 17253, 11620, 0, 0}, {30276, 25441, 21242, 0, 0},
+           {2166, 291, 98, 0, 0},       {12742, 2813, 1200, 0, 0},
+           {21548, 9140, 4663, 0, 0},   {26116, 15749, 9795, 0, 0},
+           {29704, 24232, 19725, 0, 0}, {999, 44, 20, 0, 0},
+           {10538, 1881, 395, 0, 0},    {20534, 7689, 3037, 0, 0},
+           {25442, 13952, 7415, 0, 0},  {28835, 21861, 16152, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+        {{{{23872, 16541, 12138, 0, 0}, {9139, 986, 241, 0, 0},
+           {17595, 5013, 1447, 0, 0},   {22610, 11535, 5386, 0, 0},
+           {26348, 17911, 11210, 0, 0}, {29499, 24613, 20122, 0, 0},
+           {7933, 759, 272, 0, 0},      {16259, 4347, 1189, 0, 0},
+           {21811, 11254, 5350, 0, 0},  {24887, 16838, 10672, 0, 0},
+           {27380, 21808, 16850, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {12023, 1995, 675, 0, 0},
+           {17568, 5547, 1907, 0, 0},   {19736, 11895, 7101, 0, 0},
+           {20483, 14105, 9274, 0, 0},  {21205, 15287, 11279, 0, 0},
+           {6508, 786, 448, 0, 0},      {17371, 4685, 1668, 0, 0},
+           {23026, 13551, 7944, 0, 0},  {29507, 23139, 17406, 0, 0},
+           {31288, 28446, 25269, 0, 0}, {5169, 512, 308, 0, 0},
+           {15911, 5109, 1994, 0, 0},   {23217, 14478, 9020, 0, 0},
+           {29716, 23835, 18665, 0, 0}, {30747, 26858, 22981, 0, 0},
+           {3763, 753, 376, 0, 0},      {15091, 5074, 1905, 0, 0},
+           {23564, 15412, 9549, 0, 0},  {30365, 25252, 19954, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{21960, 10712, 5872, 0, 0},  {7029, 455, 92, 0, 0},
+           {15480, 2565, 547, 0, 0},    {21409, 7890, 2872, 0, 0},
+           {25819, 15001, 7875, 0, 0},  {28481, 20972, 14697, 0, 0},
+           {4888, 247, 63, 0, 0},       {13730, 1764, 354, 0, 0},
+           {20204, 6423, 2000, 0, 0},   {24499, 12821, 5989, 0, 0},
+           {27094, 18111, 11094, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {7026, 449, 97, 0, 0},
+           {13211, 1604, 314, 0, 0},    {19387, 6387, 2013, 0, 0},
+           {22667, 11302, 6046, 0, 0},  {23559, 13118, 5943, 0, 0},
+           {5661, 851, 336, 0, 0},      {14712, 3875, 1565, 0, 0},
+           {22568, 11334, 6004, 0, 0},  {28108, 19855, 13266, 0, 0},
+           {30400, 25838, 20264, 0, 0}, {5808, 610, 155, 0, 0},
+           {14140, 2763, 737, 0, 0},    {22535, 10326, 4536, 0, 0},
+           {27297, 18138, 11252, 0, 0}, {29533, 22001, 15659, 0, 0},
+           {5072, 328, 76, 0, 0},       {12736, 1601, 330, 0, 0},
+           {24068, 11427, 4326, 0, 0},  {27106, 17937, 10973, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{23064, 15474, 11636, 0, 0}, {6006, 490, 135, 0, 0},
+           {14386, 3148, 949, 0, 0},    {21877, 9293, 4045, 0, 0},
+           {26410, 16185, 9459, 0, 0},  {29520, 23650, 18627, 0, 0},
+           {5564, 195, 69, 0, 0},       {12950, 1944, 439, 0, 0},
+           {20996, 7648, 2727, 0, 0},   {25773, 14735, 7729, 0, 0},
+           {29016, 22326, 16670, 0, 0}, {5546, 512, 209, 0, 0},
+           {17412, 4369, 1293, 0, 0},   {23947, 12133, 5711, 0, 0},
+           {27257, 18364, 11529, 0, 0}, {29833, 24546, 19717, 0, 0},
+           {7893, 648, 239, 0, 0},      {17535, 4503, 1323, 0, 0},
+           {24163, 12198, 5836, 0, 0},  {27337, 18355, 11572, 0, 0},
+           {29774, 24427, 19545, 0, 0}, {4567, 164, 68, 0, 0},
+           {11727, 1322, 312, 0, 0},    {19547, 6555, 2293, 0, 0},
+           {24513, 13383, 6731, 0, 0},  {27838, 20183, 13938, 0, 0},
+           {4000, 320, 141, 0, 0},      {13063, 2207, 747, 0, 0},
+           {21196, 9179, 4548, 0, 0},   {27236, 17734, 11322, 0, 0},
+           {30308, 25618, 21312, 0, 0}, {2894, 149, 69, 0, 0},
+           {11147, 1697, 567, 0, 0},    {20257, 8021, 3776, 0, 0},
+           {26487, 16373, 10020, 0, 0}, {29522, 23490, 18271, 0, 0},
+           {3053, 143, 56, 0, 0},       {11810, 1757, 485, 0, 0},
+           {21535, 9097, 3962, 0, 0},   {26756, 16640, 9900, 0, 0},
+           {29341, 22917, 17354, 0, 0}, {24576, 16384, 8192, 0, 0}},
+          {{21752, 10657, 5974, 0, 0},  {6822, 411, 91, 0, 0},
+           {14878, 2316, 516, 0, 0},    {21090, 7626, 2952, 0, 0},
+           {26048, 15234, 8184, 0, 0},  {28538, 21103, 14948, 0, 0},
+           {4368, 145, 21, 0, 0},       {11604, 1100, 193, 0, 0},
+           {19196, 5380, 1586, 0, 0},   {24534, 12018, 5410, 0, 0},
+           {27703, 18713, 11871, 0, 0}, {3787, 221, 63, 0, 0},
+           {14087, 2225, 529, 0, 0},    {21849, 8693, 3482, 0, 0},
+           {26337, 15569, 8691, 0, 0},  {28949, 22304, 16150, 0, 0},
+           {5898, 301, 75, 0, 0},       {13727, 1937, 421, 0, 0},
+           {20974, 7557, 2752, 0, 0},   {25880, 14749, 7798, 0, 0},
+           {28398, 20405, 13776, 0, 0}, {3190, 98, 24, 0, 0},
+           {9609, 761, 155, 0, 0},      {17453, 4099, 1092, 0, 0},
+           {23470, 10161, 3986, 0, 0},  {26624, 16855, 9800, 0, 0},
+           {4658, 269, 99, 0, 0},       {11194, 1831, 753, 0, 0},
+           {20009, 7950, 4041, 0, 0},   {26223, 16007, 9726, 0, 0},
+           {29119, 22171, 15935, 0, 0}, {4605, 216, 40, 0, 0},
+           {10667, 1299, 304, 0, 0},    {19608, 7296, 2625, 0, 0},
+           {25465, 14084, 7300, 0, 0},  {27527, 18793, 11813, 0, 0},
+           {4368, 137, 24, 0, 0},       {10664, 975, 165, 0, 0},
+           {19211, 6197, 1922, 0, 0},   {25019, 12907, 6093, 0, 0},
+           {27895, 18738, 11534, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+         {{{22968, 15133, 11695, 0, 0}, {6615, 883, 241, 0, 0},
+           {17730, 4916, 1762, 0, 0},   {24050, 12204, 6282, 0, 0},
+           {27640, 18692, 12254, 0, 0}, {30132, 25202, 20843, 0, 0},
+           {5217, 264, 67, 0, 0},       {14458, 2714, 668, 0, 0},
+           {22557, 9348, 3686, 0, 0},   {26546, 15892, 8852, 0, 0},
+           {29306, 22814, 17270, 0, 0}, {2777, 135, 47, 0, 0},
+           {12885, 2017, 567, 0, 0},    {21627, 8584, 3483, 0, 0},
+           {26348, 15828, 8994, 0, 0},  {29376, 23015, 17650, 0, 0},
+           {4303, 152, 56, 0, 0},       {12918, 2066, 524, 0, 0},
+           {21785, 8744, 3545, 0, 0},   {26474, 15998, 9186, 0, 0},
+           {29524, 23485, 18259, 0, 0}, {2745, 51, 20, 0, 0},
+           {9828, 736, 142, 0, 0},      {18486, 4840, 1295, 0, 0},
+           {24206, 11441, 4854, 0, 0},  {27922, 19375, 12849, 0, 0},
+           {2787, 178, 73, 0, 0},       {12303, 1805, 602, 0, 0},
+           {21289, 9189, 4573, 0, 0},   {26852, 17120, 10695, 0, 0},
+           {29737, 24163, 19370, 0, 0}, {1622, 77, 29, 0, 0},
+           {9662, 1044, 324, 0, 0},     {18985, 6030, 2329, 0, 0},
+           {24916, 13300, 6961, 0, 0},  {28908, 21644, 15915, 0, 0},
+           {1754, 44, 20, 0, 0},        {9139, 659, 140, 0, 0},
+           {18021, 4653, 1365, 0, 0},   {24223, 11526, 5290, 0, 0},
+           {28194, 19987, 13701, 0, 0}, {24576, 16384, 8192, 0, 0}},
+          {{23583, 13074, 8080, 0, 0},  {6687, 783, 147, 0, 0},
+           {16753, 3768, 981, 0, 0},    {22226, 9078, 3562, 0, 0},
+           {26036, 14823, 8091, 0, 0},  {28852, 21729, 16046, 0, 0},
+           {4544, 202, 24, 0, 0},       {13668, 1630, 283, 0, 0},
+           {20240, 6148, 1889, 0, 0},   {25027, 12491, 5883, 0, 0},
+           {28202, 19923, 13778, 0, 0}, {2835, 175, 50, 0, 0},
+           {15098, 2435, 613, 0, 0},    {22383, 9168, 3859, 0, 0},
+           {26525, 16532, 10361, 0, 0}, {28792, 22379, 16751, 0, 0},
+           {4391, 207, 30, 0, 0},       {13402, 1593, 286, 0, 0},
+           {19441, 5593, 1674, 0, 0},   {24510, 11999, 5625, 0, 0},
+           {28065, 19570, 13241, 0, 0}, {1682, 62, 20, 0, 0},
+           {9915, 866, 185, 0, 0},      {18009, 4582, 1349, 0, 0},
+           {23484, 10386, 4420, 0, 0},  {27183, 17576, 10900, 0, 0},
+           {4477, 116, 22, 0, 0},       {12919, 661, 197, 0, 0},
+           {17934, 5950, 3554, 0, 0},   {22462, 10174, 4096, 0, 0},
+           {26153, 15384, 9384, 0, 0},  {3821, 164, 23, 0, 0},
+           {7143, 479, 122, 0, 0},      {14010, 4096, 1365, 0, 0},
+           {22751, 9338, 4245, 0, 0},   {25906, 17499, 10637, 0, 0},
+           {8835, 259, 29, 0, 0},       {12841, 1273, 137, 0, 0},
+           {20865, 6745, 2147, 0, 0},   {25742, 12674, 5516, 0, 0},
+           {26770, 14662, 8331, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{28312, 21494, 17235, 0, 0}, {11549, 3689, 1152, 0, 0},
+           {21595, 8994, 4201, 0, 0},   {25486, 14475, 8505, 0, 0},
+           {27878, 19482, 13653, 0, 0}, {30878, 27260, 24109, 0, 0},
+           {6117, 632, 121, 0, 0},      {18138, 4514, 1313, 0, 0},
+           {24052, 11481, 5373, 0, 0},  {27153, 17437, 10760, 0, 0},
+           {30093, 25068, 20618, 0, 0}, {2814, 242, 78, 0, 0},
+           {16642, 3786, 1135, 0, 0},   {23738, 11407, 5416, 0, 0},
+           {27357, 17975, 11497, 0, 0}, {29825, 24346, 19605, 0, 0},
+           {3229, 167, 38, 0, 0},       {14643, 2383, 567, 0, 0},
+           {22346, 8678, 3300, 0, 0},   {26300, 15281, 8330, 0, 0},
+           {29798, 24115, 19237, 0, 0}, {1856, 53, 20, 0, 0},
+           {12102, 1395, 271, 0, 0},    {20259, 6128, 1851, 0, 0},
+           {24710, 12139, 5478, 0, 0},  {28537, 20762, 14716, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{22566, 12135, 7284, 0, 0},  {5432, 1323, 416, 0, 0},
+           {20348, 8384, 4216, 0, 0},   {25120, 14653, 8912, 0, 0},
+           {27106, 18427, 12866, 0, 0}, {29157, 22440, 17378, 0, 0},
+           {1823, 152, 32, 0, 0},       {14086, 2263, 515, 0, 0},
+           {21255, 7432, 2565, 0, 0},   {25319, 13316, 6620, 0, 0},
+           {28286, 19717, 13882, 0, 0}, {746, 78, 21, 0, 0},
+           {14190, 2267, 622, 0, 0},    {21519, 9400, 4137, 0, 0},
+           {27123, 15810, 10610, 0, 0}, {27759, 21324, 16131, 0, 0},
+           {1411, 58, 20, 0, 0},        {11216, 1274, 264, 0, 0},
+           {18877, 5091, 1428, 0, 0},   {23717, 10670, 4596, 0, 0},
+           {27578, 19391, 13282, 0, 0}, {404, 28, 20, 0, 0},
+           {7929, 861, 217, 0, 0},      {15608, 3989, 1072, 0, 0},
+           {20316, 8631, 3166, 0, 0},   {26603, 17379, 10291, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{30193, 25487, 21691, 0, 0}, {18766, 11902, 7366, 0, 0},
+           {26425, 17712, 13110, 0, 0}, {28294, 20910, 15727, 0, 0},
+           {29903, 24469, 20234, 0, 0}, {31424, 28819, 26377, 0, 0},
+           {8048, 1529, 309, 0, 0},     {20183, 7412, 2800, 0, 0},
+           {25587, 14522, 8324, 0, 0},  {27743, 19101, 12883, 0, 0},
+           {30247, 25464, 21163, 0, 0}, {2860, 516, 184, 0, 0},
+           {15347, 3612, 1193, 0, 0},   {22879, 10580, 4986, 0, 0},
+           {26890, 17121, 10645, 0, 0}, {29954, 24103, 19445, 0, 0},
+           {2585, 200, 55, 0, 0},       {14240, 2573, 719, 0, 0},
+           {21786, 8162, 3111, 0, 0},   {25811, 14603, 7537, 0, 0},
+           {29260, 22650, 17300, 0, 0}, {1007, 32, 20, 0, 0},
+           {11727, 1440, 222, 0, 0},    {20200, 6036, 1602, 0, 0},
+           {24716, 12048, 5035, 0, 0},  {28432, 20576, 14372, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+        {{{{25706, 16296, 10449, 0, 0}, {8230, 507, 94, 0, 0},
+           {19093, 4727, 989, 0, 0},    {24178, 12094, 5137, 0, 0},
+           {27083, 18093, 10755, 0, 0}, {29113, 22870, 17037, 0, 0},
+           {6275, 350, 110, 0, 0},      {16392, 3426, 678, 0, 0},
+           {22174, 10119, 3798, 0, 0},  {24592, 15598, 8465, 0, 0},
+           {27163, 20074, 13629, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {8880, 866, 226, 0, 0},
+           {14156, 3081, 781, 0, 0},    {16523, 7916, 3519, 0, 0},
+           {17003, 10160, 5209, 0, 0},  {12873, 8069, 5258, 0, 0},
+           {4367, 556, 311, 0, 0},      {17494, 4943, 1788, 0, 0},
+           {23404, 14640, 8436, 0, 0},  {30485, 24575, 17686, 0, 0},
+           {31540, 28796, 24887, 0, 0}, {3313, 299, 148, 0, 0},
+           {14787, 4523, 1380, 0, 0},   {21847, 12670, 6528, 0, 0},
+           {29025, 20939, 14111, 0, 0}, {30394, 23175, 17053, 0, 0},
+           {1700, 302, 133, 0, 0},      {12447, 3196, 797, 0, 0},
+           {21997, 12513, 5649, 0, 0},  {29973, 22358, 15407, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{23448, 10666, 4928, 0, 0},  {5711, 304, 44, 0, 0},
+           {16437, 2500, 459, 0, 0},    {22449, 8833, 3048, 0, 0},
+           {26579, 16320, 8662, 0, 0},  {29179, 21884, 13960, 0, 0},
+           {3742, 144, 20, 0, 0},       {13542, 1261, 181, 0, 0},
+           {20076, 5847, 1565, 0, 0},   {25719, 13236, 5133, 0, 0},
+           {25041, 17099, 9516, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {4712, 143, 20, 0, 0},
+           {10385, 693, 99, 0, 0},      {17351, 5670, 1019, 0, 0},
+           {14641, 6275, 5578, 0, 0},   {27307, 16384, 10923, 0, 0},
+           {4786, 677, 184, 0, 0},      {13723, 2900, 796, 0, 0},
+           {22371, 10502, 4836, 0, 0},  {26778, 19071, 11268, 0, 0},
+           {30976, 25856, 17664, 0, 0}, {4570, 267, 50, 0, 0},
+           {11234, 1247, 199, 0, 0},    {21659, 7551, 2751, 0, 0},
+           {27097, 17644, 6617, 0, 0},  {28087, 18725, 14043, 0, 0},
+           {4080, 188, 27, 0, 0},       {10192, 689, 107, 0, 0},
+           {22141, 10627, 4428, 0, 0},  {23406, 18725, 4681, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{25014, 15820, 10626, 0, 0}, {7098, 438, 77, 0, 0},
+           {17105, 3543, 774, 0, 0},    {22890, 9480, 3610, 0, 0},
+           {26349, 15680, 8432, 0, 0},  {28909, 21765, 15729, 0, 0},
+           {5206, 173, 43, 0, 0},       {15193, 2180, 369, 0, 0},
+           {21949, 7930, 2459, 0, 0},   {25644, 14082, 6852, 0, 0},
+           {28289, 20080, 13428, 0, 0}, {4383, 292, 95, 0, 0},
+           {17462, 3763, 830, 0, 0},    {23831, 11153, 4446, 0, 0},
+           {26786, 17165, 9982, 0, 0},  {29148, 22501, 16632, 0, 0},
+           {5488, 304, 101, 0, 0},      {17161, 3608, 764, 0, 0},
+           {23677, 10633, 4028, 0, 0},  {26536, 16136, 8748, 0, 0},
+           {28721, 21391, 15096, 0, 0}, {3548, 138, 50, 0, 0},
+           {13118, 1548, 306, 0, 0},    {19718, 6456, 1941, 0, 0},
+           {23540, 11898, 5300, 0, 0},  {26622, 17619, 10797, 0, 0},
+           {2599, 287, 145, 0, 0},      {15556, 3457, 1214, 0, 0},
+           {22857, 11457, 5886, 0, 0},  {28281, 19454, 12396, 0, 0},
+           {30198, 24996, 19879, 0, 0}, {1844, 155, 60, 0, 0},
+           {13278, 2562, 661, 0, 0},    {21536, 8770, 3492, 0, 0},
+           {25999, 14813, 7733, 0, 0},  {28370, 20145, 13554, 0, 0},
+           {2159, 141, 46, 0, 0},       {13398, 2186, 481, 0, 0},
+           {22311, 9149, 3359, 0, 0},   {26325, 15131, 7934, 0, 0},
+           {28123, 19532, 12662, 0, 0}, {24576, 16384, 8192, 0, 0}},
+          {{24142, 12497, 6552, 0, 0},  {6061, 362, 57, 0, 0},
+           {15769, 2439, 482, 0, 0},    {21323, 7645, 2482, 0, 0},
+           {26357, 13940, 7167, 0, 0},  {25967, 20310, 12520, 0, 0},
+           {2850, 86, 20, 0, 0},        {12119, 1029, 150, 0, 0},
+           {19889, 4995, 1187, 0, 0},   {24872, 11017, 4524, 0, 0},
+           {27508, 17898, 9070, 0, 0},  {3516, 175, 37, 0, 0},
+           {15696, 2308, 474, 0, 0},    {22115, 8625, 3403, 0, 0},
+           {26232, 15278, 8785, 0, 0},  {27839, 19598, 12683, 0, 0},
+           {4631, 250, 53, 0, 0},       {14597, 1984, 361, 0, 0},
+           {21331, 7332, 2309, 0, 0},   {25516, 14234, 6592, 0, 0},
+           {28642, 19415, 11790, 0, 0}, {1606, 42, 20, 0, 0},
+           {9751, 546, 67, 0, 0},       {17139, 3535, 722, 0, 0},
+           {23381, 10147, 3288, 0, 0},  {25846, 15152, 7758, 0, 0},
+           {3930, 503, 154, 0, 0},      {13067, 2562, 848, 0, 0},
+           {21554, 10358, 4835, 0, 0},  {27448, 18591, 9734, 0, 0},
+           {27719, 19887, 14941, 0, 0}, {5284, 297, 34, 0, 0},
+           {11692, 1242, 207, 0, 0},    {20061, 6465, 1557, 0, 0},
+           {24599, 11046, 4549, 0, 0},  {26723, 13362, 5726, 0, 0},
+           {5015, 196, 23, 0, 0},       {11936, 890, 115, 0, 0},
+           {19518, 5412, 1094, 0, 0},   {25050, 11260, 2910, 0, 0},
+           {25559, 14418, 7209, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{24892, 15867, 11027, 0, 0}, {8767, 870, 143, 0, 0},
+           {18239, 4809, 1317, 0, 0},   {24495, 11950, 5510, 0, 0},
+           {27490, 18095, 11258, 0, 0}, {29785, 23925, 18729, 0, 0},
+           {4752, 194, 36, 0, 0},       {15297, 2462, 467, 0, 0},
+           {22544, 8705, 3040, 0, 0},   {26166, 14814, 7716, 0, 0},
+           {28766, 21183, 15009, 0, 0}, {2578, 134, 29, 0, 0},
+           {15271, 2486, 498, 0, 0},    {22539, 9039, 3230, 0, 0},
+           {26424, 15557, 8328, 0, 0},  {28919, 21579, 15660, 0, 0},
+           {4198, 185, 42, 0, 0},       {15247, 2607, 530, 0, 0},
+           {22615, 9203, 3390, 0, 0},   {26313, 15427, 8325, 0, 0},
+           {28861, 21726, 15744, 0, 0}, {2079, 53, 20, 0, 0},
+           {11222, 928, 158, 0, 0},     {19221, 5187, 1309, 0, 0},
+           {23856, 11011, 4459, 0, 0},  {27220, 17688, 10722, 0, 0},
+           {1985, 228, 83, 0, 0},       {15228, 3240, 1100, 0, 0},
+           {22608, 11300, 5985, 0, 0},  {28044, 19375, 12714, 0, 0},
+           {30066, 24594, 19666, 0, 0}, {1120, 82, 26, 0, 0},
+           {11814, 1674, 431, 0, 0},    {20348, 7070, 2589, 0, 0},
+           {25464, 13448, 6520, 0, 0},  {28402, 20507, 13904, 0, 0},
+           {1187, 45, 20, 0, 0},        {11395, 1182, 243, 0, 0},
+           {20024, 6143, 1883, 0, 0},   {25337, 12446, 5818, 0, 0},
+           {28076, 19445, 12657, 0, 0}, {24576, 16384, 8192, 0, 0}},
+          {{24935, 14399, 8673, 0, 0},  {6118, 495, 66, 0, 0},
+           {16397, 2807, 577, 0, 0},    {21713, 8686, 3139, 0, 0},
+           {25876, 14124, 7368, 0, 0},  {27762, 19711, 13528, 0, 0},
+           {2934, 102, 20, 0, 0},       {13191, 1433, 198, 0, 0},
+           {20515, 6259, 1646, 0, 0},   {24777, 11996, 5057, 0, 0},
+           {27091, 16858, 9709, 0, 0},  {2659, 236, 48, 0, 0},
+           {16021, 2602, 516, 0, 0},    {22634, 9226, 3584, 0, 0},
+           {26977, 16592, 9212, 0, 0},  {28406, 22354, 15484, 0, 0},
+           {3276, 142, 20, 0, 0},       {12874, 1366, 243, 0, 0},
+           {19826, 5697, 1899, 0, 0},   {24422, 11552, 5363, 0, 0},
+           {26196, 15681, 8909, 0, 0},  {733, 33, 20, 0, 0},
+           {9811, 930, 150, 0, 0},      {18044, 4196, 996, 0, 0},
+           {22404, 8769, 3215, 0, 0},   {25764, 14335, 7113, 0, 0},
+           {5240, 491, 87, 0, 0},       {15809, 1597, 672, 0, 0},
+           {22282, 9175, 4806, 0, 0},   {24576, 16384, 9557, 0, 0},
+           {23831, 14895, 11916, 0, 0}, {5053, 766, 153, 0, 0},
+           {17695, 3277, 1092, 0, 0},   {21504, 8192, 4096, 0, 0},
+           {30427, 14043, 9362, 0, 0},  {25486, 14564, 7282, 0, 0},
+           {4221, 555, 111, 0, 0},      {11980, 2995, 529, 0, 0},
+           {25988, 11299, 2260, 0, 0},  {26810, 17873, 8937, 0, 0},
+           {16384, 10923, 5461, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{26776, 18464, 13003, 0, 0}, {10156, 1530, 312, 0, 0},
+           {19312, 5606, 1681, 0, 0},   {24767, 12706, 6264, 0, 0},
+           {27600, 18663, 12004, 0, 0}, {30136, 24997, 20383, 0, 0},
+           {5734, 424, 59, 0, 0},       {16918, 3353, 771, 0, 0},
+           {23274, 9992, 3927, 0, 0},   {26617, 15938, 8799, 0, 0},
+           {29307, 22729, 17046, 0, 0}, {2634, 199, 37, 0, 0},
+           {17130, 3346, 823, 0, 0},    {23618, 10903, 4550, 0, 0},
+           {27121, 17049, 10092, 0, 0}, {29366, 22996, 17291, 0, 0},
+           {4238, 182, 33, 0, 0},       {15629, 2470, 476, 0, 0},
+           {22568, 8729, 3083, 0, 0},   {26349, 15094, 7982, 0, 0},
+           {29224, 22543, 16944, 0, 0}, {1435, 42, 20, 0, 0},
+           {12150, 1281, 224, 0, 0},    {19867, 5551, 1536, 0, 0},
+           {24144, 11034, 4597, 0, 0},  {27664, 18577, 12020, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{21562, 11678, 6207, 0, 0},  {4009, 489, 97, 0, 0},
+           {18597, 4816, 1199, 0, 0},   {23025, 9861, 3627, 0, 0},
+           {25897, 14882, 7900, 0, 0},  {27808, 19616, 13453, 0, 0},
+           {1691, 107, 20, 0, 0},       {13368, 1573, 253, 0, 0},
+           {20016, 5910, 1728, 0, 0},   {24398, 10670, 4177, 0, 0},
+           {27311, 17395, 10470, 0, 0}, {1071, 62, 20, 0, 0},
+           {14908, 2111, 435, 0, 0},    {20258, 7956, 3507, 0, 0},
+           {26588, 13644, 8046, 0, 0},  {27727, 19220, 14809, 0, 0},
+           {1216, 52, 20, 0, 0},        {10860, 999, 145, 0, 0},
+           {18298, 4567, 1203, 0, 0},   {23275, 9786, 4160, 0, 0},
+           {25910, 15528, 8631, 0, 0},  {225, 16, 12, 0, 0},
+           {8482, 671, 102, 0, 0},      {16810, 3551, 744, 0, 0},
+           {22561, 8534, 2810, 0, 0},   {25839, 14463, 7116, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}}},
+         {{{28631, 21921, 17086, 0, 0}, {14944, 5767, 2710, 0, 0},
+           {22564, 9972, 4477, 0, 0},   {26692, 16833, 10643, 0, 0},
+           {28916, 21831, 15952, 0, 0}, {30516, 26444, 22637, 0, 0},
+           {6928, 752, 106, 0, 0},      {17659, 4500, 1237, 0, 0},
+           {23383, 10537, 4428, 0, 0},  {26686, 16096, 9289, 0, 0},
+           {29450, 23341, 18087, 0, 0}, {2174, 194, 50, 0, 0},
+           {15932, 3216, 909, 0, 0},    {23212, 10226, 4412, 0, 0},
+           {26463, 16043, 9228, 0, 0},  {29392, 22873, 17584, 0, 0},
+           {3385, 151, 23, 0, 0},       {13877, 1959, 367, 0, 0},
+           {21080, 6826, 2081, 0, 0},   {25300, 13299, 6117, 0, 0},
+           {28859, 21410, 15756, 0, 0}, {1204, 32, 20, 0, 0},
+           {11862, 1157, 168, 0, 0},    {19577, 5147, 1231, 0, 0},
+           {24000, 10739, 4092, 0, 0},  {27689, 18659, 11862, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0},  {24576, 16384, 8192, 0, 0}},
+          {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}};
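+// Note: rows of {24576, 16384, 8192, 0, 0} above are flat defaults (uniform
+// steps of 8192 = 32768 / 4 across the four symbols); they appear to fill
+// contexts for which no trained starting probabilities are provided.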
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseRangeCdf
+    [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
+    [kCoeffBaseRangeContexts][kCoeffBaseRangeSymbolCount + 1] = {
+        {{{{18470, 12050, 8594, 0, 0},  {20232, 13167, 8979, 0, 0},
+           {24056, 17717, 13265, 0, 0}, {26598, 21441, 17334, 0, 0},
+           {28026, 23842, 20230, 0, 0}, {28965, 25451, 22222, 0, 0},
+           {31072, 29451, 27897, 0, 0}, {18376, 12817, 10012, 0, 0},
+           {16790, 9550, 5950, 0, 0},   {20581, 13294, 8879, 0, 0},
+           {23592, 17128, 12509, 0, 0}, {25700, 20113, 15740, 0, 0},
+           {27112, 22326, 18296, 0, 0}, {30188, 27776, 25524, 0, 0},
+           {20632, 14719, 11342, 0, 0}, {18984, 12047, 8287, 0, 0},
+           {21932, 15147, 10868, 0, 0}, {24396, 18324, 13921, 0, 0},
+           {26245, 20989, 16768, 0, 0}, {27431, 22870, 19008, 0, 0},
+           {29734, 26908, 24306, 0, 0}},
+          {{16801, 9863, 6482, 0, 0},   {19234, 12114, 8189, 0, 0},
+           {23264, 16676, 12233, 0, 0}, {25793, 20200, 15865, 0, 0},
+           {27404, 22677, 18748, 0, 0}, {28411, 24398, 20911, 0, 0},
+           {30262, 27834, 25550, 0, 0}, {9736, 3953, 1832, 0, 0},
+           {13228, 6064, 3049, 0, 0},   {17610, 9799, 5671, 0, 0},
+           {21360, 13903, 9118, 0, 0},  {23883, 17320, 12518, 0, 0},
+           {25660, 19915, 15352, 0, 0}, {28537, 24727, 21288, 0, 0},
+           {12945, 6278, 3612, 0, 0},   {13878, 6839, 3836, 0, 0},
+           {17108, 9277, 5335, 0, 0},   {20621, 12992, 8280, 0, 0},
+           {23040, 15994, 11119, 0, 0}, {24849, 18491, 13702, 0, 0},
+           {27328, 22598, 18583, 0, 0}}},
+         {{{18362, 11906, 8354, 0, 0},  {20944, 13861, 9659, 0, 0},
+           {24511, 18375, 13965, 0, 0}, {26908, 22021, 17990, 0, 0},
+           {28293, 24282, 20784, 0, 0}, {29162, 25814, 22725, 0, 0},
+           {31032, 29358, 27720, 0, 0}, {18338, 12722, 9886, 0, 0},
+           {17175, 9869, 6059, 0, 0},   {20666, 13400, 8957, 0, 0},
+           {23709, 17184, 12506, 0, 0}, {25769, 20165, 15720, 0, 0},
+           {27084, 22271, 18215, 0, 0}, {29946, 27330, 24906, 0, 0},
+           {16983, 11183, 8409, 0, 0},  {14421, 7539, 4502, 0, 0},
+           {17794, 10281, 6379, 0, 0},  {21345, 14087, 9497, 0, 0},
+           {23905, 17418, 12760, 0, 0}, {25615, 19916, 15490, 0, 0},
+           {29061, 25732, 22786, 0, 0}},
+          {{17308, 11072, 7299, 0, 0},  {20598, 13519, 9577, 0, 0},
+           {24045, 17741, 13436, 0, 0}, {26340, 21064, 16894, 0, 0},
+           {27846, 23476, 19716, 0, 0}, {28629, 25073, 21758, 0, 0},
+           {30477, 28260, 26170, 0, 0}, {12912, 5848, 2940, 0, 0},
+           {14845, 7479, 3976, 0, 0},   {18490, 10800, 6471, 0, 0},
+           {21858, 14632, 9818, 0, 0},  {24345, 17953, 13141, 0, 0},
+           {25997, 20485, 15994, 0, 0}, {28694, 25018, 21687, 0, 0},
+           {12916, 6694, 4096, 0, 0},   {13397, 6658, 3779, 0, 0},
+           {16503, 8895, 5105, 0, 0},   {20010, 12390, 7816, 0, 0},
+           {22673, 15670, 10807, 0, 0}, {24518, 18140, 13317, 0, 0},
+           {27563, 23023, 19146, 0, 0}}},
+         {{{22205, 16535, 13005, 0, 0}, {22974, 16746, 12964, 0, 0},
+           {26018, 20823, 17009, 0, 0}, {27805, 23582, 20016, 0, 0},
+           {28923, 25333, 22141, 0, 0}, {29717, 26683, 23934, 0, 0},
+           {31457, 30172, 28938, 0, 0}, {21522, 16364, 13079, 0, 0},
+           {20453, 13857, 10037, 0, 0}, {22211, 15673, 11479, 0, 0},
+           {24632, 18762, 14519, 0, 0}, {26420, 21294, 17203, 0, 0},
+           {27572, 23113, 19368, 0, 0}, {30419, 28242, 26181, 0, 0},
+           {19431, 14038, 11199, 0, 0}, {13462, 6697, 3886, 0, 0},
+           {16816, 9228, 5514, 0, 0},   {20359, 12834, 8338, 0, 0},
+           {23008, 16062, 11379, 0, 0}, {24764, 18548, 13950, 0, 0},
+           {28630, 24974, 21807, 0, 0}},
+          {{21898, 16084, 11819, 0, 0}, {23104, 17538, 14088, 0, 0},
+           {25882, 20659, 17360, 0, 0}, {27943, 23868, 20463, 0, 0},
+           {29138, 25606, 22454, 0, 0}, {29732, 26339, 23381, 0, 0},
+           {31097, 29472, 27828, 0, 0}, {18949, 13609, 9742, 0, 0},
+           {20784, 13660, 9648, 0, 0},  {22078, 15558, 11105, 0, 0},
+           {24784, 18614, 14435, 0, 0}, {25900, 20474, 16644, 0, 0},
+           {27494, 23774, 19900, 0, 0}, {29780, 26997, 24344, 0, 0},
+           {13032, 6121, 3627, 0, 0},   {13835, 6698, 3784, 0, 0},
+           {16989, 9720, 5568, 0, 0},   {20130, 12707, 8236, 0, 0},
+           {22076, 15223, 10548, 0, 0}, {23551, 17517, 12714, 0, 0},
+           {27690, 23484, 20174, 0, 0}}},
+         {{{30437, 29106, 27524, 0, 0}, {29877, 27997, 26623, 0, 0},
+           {28170, 25145, 23039, 0, 0}, {29248, 25923, 23569, 0, 0},
+           {29351, 26649, 23444, 0, 0}, {30167, 27356, 25383, 0, 0},
+           {32168, 31595, 31024, 0, 0}, {25096, 19482, 15299, 0, 0},
+           {28536, 24976, 21975, 0, 0}, {29853, 27451, 25371, 0, 0},
+           {30450, 28412, 26616, 0, 0}, {30641, 28768, 27214, 0, 0},
+           {30918, 29290, 27493, 0, 0}, {31791, 30835, 29925, 0, 0},
+           {14488, 8381, 4779, 0, 0},   {16916, 10097, 6583, 0, 0},
+           {18923, 11817, 7979, 0, 0},  {21713, 14802, 10639, 0, 0},
+           {23630, 17346, 12967, 0, 0}, {25314, 19623, 15312, 0, 0},
+           {29398, 26375, 23755, 0, 0}},
+          {{26926, 23539, 21930, 0, 0}, {30455, 29277, 28492, 0, 0},
+           {29770, 26664, 25272, 0, 0}, {30348, 25321, 22900, 0, 0},
+           {29734, 24273, 21845, 0, 0}, {28692, 23831, 21793, 0, 0},
+           {31682, 30398, 29469, 0, 0}, {23054, 15514, 12324, 0, 0},
+           {24225, 19070, 15645, 0, 0}, {27850, 23761, 20858, 0, 0},
+           {28639, 25236, 22215, 0, 0}, {30404, 27235, 24710, 0, 0},
+           {30934, 29222, 27205, 0, 0}, {31295, 29860, 28635, 0, 0},
+           {17363, 11575, 7149, 0, 0},  {17077, 10816, 6207, 0, 0},
+           {19806, 13574, 8603, 0, 0},  {22496, 14913, 10639, 0, 0},
+           {24180, 17498, 12050, 0, 0}, {24086, 18099, 13268, 0, 0},
+           {27898, 23132, 19563, 0, 0}}},
+         {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}},
+          {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}}}},
+        {{{{17773, 11427, 8019, 0, 0},  {19610, 12479, 8167, 0, 0},
+           {23827, 17442, 12892, 0, 0}, {26471, 21227, 16961, 0, 0},
+           {27951, 23739, 19992, 0, 0}, {29037, 25495, 22141, 0, 0},
+           {30921, 29151, 27414, 0, 0}, {18296, 13109, 10425, 0, 0},
+           {15962, 8606, 5235, 0, 0},   {19868, 12364, 8055, 0, 0},
+           {23357, 16656, 11971, 0, 0}, {25712, 20071, 15620, 0, 0},
+           {27224, 22429, 18308, 0, 0}, {29814, 27064, 24449, 0, 0},
+           {20304, 14697, 11414, 0, 0}, {17286, 10240, 6734, 0, 0},
+           {20698, 13499, 9144, 0, 0},  {23815, 17362, 12662, 0, 0},
+           {25741, 20038, 15548, 0, 0}, {26881, 21855, 17628, 0, 0},
+           {28975, 25490, 22321, 0, 0}},
+          {{17197, 10536, 7019, 0, 0},  {18262, 11193, 7394, 0, 0},
+           {22579, 15679, 11199, 0, 0}, {25452, 19467, 14853, 0, 0},
+           {26985, 21856, 17578, 0, 0}, {28008, 23613, 19680, 0, 0},
+           {29775, 26802, 23994, 0, 0}, {9344, 3865, 1990, 0, 0},
+           {11993, 5102, 2478, 0, 0},   {16294, 8358, 4469, 0, 0},
+           {20297, 12588, 7781, 0, 0},  {23358, 16281, 11329, 0, 0},
+           {25232, 19154, 14239, 0, 0}, {27720, 23182, 19219, 0, 0},
+           {11678, 5478, 3012, 0, 0},   {11972, 5366, 2742, 0, 0},
+           {14949, 7283, 3799, 0, 0},   {18908, 10859, 6306, 0, 0},
+           {21766, 14274, 9239, 0, 0},  {23815, 16839, 11871, 0, 0},
+           {26320, 20850, 16314, 0, 0}}},
+         {{{16769, 10560, 7319, 0, 0},  {19718, 12780, 8646, 0, 0},
+           {24174, 17904, 13390, 0, 0}, {26735, 21689, 17530, 0, 0},
+           {28214, 24085, 20421, 0, 0}, {29096, 25629, 22431, 0, 0},
+           {30868, 28997, 27192, 0, 0}, {16980, 11428, 8819, 0, 0},
+           {15943, 8533, 5010, 0, 0},   {19895, 12366, 7958, 0, 0},
+           {23178, 16405, 11674, 0, 0}, {25416, 19559, 15035, 0, 0},
+           {26808, 21779, 17584, 0, 0}, {29536, 26534, 23761, 0, 0},
+           {17007, 12052, 9544, 0, 0},  {13450, 6779, 4009, 0, 0},
+           {17239, 9674, 5839, 0, 0},   {21106, 13779, 9127, 0, 0},
+           {23813, 17200, 12402, 0, 0}, {25487, 19662, 15060, 0, 0},
+           {28520, 24709, 21328, 0, 0}},
+          {{17869, 11551, 8265, 0, 0},  {19249, 12485, 8721, 0, 0},
+           {23339, 16802, 12403, 0, 0}, {26068, 20413, 16116, 0, 0},
+           {27680, 23064, 19052, 0, 0}, {28525, 24614, 21037, 0, 0},
+           {30066, 27404, 24907, 0, 0}, {10023, 4380, 2314, 0, 0},
+           {12533, 5622, 2846, 0, 0},   {16872, 9053, 5131, 0, 0},
+           {20928, 13418, 8637, 0, 0},  {23646, 16836, 11888, 0, 0},
+           {25280, 19187, 14406, 0, 0}, {27654, 23200, 19398, 0, 0},
+           {11923, 6215, 3836, 0, 0},   {11787, 5396, 2884, 0, 0},
+           {14987, 7433, 3983, 0, 0},   {19008, 11060, 6471, 0, 0},
+           {21793, 14353, 9403, 0, 0},  {23723, 16979, 12082, 0, 0},
+           {26638, 21569, 17345, 0, 0}}},
+         {{{19219, 13044, 9610, 0, 0},  {20924, 14386, 10522, 0, 0},
+           {24849, 19149, 14995, 0, 0}, {27282, 22625, 18822, 0, 0},
+           {28602, 24785, 21444, 0, 0}, {29404, 26262, 23341, 0, 0},
+           {31170, 29608, 28094, 0, 0}, {17487, 11789, 8987, 0, 0},
+           {17829, 10649, 6816, 0, 0},  {21405, 14361, 9956, 0, 0},
+           {24159, 17911, 13398, 0, 0}, {26031, 20584, 16288, 0, 0},
+           {27262, 22505, 18506, 0, 0}, {29778, 26982, 24388, 0, 0},
+           {12519, 7515, 5351, 0, 0},   {11698, 5250, 2767, 0, 0},
+           {15914, 8299, 4694, 0, 0},   {19904, 12282, 7768, 0, 0},
+           {22806, 15790, 10990, 0, 0}, {24694, 18430, 13720, 0, 0},
+           {28274, 24289, 20862, 0, 0}},
+          {{18808, 13151, 9939, 0, 0},  {21618, 15427, 11540, 0, 0},
+           {25618, 19804, 15578, 0, 0}, {27437, 22766, 18901, 0, 0},
+           {28601, 25024, 21711, 0, 0}, {29288, 26139, 23122, 0, 0},
+           {30885, 28984, 27082, 0, 0}, {14016, 7108, 3856, 0, 0},
+           {15800, 8182, 4738, 0, 0},   {19248, 11713, 7455, 0, 0},
+           {22315, 15142, 10488, 0, 0}, {24382, 18263, 13652, 0, 0},
+           {26026, 20173, 15760, 0, 0}, {28495, 24628, 21269, 0, 0},
+           {10648, 4941, 2535, 0, 0},   {12205, 5410, 2873, 0, 0},
+           {15692, 8124, 4615, 0, 0},   {19406, 11826, 7459, 0, 0},
+           {21974, 14803, 10073, 0, 0}, {23754, 17116, 12449, 0, 0},
+           {27060, 22256, 18271, 0, 0}}},
+         {{{27063, 21838, 17043, 0, 0}, {24822, 20003, 16653, 0, 0},
+           {25967, 20645, 16542, 0, 0}, {27306, 22633, 18568, 0, 0},
+           {28579, 24757, 21261, 0, 0}, {29577, 26539, 23360, 0, 0},
+           {31711, 30631, 29556, 0, 0}, {22750, 15701, 11277, 0, 0},
+           {25388, 20186, 16315, 0, 0}, {26700, 21923, 18429, 0, 0},
+           {27670, 23570, 20213, 0, 0}, {28456, 24758, 21649, 0, 0},
+           {29068, 25802, 22987, 0, 0}, {31075, 29442, 27881, 0, 0},
+           {14011, 7838, 4994, 0, 0},   {15120, 8172, 4951, 0, 0},
+           {18061, 10716, 6742, 0, 0},  {21048, 13916, 9476, 0, 0},
+           {23411, 16816, 12243, 0, 0}, {24958, 19015, 14558, 0, 0},
+           {28889, 25435, 22440, 0, 0}},
+          {{24490, 19526, 16846, 0, 0}, {22221, 16901, 13849, 0, 0},
+           {23662, 16926, 12159, 0, 0}, {25935, 19761, 15550, 0, 0},
+           {27957, 23056, 18845, 0, 0}, {28783, 25416, 21640, 0, 0},
+           {31080, 29310, 27506, 0, 0}, {19817, 10907, 6258, 0, 0},
+           {22980, 16724, 12492, 0, 0}, {26459, 21524, 17898, 0, 0},
+           {27585, 23419, 20202, 0, 0}, {28379, 24539, 21276, 0, 0},
+           {29135, 25823, 22148, 0, 0}, {29168, 25921, 22861, 0, 0},
+           {11020, 4631, 2513, 0, 0},   {13332, 6187, 3208, 0, 0},
+           {16409, 8567, 4815, 0, 0},   {18807, 11075, 6897, 0, 0},
+           {21224, 14082, 9446, 0, 0},  {23396, 16306, 11816, 0, 0},
+           {26630, 21558, 17378, 0, 0}}},
+         {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}},
+          {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}}}},
+        {{{{16630, 10545, 7259, 0, 0},  {17421, 10338, 6436, 0, 0},
+           {23154, 16032, 11436, 0, 0}, {26168, 20493, 15861, 0, 0},
+           {27957, 23344, 19221, 0, 0}, {29020, 24959, 21348, 0, 0},
+           {30514, 28181, 25878, 0, 0}, {17572, 12484, 9591, 0, 0},
+           {14451, 7299, 4317, 0, 0},   {18850, 11117, 6926, 0, 0},
+           {22716, 15618, 10773, 0, 0}, {25269, 19138, 14181, 0, 0},
+           {26610, 21351, 16765, 0, 0}, {28754, 24983, 21516, 0, 0},
+           {17720, 11701, 8384, 0, 0},  {14566, 7422, 4215, 0, 0},
+           {18466, 10749, 6412, 0, 0},  {21929, 14629, 9602, 0, 0},
+           {24053, 17024, 11962, 0, 0}, {25232, 19192, 14224, 0, 0},
+           {27355, 22433, 18270, 0, 0}},
+          {{15374, 8267, 4873, 0, 0},   {16879, 9348, 5583, 0, 0},
+           {21207, 13635, 8898, 0, 0},  {24483, 17956, 12924, 0, 0},
+           {26272, 20725, 16218, 0, 0}, {27997, 23194, 19091, 0, 0},
+           {29165, 25938, 22624, 0, 0}, {11112, 5064, 2568, 0, 0},
+           {11444, 4853, 2257, 0, 0},   {15441, 7432, 3771, 0, 0},
+           {19351, 11387, 6735, 0, 0},  {22636, 15343, 10430, 0, 0},
+           {24188, 17752, 13135, 0, 0}, {27074, 21291, 16357, 0, 0},
+           {8652, 2988, 1318, 0, 0},    {8915, 3073, 1177, 0, 0},
+           {12683, 5154, 2340, 0, 0},   {17442, 8433, 4193, 0, 0},
+           {20954, 13296, 7958, 0, 0},  {22547, 14157, 8001, 0, 0},
+           {25079, 18210, 12447, 0, 0}}},
+         {{{16554, 10388, 6998, 0, 0},  {18555, 11464, 7473, 0, 0},
+           {23555, 16945, 12313, 0, 0}, {26373, 21010, 16629, 0, 0},
+           {27989, 23581, 19702, 0, 0}, {28947, 25267, 21815, 0, 0},
+           {30475, 28201, 25973, 0, 0}, {16909, 11485, 8948, 0, 0},
+           {14364, 7166, 4042, 0, 0},   {18443, 10788, 6562, 0, 0},
+           {22099, 14831, 10048, 0, 0}, {24471, 18126, 13321, 0, 0},
+           {26022, 20379, 15875, 0, 0}, {28444, 24517, 20998, 0, 0},
+           {16236, 11137, 8293, 0, 0},  {12101, 5618, 3100, 0, 0},
+           {16040, 8258, 4593, 0, 0},   {19907, 12123, 7436, 0, 0},
+           {22692, 15407, 10351, 0, 0}, {24373, 17828, 12805, 0, 0},
+           {27037, 22085, 17856, 0, 0}},
+          {{18335, 11613, 7830, 0, 0},  {18110, 11052, 7223, 0, 0},
+           {22845, 15944, 11211, 0, 0}, {25786, 19716, 15047, 0, 0},
+           {27349, 22265, 17718, 0, 0}, {27916, 23606, 19754, 0, 0},
+           {29497, 26373, 23138, 0, 0}, {10558, 4935, 2659, 0, 0},
+           {12018, 5400, 2947, 0, 0},   {15874, 7940, 4195, 0, 0},
+           {19521, 11492, 7011, 0, 0},  {22730, 15503, 10205, 0, 0},
+           {24181, 17821, 12441, 0, 0}, {27123, 21397, 17516, 0, 0},
+           {10741, 5242, 3054, 0, 0},   {9670, 3622, 1547, 0, 0},
+           {12882, 5427, 2496, 0, 0},   {17159, 9021, 4722, 0, 0},
+           {20775, 12703, 7829, 0, 0},  {23131, 14501, 9097, 0, 0},
+           {25143, 18967, 13624, 0, 0}}},
+         {{{18330, 11970, 8679, 0, 0},  {20147, 13565, 9671, 0, 0},
+           {24591, 18643, 14366, 0, 0}, {27094, 22267, 18312, 0, 0},
+           {28532, 24529, 21035, 0, 0}, {29321, 26018, 22962, 0, 0},
+           {30782, 28818, 26904, 0, 0}, {16560, 10669, 7838, 0, 0},
+           {16231, 8743, 5183, 0, 0},   {19988, 12387, 7901, 0, 0},
+           {23001, 16156, 11352, 0, 0}, {25082, 19030, 14370, 0, 0},
+           {26435, 21154, 16804, 0, 0}, {28827, 25197, 21932, 0, 0},
+           {9949, 5346, 3566, 0, 0},    {10544, 4254, 2047, 0, 0},
+           {15108, 7335, 3855, 0, 0},   {19194, 11286, 6766, 0, 0},
+           {22139, 14791, 9830, 0, 0},  {24156, 17470, 12503, 0, 0},
+           {27161, 22277, 18172, 0, 0}},
+          {{19199, 12968, 9562, 0, 0},  {19640, 12844, 8899, 0, 0},
+           {24439, 17927, 13365, 0, 0}, {26638, 21792, 17711, 0, 0},
+           {28086, 23929, 20250, 0, 0}, {29112, 25359, 22180, 0, 0},
+           {30191, 27669, 25356, 0, 0}, {10341, 4084, 2183, 0, 0},
+           {11855, 5018, 2629, 0, 0},   {16928, 8659, 4934, 0, 0},
+           {20460, 12739, 8199, 0, 0},  {22552, 15983, 11310, 0, 0},
+           {24459, 18565, 13655, 0, 0}, {26725, 21600, 17461, 0, 0},
+           {9602, 3867, 1770, 0, 0},    {10869, 4363, 2017, 0, 0},
+           {14355, 6677, 3325, 0, 0},   {17535, 9654, 5416, 0, 0},
+           {20085, 12296, 7480, 0, 0},  {22066, 14509, 9359, 0, 0},
+           {24643, 18304, 13542, 0, 0}}},
+         {{{23728, 17982, 14408, 0, 0}, {22789, 17050, 13353, 0, 0},
+           {24855, 18850, 14457, 0, 0}, {26909, 21879, 17584, 0, 0},
+           {28175, 24091, 20258, 0, 0}, {28948, 25372, 21977, 0, 0},
+           {31038, 29297, 27576, 0, 0}, {20965, 14403, 10059, 0, 0},
+           {21349, 14710, 10543, 0, 0}, {23350, 16994, 12525, 0, 0},
+           {25229, 19443, 15111, 0, 0}, {26535, 21451, 17384, 0, 0},
+           {27631, 23112, 19223, 0, 0}, {29791, 26994, 24419, 0, 0},
+           {11561, 5522, 3128, 0, 0},   {13221, 6190, 3271, 0, 0},
+           {16599, 8897, 5078, 0, 0},   {19948, 12310, 7750, 0, 0},
+           {22544, 15436, 10554, 0, 0}, {24242, 17720, 12884, 0, 0},
+           {27731, 23358, 19650, 0, 0}},
+          {{20429, 15439, 12628, 0, 0}, {19263, 12873, 9543, 0, 0},
+           {22921, 15824, 11204, 0, 0}, {25488, 19512, 14420, 0, 0},
+           {28056, 22759, 18314, 0, 0}, {28407, 24854, 20291, 0, 0},
+           {29898, 27140, 24773, 0, 0}, {12707, 7264, 4242, 0, 0},
+           {17533, 9890, 6623, 0, 0},   {19783, 12810, 8613, 0, 0},
+           {22986, 16127, 11365, 0, 0}, {23312, 16408, 12008, 0, 0},
+           {25913, 19828, 14211, 0, 0}, {27107, 22204, 17766, 0, 0},
+           {7112, 2166, 874, 0, 0},     {10198, 3661, 1676, 0, 0},
+           {13851, 6345, 3227, 0, 0},   {16828, 9119, 5014, 0, 0},
+           {19965, 12187, 7549, 0, 0},  {21686, 14073, 9392, 0, 0},
+           {24829, 18395, 13763, 0, 0}}},
+         {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}},
+          {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}}}},
+        {{{{14453, 8479, 5217, 0, 0},   {15914, 8700, 4933, 0, 0},
+           {22628, 14841, 9595, 0, 0},  {26046, 19786, 14501, 0, 0},
+           {28107, 22942, 18062, 0, 0}, {28936, 24603, 20474, 0, 0},
+           {29973, 26670, 23523, 0, 0}, {15623, 9442, 6096, 0, 0},
+           {12035, 5088, 2460, 0, 0},   {16736, 8307, 4222, 0, 0},
+           {21115, 12675, 7687, 0, 0},  {23478, 16339, 10682, 0, 0},
+           {24972, 18170, 12786, 0, 0}, {26266, 20390, 15327, 0, 0},
+           {11087, 5036, 2448, 0, 0},   {10379, 3724, 1507, 0, 0},
+           {13741, 6037, 2681, 0, 0},   {18029, 9013, 4144, 0, 0},
+           {21410, 11990, 7257, 0, 0},  {21773, 14695, 8578, 0, 0},
+           {23606, 17778, 12151, 0, 0}},
+          {{11343, 4816, 2380, 0, 0},   {14706, 6930, 3734, 0, 0},
+           {20812, 12887, 7960, 0, 0},  {25050, 17768, 11788, 0, 0},
+           {27066, 21514, 16625, 0, 0}, {27870, 23680, 15904, 0, 0},
+           {29089, 25992, 20861, 0, 0}, {9474, 2608, 1105, 0, 0},
+           {8371, 2872, 932, 0, 0},     {13523, 5640, 2175, 0, 0},
+           {19566, 12943, 6364, 0, 0},  {21190, 13471, 8811, 0, 0},
+           {24695, 19471, 11398, 0, 0}, {27307, 21845, 13023, 0, 0},
+           {5401, 2247, 834, 0, 0},     {7864, 2097, 828, 0, 0},
+           {9693, 4308, 1469, 0, 0},    {18368, 9110, 2351, 0, 0},
+           {18883, 8886, 4443, 0, 0},   {18022, 9830, 4915, 0, 0},
+           {27307, 16384, 5461, 0, 0}}},
+         {{{14494, 7955, 4878, 0, 0},   {17231, 9619, 5765, 0, 0},
+           {23319, 16028, 10941, 0, 0}, {26068, 20270, 15507, 0, 0},
+           {27780, 22902, 18570, 0, 0}, {28532, 24621, 20866, 0, 0},
+           {29901, 26908, 24114, 0, 0}, {15644, 9597, 6667, 0, 0},
+           {12372, 5291, 2620, 0, 0},   {16195, 8139, 4276, 0, 0},
+           {20019, 11922, 7094, 0, 0},  {22535, 14890, 9950, 0, 0},
+           {24243, 17436, 12405, 0, 0}, {26485, 21136, 16513, 0, 0},
+           {12302, 6257, 3482, 0, 0},   {9709, 3594, 1577, 0, 0},
+           {13287, 5505, 2527, 0, 0},   {17310, 9137, 4631, 0, 0},
+           {20352, 12160, 7075, 0, 0},  {22507, 14757, 9507, 0, 0},
+           {24752, 18113, 13102, 0, 0}},
+          {{15152, 8182, 4656, 0, 0},   {16959, 9469, 5613, 0, 0},
+           {22001, 13878, 8975, 0, 0},  {25041, 18513, 13903, 0, 0},
+           {26639, 20842, 15886, 0, 0}, {28286, 23064, 17907, 0, 0},
+           {29491, 25316, 21246, 0, 0}, {9812, 4217, 2038, 0, 0},
+           {10044, 3831, 1807, 0, 0},   {14301, 6444, 3188, 0, 0},
+           {19534, 12055, 7119, 0, 0},  {21587, 15176, 10287, 0, 0},
+           {24477, 14410, 8192, 0, 0},  {25200, 20887, 17784, 0, 0},
+           {7820, 3767, 1621, 0, 0},    {7094, 2149, 617, 0, 0},
+           {11927, 5975, 3165, 0, 0},   {18099, 8412, 4102, 0, 0},
+           {21434, 9175, 4549, 0, 0},   {23846, 18006, 9895, 0, 0},
+           {24467, 19224, 12233, 0, 0}}},
+         {{{15655, 9035, 5687, 0, 0},   {18629, 11362, 7316, 0, 0},
+           {24216, 17766, 12992, 0, 0}, {26897, 21648, 17390, 0, 0},
+           {28313, 24152, 20515, 0, 0}, {29299, 25858, 22382, 0, 0},
+           {30513, 28215, 25986, 0, 0}, {14544, 8392, 5715, 0, 0},
+           {13478, 6058, 3154, 0, 0},   {17832, 9777, 5584, 0, 0},
+           {21530, 13817, 9006, 0, 0},  {23982, 17151, 12180, 0, 0},
+           {25451, 19540, 14765, 0, 0}, {27667, 23256, 19275, 0, 0},
+           {10129, 4546, 2558, 0, 0},   {9552, 3437, 1461, 0, 0},
+           {13693, 6006, 2873, 0, 0},   {17754, 9655, 5311, 0, 0},
+           {20830, 12911, 8016, 0, 0},  {22826, 15488, 10486, 0, 0},
+           {25601, 19624, 15016, 0, 0}},
+          {{16948, 10030, 6280, 0, 0},  {19238, 11883, 7552, 0, 0},
+           {24373, 17238, 12316, 0, 0}, {26194, 20447, 16388, 0, 0},
+           {27415, 22349, 18200, 0, 0}, {28155, 24322, 20387, 0, 0},
+           {29328, 25610, 22865, 0, 0}, {8521, 3717, 1544, 0, 0},
+           {10650, 4710, 2399, 0, 0},   {16270, 8000, 4379, 0, 0},
+           {19848, 11593, 6631, 0, 0},  {22038, 14149, 7416, 0, 0},
+           {22581, 16489, 9977, 0, 0},  {23458, 18137, 10641, 0, 0},
+           {7798, 2210, 711, 0, 0},     {7967, 2826, 1070, 0, 0},
+           {10336, 4315, 1913, 0, 0},   {13714, 7088, 3188, 0, 0},
+           {18376, 9732, 4659, 0, 0},   {20273, 11821, 6118, 0, 0},
+           {20326, 12442, 6554, 0, 0}}},
+         {{{20606, 13983, 10120, 0, 0}, {20019, 13071, 8962, 0, 0},
+           {24188, 17471, 12422, 0, 0}, {26599, 21019, 16225, 0, 0},
+           {27932, 23377, 19320, 0, 0}, {28947, 25057, 21155, 0, 0},
+           {30540, 28167, 25698, 0, 0}, {16449, 8043, 4488, 0, 0},
+           {17070, 9491, 5600, 0, 0},   {20042, 12400, 7721, 0, 0},
+           {22856, 15753, 10792, 0, 0}, {24880, 18548, 13589, 0, 0},
+           {25991, 20484, 15750, 0, 0}, {28276, 24178, 20516, 0, 0},
+           {9519, 3864, 1821, 0, 0},    {11718, 4860, 2256, 0, 0},
+           {15328, 7428, 3819, 0, 0},   {18709, 10750, 6227, 0, 0},
+           {21480, 13865, 8870, 0, 0},  {23357, 16426, 11340, 0, 0},
+           {26490, 21180, 16824, 0, 0}},
+          {{18787, 12701, 9542, 0, 0},  {15846, 9188, 5985, 0, 0},
+           {21763, 13729, 8281, 0, 0},  {25379, 18550, 12970, 0, 0},
+           {27170, 21263, 15562, 0, 0}, {26678, 21555, 17109, 0, 0},
+           {28948, 25397, 22649, 0, 0}, {11686, 5843, 3093, 0, 0},
+           {11506, 4141, 1640, 0, 0},   {14376, 6314, 2331, 0, 0},
+           {17898, 9858, 5672, 0, 0},   {20148, 13284, 7860, 0, 0},
+           {23478, 16215, 9966, 0, 0},  {26100, 18480, 12764, 0, 0},
+           {5064, 1713, 819, 0, 0},     {8059, 2790, 980, 0, 0},
+           {11100, 3504, 1111, 0, 0},   {14473, 5800, 2694, 0, 0},
+           {16369, 8346, 3455, 0, 0},   {18421, 9742, 4664, 0, 0},
+           {20398, 12962, 8291, 0, 0}}},
+         {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}},
+          {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+           {24576, 16384, 8192, 0, 0}}}}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultDcSignCdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+                                [kDcSignContexts][kBooleanFieldCdfSize] = {
+  {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+    {15488, 0, 0}}},
+  {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+    {15488, 0, 0}}},
+  {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+    {15488, 0, 0}}},
+  {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+    {15488, 0, 0}}}
+};
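+// Note: all four coefficient quantizer contexts above carry identical default
+// DC sign values; only the plane type and the DC sign context vary.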
+/* clang-format on */
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultRestorationTypeCdf[kRestorationTypeSymbolCount + 1] = {23355, 10187,
+                                                                   0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultUseWienerCdf[kBooleanFieldCdfSize] = {21198, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultUseSgrProjCdf[kBooleanFieldCdfSize] = {15913, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultHasPaletteYCdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
+                          [kBooleanFieldCdfSize] = {
+                              {{1092, 0, 0}, {29349, 0, 0}, {31507, 0, 0}},
+                              {{856, 0, 0}, {29909, 0, 0}, {31788, 0, 0}},
+                              {{945, 0, 0}, {29368, 0, 0}, {31987, 0, 0}},
+                              {{738, 0, 0}, {29207, 0, 0}, {31864, 0, 0}},
+                              {{459, 0, 0}, {25431, 0, 0}, {31306, 0, 0}},
+                              {{503, 0, 0}, {28753, 0, 0}, {31247, 0, 0}},
+                              {{318, 0, 0}, {24822, 0, 0}, {32639, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultPaletteYSizeCdf[kPaletteBlockSizeContexts]
+                           [kPaletteSizeSymbolCount + 1] = {
+                               {24816, 19768, 14619, 11290, 7241, 3527, 0, 0},
+                               {25629, 21347, 16573, 13224, 9102, 4695, 0, 0},
+                               {24980, 20027, 15443, 12268, 8453, 4238, 0, 0},
+                               {24497, 18704, 14522, 11204, 7697, 4235, 0, 0},
+                               {20043, 13588, 10905, 7929, 5233, 2648, 0, 0},
+                               {23057, 17880, 15845, 11716, 7107, 4893, 0, 0},
+                               {17828, 11971, 11090, 8582, 5735, 3769, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultHasPaletteUVCdf[kPaletteUVModeContexts][kBooleanFieldCdfSize] = {
+        {307, 0, 0}, {11280, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultPaletteUVSizeCdf[kPaletteBlockSizeContexts]
+                            [kPaletteSizeSymbolCount + 1] = {
+                                {24055, 12789, 5640, 3159, 1437, 496, 0, 0},
+                                {26929, 17195, 9187, 5821, 2920, 1068, 0, 0},
+                                {28342, 21508, 14769, 11285, 6905, 3338, 0, 0},
+                                {29540, 23304, 17775, 14679, 10245, 5348, 0, 0},
+                                {29000, 23882, 19677, 14916, 10273, 5561, 0, 0},
+                                {30304, 24317, 19907, 11136, 7243, 4213, 0, 0},
+                                {31499, 27333, 22335, 13805, 11068, 6903, 0,
+                                 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPaletteColorIndexCdf
+    [kNumPlaneTypes][kPaletteSizeSymbolCount][kPaletteColorIndexContexts]
+    [kPaletteColorIndexSymbolCount + 1] = {
+        {{{4058, 0, 0},
+          {16384, 0, 0},
+          {22215, 0, 0},
+          {5732, 0, 0},
+          {1165, 0, 0}},
+         {{4891, 2278, 0, 0},
+          {21236, 7071, 0, 0},
+          {26224, 2534, 0, 0},
+          {9750, 4696, 0, 0},
+          {853, 383, 0, 0}},
+         {{7196, 4722, 2723, 0, 0},
+          {23290, 11178, 5512, 0, 0},
+          {25520, 5931, 2944, 0, 0},
+          {13601, 8282, 4419, 0, 0},
+          {1368, 943, 518, 0, 0}},
+         {{7989, 5813, 4192, 2486, 0, 0},
+          {24099, 12404, 8695, 4675, 0, 0},
+          {28513, 5203, 3391, 1701, 0, 0},
+          {12904, 9094, 6052, 3238, 0, 0},
+          {1122, 875, 621, 342, 0, 0}},
+         {{9636, 7361, 5798, 4333, 2695, 0, 0},
+          {25325, 15526, 12051, 8006, 4786, 0, 0},
+          {26468, 7906, 5824, 3984, 2097, 0, 0},
+          {13852, 9873, 7501, 5333, 3116, 0, 0},
+          {1498, 1218, 960, 709, 415, 0, 0}},
+         {{9663, 7569, 6304, 5084, 3837, 2450, 0, 0},
+          {25818, 17321, 13816, 10087, 7201, 4205, 0, 0},
+          {25208, 9294, 7278, 5565, 3847, 2060, 0, 0},
+          {14224, 10395, 8311, 6573, 4649, 2723, 0, 0},
+          {1570, 1317, 1098, 886, 645, 377, 0, 0}},
+         {{11079, 8885, 7605, 6416, 5262, 3941, 2573, 0, 0},
+          {25876, 17383, 14928, 11162, 8481, 6015, 3564, 0, 0},
+          {27117, 9586, 7726, 6250, 4786, 3376, 1868, 0, 0},
+          {13419, 10190, 8350, 6774, 5244, 3737, 2320, 0, 0},
+          {1740, 1498, 1264, 1063, 841, 615, 376, 0, 0}}},
+        {{{3679, 0, 0},
+          {16384, 0, 0},
+          {24055, 0, 0},
+          {3511, 0, 0},
+          {1158, 0, 0}},
+         {{7511, 3623, 0, 0},
+          {20481, 5475, 0, 0},
+          {25735, 4808, 0, 0},
+          {12623, 7363, 0, 0},
+          {2160, 1129, 0, 0}},
+         {{8558, 5593, 2865, 0, 0},
+          {22880, 10382, 5554, 0, 0},
+          {26867, 6715, 3475, 0, 0},
+          {14450, 10616, 4435, 0, 0},
+          {2309, 1632, 842, 0, 0}},
+         {{9788, 7289, 4987, 2782, 0, 0},
+          {24355, 11360, 7909, 3894, 0, 0},
+          {30511, 3319, 2174, 1170, 0, 0},
+          {13579, 11566, 6853, 4148, 0, 0},
+          {924, 724, 487, 250, 0, 0}},
+         {{10551, 8201, 6131, 4085, 2220, 0, 0},
+          {25461, 16362, 13132, 8136, 4344, 0, 0},
+          {28327, 7704, 5889, 3826, 1849, 0, 0},
+          {15558, 12240, 9449, 6018, 3186, 0, 0},
+          {2094, 1815, 1372, 1033, 561, 0, 0}},
+         {{11529, 9600, 7724, 5806, 4063, 2262, 0, 0},
+          {26223, 17756, 14764, 10951, 7265, 4067, 0, 0},
+          {29320, 6473, 5331, 4064, 2642, 1326, 0, 0},
+          {16879, 14445, 11064, 8070, 5792, 3078, 0, 0},
+          {1780, 1564, 1289, 1034, 785, 443, 0, 0}},
+         {{11326, 9480, 8010, 6522, 5119, 3788, 2205, 0, 0},
+          {26905, 17835, 15216, 12100, 9085, 6357, 3495, 0, 0},
+          {29353, 6958, 5891, 4778, 3545, 2374, 1150, 0, 0},
+          {14803, 12684, 10536, 8794, 6494, 4366, 2378, 0, 0},
+          {1578, 1439, 1252, 1089, 943, 742, 446, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultIsInterCdf[kIsInterContexts][kBooleanFieldCdfSize] = {
+        {31962, 0, 0}, {16106, 0, 0}, {12582, 0, 0}, {6230, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultUseCompoundReferenceCdf[kUseCompoundReferenceContexts]
+                                   [kBooleanFieldCdfSize] = {{5940, 0, 0},
+                                                             {8733, 0, 0},
+                                                             {20737, 0, 0},
+                                                             {22128, 0, 0},
+                                                             {29867, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultCompoundReferenceTypeCdf[kCompoundReferenceTypeContexts]
+                                    [kBooleanFieldCdfSize] = {{31570, 0, 0},
+                                                              {30698, 0, 0},
+                                                              {23602, 0, 0},
+                                                              {25269, 0, 0},
+                                                              {10293, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundReferenceCdf
+    [kNumCompoundReferenceTypes][kReferenceContexts][3][kBooleanFieldCdfSize] =
+        {{{{27484, 0, 0}, {28903, 0, 0}, {29640, 0, 0}},
+          {{9616, 0, 0}, {18595, 0, 0}, {17498, 0, 0}},
+          {{994, 0, 0}, {7648, 0, 0}, {6058, 0, 0}}},
+         {{{27822, 0, 0}, {23300, 0, 0}, {31265, 0, 0}},
+          {{12877, 0, 0}, {10327, 0, 0}, {17608, 0, 0}},
+          {{2037, 0, 0}, {1709, 0, 0}, {5224, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultCompoundBackwardReferenceCdf[kReferenceContexts][2]
+                                        [kBooleanFieldCdfSize] = {
+                                            {{30533, 0, 0}, {31345, 0, 0}},
+                                            {{15586, 0, 0}, {17593, 0, 0}},
+                                            {{2162, 0, 0}, {2279, 0, 0}}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultSingleReferenceCdf[kReferenceContexts][6]
+                                         [kBooleanFieldCdfSize] = {
+  {{27871, 0, 0}, {31213, 0, 0}, {28532, 0, 0}, {24118, 0, 0}, {31864, 0, 0},
+   {31324, 0, 0}},
+  {{15795, 0, 0}, {16017, 0, 0}, {13121, 0, 0}, {7995, 0, 0}, {21754, 0, 0},
+   {17681, 0, 0}},
+  {{3024, 0, 0}, {2489, 0, 0}, {1574, 0, 0}, {873, 0, 0}, {5893, 0, 0},
+   {2464, 0, 0}}};
+/* clang-format on */
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundPredictionModeCdf
+    [kCompoundPredictionModeContexts][kNumCompoundInterPredictionModes + 1] = {
+        {25008, 18945, 16960, 15127, 13612, 12102, 5877, 0, 0},
+        {22038, 13316, 11623, 10019, 8729, 7637, 4044, 0, 0},
+        {22104, 12547, 11180, 9862, 8473, 7381, 4332, 0, 0},
+        {19470, 15784, 12297, 8586, 7701, 7032, 6346, 0, 0},
+        {13864, 9443, 7526, 5336, 4870, 4510, 2010, 0, 0},
+        {22043, 15314, 12644, 9948, 8573, 7600, 6722, 0, 0},
+        {15643, 8495, 6954, 5276, 4554, 4064, 2176, 0, 0},
+        {19722, 9554, 8263, 6826, 5333, 4326, 3438, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultNewMvCdf[kNewMvContexts][kBooleanFieldCdfSize] = {
+        {8733, 0, 0},  {16138, 0, 0}, {17429, 0, 0},
+        {24382, 0, 0}, {20546, 0, 0}, {28092, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultZeroMvCdf[kZeroMvContexts][kBooleanFieldCdfSize] = {{30593, 0, 0},
+                                                                {31714, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultReferenceMvCdf[kReferenceMvContexts][kBooleanFieldCdfSize] = {
+        {8794, 0, 0}, {8580, 0, 0}, {14920, 0, 0},
+        {4146, 0, 0}, {8456, 0, 0}, {12845, 0, 0}};
+
+// This is called drl_mode in the spec, where DRL stands for Dynamic Reference
+// List.
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultRefMvIndexCdf[kRefMvIndexContexts][kBooleanFieldCdfSize] = {
+        {19664, 0, 0}, {8208, 0, 0}, {13823, 0, 0}};
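+// An illustrative reading of these boolean-field CDFs: each stored value is
+// 32768 minus the spec's cumulative probability, so {19664, 0, 0} encodes
+// P(bit == 0) of roughly (32768 - 19664) / 32768 ~= 0.40; the remaining zeros
+// are the CDF terminator and the adaptation counter. This is consistent with
+// the "32768 - 15588" expectations in symbol_decoder_context_test.cc below.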
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultIsInterIntraCdf[kInterIntraContexts][kBooleanFieldCdfSize] = {
+        {5881, 0, 0}, {5171, 0, 0}, {2531, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultInterIntraModeCdf[kInterIntraContexts][kNumInterIntraModes + 1] = {
+        {30893, 21686, 5436, 0, 0},
+        {30295, 22772, 6380, 0, 0},
+        {28530, 21231, 6842, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultIsWedgeInterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+        {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+        {12732, 0, 0}, {7811, 0, 0},  {16384, 0, 0}, {16384, 0, 0},
+        {6064, 0, 0},  {5238, 0, 0},  {3204, 0, 0},  {16384, 0, 0},
+        {16384, 0, 0}, {3324, 0, 0},  {5896, 0, 0},  {16384, 0, 0},
+        {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+        {16384, 0, 0}, {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultWedgeIndexCdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1] = {
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {30330, 28328, 26169, 24105, 21763, 19894, 17017, 14674, 12409, 10406,
+         8641, 7066, 5016, 3318, 1597, 0, 0},
+        {31962, 29502, 26763, 26030, 25550, 25401, 24997, 18180, 16445, 15401,
+         14316, 13346, 9929, 6641, 3139, 0, 0},
+        {32614, 31781, 30843, 30717, 30680, 30657, 30617, 9735, 9065, 8484,
+         7783, 7084, 5509, 3885, 1857, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {29989, 29030, 28085, 25555, 24993, 24751, 24113, 18411, 14829, 11436,
+         8248, 5298, 3312, 2239, 1112, 0, 0},
+        {31084, 29143, 27093, 25660, 23466, 21494, 18339, 15624, 13605, 11807,
+         9884, 8297, 6049, 4054, 1891, 0, 0},
+        {31626, 29277, 26491, 25454, 24679, 24413, 23745, 19144, 17399, 16038,
+         14654, 13455, 10247, 6756, 3218, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {31633, 31446, 31275, 30133, 30072, 30031, 29998, 11752, 9833, 7711,
+         5517, 3595, 2679, 1808, 835, 0, 0},
+        {30026, 28573, 27041, 24733, 23788, 23432, 22622, 18644, 15498, 12235,
+         9334, 6796, 4824, 3198, 1352, 0, 0},
+        {31041, 28820, 26667, 24972, 22927, 20424, 17002, 13824, 12130, 10730,
+         8805, 7457, 5780, 4002, 1756, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0},
+        {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+         10240, 8192, 6144, 4096, 2048, 0, 0}};
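+// Note: the repeated rows stepping down by 2048 (= 32768 / 16) above are flat
+// distributions over the 16 wedge indices, most likely placeholders for block
+// sizes where wedge compound prediction is unavailable.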
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultUseObmcCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+        {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+        {22331, 0, 0}, {23397, 0, 0}, {9104, 0, 0},  {16384, 0, 0},
+        {23467, 0, 0}, {15336, 0, 0}, {18345, 0, 0}, {8760, 0, 0},
+        {11867, 0, 0}, {17626, 0, 0}, {6951, 0, 0},  {9945, 0, 0},
+        {5889, 0, 0},  {10685, 0, 0}, {2640, 0, 0},  {1754, 0, 0},
+        {1208, 0, 0},  {130, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMotionModeCdf[kMaxBlockSizes][kNumMotionModes + 1] = {
+        {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+        {21845, 10923, 0, 0}, {25117, 8008, 0, 0},  {28030, 8003, 0, 0},
+        {3969, 1378, 0, 0},   {21845, 10923, 0, 0}, {27377, 7240, 0, 0},
+        {13349, 5958, 0, 0},  {27645, 9162, 0, 0},  {3795, 1174, 0, 0},
+        {6337, 1994, 0, 0},   {21162, 8460, 0, 0},  {6508, 3652, 0, 0},
+        {12408, 4706, 0, 0},  {3026, 1565, 0, 0},   {11089, 5938, 0, 0},
+        {3252, 2067, 0, 0},   {3870, 2371, 0, 0},   {1890, 1433, 0, 0},
+        {261, 210, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultIsExplicitCompoundTypeCdf[kIsExplicitCompoundTypeContexts]
+                                     [kBooleanFieldCdfSize] = {
+                                         {6161, 0, 0},  {9877, 0, 0},
+                                         {13928, 0, 0}, {8174, 0, 0},
+                                         {12834, 0, 0}, {10094, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultIsCompoundTypeAverageCdf[kIsCompoundTypeAverageContexts]
+                                    [kBooleanFieldCdfSize] = {
+                                        {14524, 0, 0}, {19903, 0, 0},
+                                        {25715, 0, 0}, {19509, 0, 0},
+                                        {23434, 0, 0}, {28124, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultCompoundTypeCdf[kMaxBlockSizes]
+                           [kNumExplicitCompoundPredictionTypes + 1] = {
+                               {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+                               {16384, 0, 0}, {9337, 0, 0},  {19597, 0, 0},
+                               {20948, 0, 0}, {16384, 0, 0}, {21298, 0, 0},
+                               {22998, 0, 0}, {23668, 0, 0}, {16384, 0, 0},
+                               {25067, 0, 0}, {24535, 0, 0}, {26596, 0, 0},
+                               {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+                               {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+                               {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultInterpolationFilterCdf
+    [kInterpolationFilterContexts][kNumExplicitInterpolationFilters + 1] = {
+        {833, 48, 0, 0},      {27200, 49, 0, 0},    {32346, 29830, 0, 0},
+        {4524, 160, 0, 0},    {1562, 815, 0, 0},    {27906, 647, 0, 0},
+        {31998, 31616, 0, 0}, {11879, 7131, 0, 0},  {858, 44, 0, 0},
+        {28648, 56, 0, 0},    {32463, 30521, 0, 0}, {5365, 132, 0, 0},
+        {1746, 759, 0, 0},    {29805, 675, 0, 0},   {32167, 31825, 0, 0},
+        {17799, 11370, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvJointCdf[kNumMvJointTypes + 1] = {28672, 21504, 13440, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvSignCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClassCdf[kMvClassSymbolCount + 1] = {
+        4096, 1792, 910, 448, 217, 112, 28, 11, 6, 1, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0BitCdf[kBooleanFieldCdfSize] = {5120, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0FractionCdf[kBooleanSymbolCount][kMvFractionSymbolCount +
+                                                     1] = {
+        {16384, 8192, 6144, 0, 0}, {20480, 11520, 8640, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0HighPrecisionCdf[kBooleanFieldCdfSize] = {12288, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvBitCdf[kMvBitSymbolCount][kBooleanFieldCdfSize] = {
+        {15360, 0, 0}, {14848, 0, 0}, {13824, 0, 0}, {12288, 0, 0},
+        {10240, 0, 0}, {8192, 0, 0},  {4096, 0, 0},  {2816, 0, 0},
+        {2816, 0, 0},  {2048, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvFractionCdf[kMvFractionSymbolCount + 1] = {24576, 15360, 11520, 0,
+                                                         0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvHighPrecisionCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
diff --git a/src/symbol_decoder_context_test.cc b/src/symbol_decoder_context_test.cc
new file mode 100644
index 0000000..4a0de86
--- /dev/null
+++ b/src/symbol_decoder_context_test.cc
@@ -0,0 +1,264 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/symbol_decoder_context.h"
+
+#include <cstdint>
+#include <cstring>
+
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(SymbolDecoderContextTest, ResetIntraFrameYModeCdf) {
+  // Note: the contexts are zero-initialized ("= {}") separately so that the
+  // padding added to the tables for alignment is zeroed as well; otherwise
+  // indeterminate padding bytes could make the memcmp() comparisons below
+  // unreliable.
+  libgav1::SymbolDecoderContext gold_context = {};
+  libgav1::SymbolDecoderContext context = {};
+  gold_context.Initialize(0);
+  context.Initialize(0);
+  EXPECT_EQ(memcmp(&gold_context, &context, sizeof(gold_context)), 0);
+  EXPECT_EQ(context.intra_frame_y_mode_cdf[0][0][0], 32768 - 15588);
+  EXPECT_EQ(context.intra_frame_y_mode_cdf[0][0][1], 32768 - 17027);
+  ++context.intra_frame_y_mode_cdf[0][0][0];
+  --context.intra_frame_y_mode_cdf[0][0][1];
+  EXPECT_NE(memcmp(&gold_context, &context, sizeof(gold_context)), 0);
+  context.ResetIntraFrameYModeCdf();
+  EXPECT_EQ(memcmp(&gold_context, &context, sizeof(gold_context)), 0);
+}
+
+void ResetAndVerifyCounters(libgav1::SymbolDecoderContext* const context) {
+  libgav1::SymbolDecoderContext gold_context = {};
+  gold_context.Initialize(0);
+  EXPECT_NE(memcmp(&gold_context, context, sizeof(gold_context)), 0);
+  context->ResetCounters();
+  EXPECT_EQ(memcmp(&gold_context, context, sizeof(gold_context)), 0);
+}
+
+TEST(SymbolDecoderContextTest, ResetCounters1d) {
+  libgav1::SymbolDecoderContext context = {};
+  context.Initialize(0);
+  int value = 0;
+  context.delta_q_cdf[libgav1::kDeltaSymbolCount] = ++value;
+  context.delta_lf_cdf[libgav1::kDeltaSymbolCount] = ++value;
+  context.intra_block_copy_cdf[libgav1::kBooleanSymbolCount] = ++value;
+  context.cfl_alpha_signs_cdf[libgav1::kCflAlphaSignsSymbolCount] = ++value;
+  context.filter_intra_mode_cdf[libgav1::kNumFilterIntraPredictors] = ++value;
+  context.restoration_type_cdf[libgav1::kRestorationTypeSymbolCount] = ++value;
+  context.use_wiener_cdf[libgav1::kBooleanSymbolCount] = ++value;
+  context.use_sgrproj_cdf[libgav1::kBooleanSymbolCount] = ++value;
+  ResetAndVerifyCounters(&context);
+}
+
+void IncreasePartitionCounters(SymbolDecoderContext* symbol_context,
+                               int value) {
+  const int min_bsize_log2 = k4x4WidthLog2[kBlock8x8];
+  const int max_bsize_log2 = k4x4WidthLog2[kBlock128x128];
+  for (int block_size_log2 = min_bsize_log2; block_size_log2 <= max_bsize_log2;
+       ++block_size_log2) {
+    for (int context = 0; context < kPartitionContexts; ++context) {
+      const int cdf_size =
+          SymbolDecoderContext::PartitionCdfSize(block_size_log2);
+      symbol_context->partition_cdf[block_size_log2 - min_bsize_log2][context]
+                                   [cdf_size] += value;
+    }
+  }
+}
+
+void IncreasePaletteColorIndexCounters(SymbolDecoderContext* symbol_context,
+                                       int value) {
+  for (auto& palette_color_index_cdf_plane :
+       symbol_context->palette_color_index_cdf) {
+    for (int symbol_count = 0; symbol_count < kPaletteSizeSymbolCount;
+         ++symbol_count) {
+      const int cdf_size = symbol_count + kMinPaletteSize;
+      for (int context = 0; context < kPaletteColorIndexContexts; ++context) {
+        palette_color_index_cdf_plane[symbol_count][context][cdf_size] += value;
+      }
+    }
+  }
+}
+
+void IncreaseTxTypeCounters(SymbolDecoderContext* context, int value) {
+  for (int set_idx = kTransformSetIntra1; set_idx <= kTransformSetIntra2;
+       ++set_idx) {
+    auto tx_set = static_cast<TransformSet>(set_idx);
+    for (int tx_size = 0; tx_size < kNumExtendedTransformSizes; ++tx_size) {
+      for (int mode = 0; mode < kIntraPredictionModesY; ++mode) {
+        context->intra_tx_type_cdf[SymbolDecoderContext::TxTypeIndex(
+            tx_set)][tx_size][mode][kNumTransformTypesInSet[tx_set]] += value;
+      }
+    }
+  }
+
+  for (int set_idx = kTransformSetInter1; set_idx <= kTransformSetInter3;
+       ++set_idx) {
+    auto tx_set = static_cast<TransformSet>(set_idx);
+    for (int tx_size = 0; tx_size < kNumExtendedTransformSizes; ++tx_size) {
+      context->inter_tx_type_cdf[SymbolDecoderContext::TxTypeIndex(tx_set)]
+                                [tx_size][kNumTransformTypesInSet[tx_set]] +=
+          value;
+    }
+  }
+}
+
+void IncreaseTxDepthCounters(SymbolDecoderContext* symbol_context, int value) {
+  for (int context = 0; context < kTxDepthContexts; ++context) {
+    symbol_context->tx_depth_cdf[0][context][kMaxTxDepthSymbolCount - 1] +=
+        value;
+  }
+
+  for (int plane_category = 1; plane_category < 4; ++plane_category) {
+    for (int context = 0; context < kTxDepthContexts; ++context) {
+      symbol_context
+          ->tx_depth_cdf[plane_category][context][kMaxTxDepthSymbolCount] +=
+          value;
+    }
+  }
+}
+
+void IncreaseUVModeCounters(SymbolDecoderContext* symbol_context, int value) {
+  for (int cfl_allowed = 0; cfl_allowed < kBooleanSymbolCount; ++cfl_allowed) {
+    for (int mode = 0; mode < kIntraPredictionModesY; ++mode) {
+      symbol_context->uv_mode_cdf[cfl_allowed][mode][kIntraPredictionModesUV -
+                                                     (1 - cfl_allowed)] +=
+          value;
+    }
+  }
+}
+
+#define ASSIGN_COUNTER_2D(array, offset) \
+  do {                                   \
+    for (auto& d1 : context.array) {     \
+      d1[libgav1::offset] = ++value;     \
+    }                                    \
+  } while (false)
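+
+// As an illustrative sketch of what the macro above does,
+// ASSIGN_COUNTER_2D(skip_cdf, kBooleanSymbolCount) expands roughly to:
+//
+//   do {
+//     for (auto& d1 : context.skip_cdf) {
+//       d1[libgav1::kBooleanSymbolCount] = ++value;
+//     }
+//   } while (false);
+//
+// i.e. it writes a distinct nonzero value into the trailing counter slot of
+// every per-context CDF array so that ResetCounters() can be verified to
+// clear exactly those slots.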
+
+TEST(SymbolDecoderContextTest, ResetCounters2d) {
+  libgav1::SymbolDecoderContext context = {};
+  context.Initialize(0);
+  int value = 0;
+  ASSIGN_COUNTER_2D(segment_id_cdf, kMaxSegments);
+  ASSIGN_COUNTER_2D(use_predicted_segment_id_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(skip_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(skip_mode_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(delta_lf_multi_cdf, kDeltaSymbolCount);
+  ASSIGN_COUNTER_2D(y_mode_cdf, kIntraPredictionModesY);
+  ASSIGN_COUNTER_2D(angle_delta_cdf, kAngleDeltaSymbolCount);
+  ASSIGN_COUNTER_2D(cfl_alpha_cdf, kCflAlphaSymbolCount);
+  ASSIGN_COUNTER_2D(use_filter_intra_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(tx_split_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(eob_pt_512_cdf, kEobPt512SymbolCount);
+  ASSIGN_COUNTER_2D(eob_pt_1024_cdf, kEobPt1024SymbolCount);
+  ASSIGN_COUNTER_2D(palette_y_size_cdf, kPaletteSizeSymbolCount);
+  ASSIGN_COUNTER_2D(has_palette_uv_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(palette_uv_size_cdf, kPaletteSizeSymbolCount);
+  ASSIGN_COUNTER_2D(is_inter_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(use_compound_reference_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(compound_reference_type_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(compound_prediction_mode_cdf,
+                    kNumCompoundInterPredictionModes);
+  ASSIGN_COUNTER_2D(new_mv_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(zero_mv_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(reference_mv_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(ref_mv_index_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(is_inter_intra_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(inter_intra_mode_cdf, kNumInterIntraModes);
+  ASSIGN_COUNTER_2D(is_wedge_inter_intra_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(wedge_index_cdf, kWedgeIndexSymbolCount);
+  ASSIGN_COUNTER_2D(use_obmc_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(motion_mode_cdf, kNumMotionModes);
+  ASSIGN_COUNTER_2D(is_explicit_compound_type_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(is_compound_type_average_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_2D(compound_type_cdf, kNumExplicitCompoundPredictionTypes);
+  ASSIGN_COUNTER_2D(interpolation_filter_cdf, kNumExplicitInterpolationFilters);
+  ASSIGN_COUNTER_2D(mv_joint_cdf, kNumMvJointTypes);
+  ResetAndVerifyCounters(&context);
+}
+
+#undef ASSIGN_COUNTER_2D
+
+#define ASSIGN_COUNTER_3D(array, offset) \
+  do {                                   \
+    for (auto& d1 : context.array) {     \
+      for (auto& d2 : d1) {              \
+        d2[libgav1::offset] = ++value;   \
+      }                                  \
+    }                                    \
+  } while (false)
+
+TEST(SymbolDecoderContextTest, ResetCounters3d) {
+  libgav1::SymbolDecoderContext context = {};
+  context.Initialize(0);
+  int value = 0;
+  ASSIGN_COUNTER_3D(intra_frame_y_mode_cdf, kIntraPredictionModesY);
+  ASSIGN_COUNTER_3D(all_zero_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_3D(eob_pt_16_cdf, kEobPt16SymbolCount);
+  ASSIGN_COUNTER_3D(eob_pt_32_cdf, kEobPt32SymbolCount);
+  ASSIGN_COUNTER_3D(eob_pt_64_cdf, kEobPt64SymbolCount);
+  ASSIGN_COUNTER_3D(eob_pt_128_cdf, kEobPt128SymbolCount);
+  ASSIGN_COUNTER_3D(eob_pt_256_cdf, kEobPt256SymbolCount);
+  ASSIGN_COUNTER_3D(dc_sign_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_3D(has_palette_y_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_3D(compound_backward_reference_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_3D(single_reference_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_3D(mv_sign_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_3D(mv_class_cdf, kMvClassSymbolCount);
+  ASSIGN_COUNTER_3D(mv_class0_bit_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_3D(mv_class0_high_precision_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_3D(mv_fraction_cdf, kMvFractionSymbolCount);
+  ASSIGN_COUNTER_3D(mv_high_precision_cdf, kBooleanSymbolCount);
+  IncreasePartitionCounters(&context, value);
+  IncreaseTxTypeCounters(&context, value);
+  IncreaseTxDepthCounters(&context, value);
+  IncreaseUVModeCounters(&context, value);
+  ResetAndVerifyCounters(&context);
+}
+
+#undef ASSIGN_COUNTER_3D
+
+#define ASSIGN_COUNTER_4D(array, offset) \
+  do {                                   \
+    for (auto& d1 : context.array) {     \
+      for (auto& d2 : d1) {              \
+        for (auto& d3 : d2) {            \
+          d3[libgav1::offset] = ++value; \
+        }                                \
+      }                                  \
+    }                                    \
+  } while (false)
+
+TEST(SymbolDecoderContextTest, ResetCounters4d) {
+  libgav1::SymbolDecoderContext context = {};
+  context.Initialize(0);
+  int value = 0;
+  ASSIGN_COUNTER_4D(eob_extra_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_4D(coeff_base_eob_cdf, kCoeffBaseEobSymbolCount);
+  ASSIGN_COUNTER_4D(coeff_base_cdf, kCoeffBaseSymbolCount);
+  ASSIGN_COUNTER_4D(coeff_base_range_cdf, kCoeffBaseRangeSymbolCount);
+  ASSIGN_COUNTER_4D(compound_reference_cdf, kBooleanSymbolCount);
+  ASSIGN_COUNTER_4D(mv_class0_fraction_cdf, kMvFractionSymbolCount);
+  ASSIGN_COUNTER_4D(mv_bit_cdf, kBooleanSymbolCount);
+  IncreasePaletteColorIndexCounters(&context, value);
+  IncreaseTxTypeCounters(&context, value);
+  ResetAndVerifyCounters(&context);
+}
+
+#undef ASSIGN_COUNTER_4D
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/threading_strategy.cc b/src/threading_strategy.cc
new file mode 100644 (file)
index 0000000..17ce18f
--- /dev/null
@@ -0,0 +1,223 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/threading_strategy.h"
+
+#include <algorithm>
+#include <cassert>
+#include <memory>
+
+#include "src/frame_scratch_buffer.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+namespace {
+
+#if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER)
+constexpr int kFrameParallelThresholdMultiplier = 3;
+#else
+constexpr int kFrameParallelThresholdMultiplier =
+    LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER;
+#endif
+
+// Computes the number of frame threads to be used based on the following
+// heuristic:
+//   * If |thread_count| == 1, return 0.
+//   * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier,
+//     return 0.
+//   * Otherwise, return the largest value of i which satisfies the following
+//     condition: i + i * tile_columns <= thread_count. This ensures that there
+//     are at least |tile_columns| worker threads for each frame thread.
+//   * This function will never return 1 or a value > |thread_count|.
+//
+//  This heuristic is based on empirical performance data. The in-frame
+//  threading model (combination of tile multithreading, superblock row
+//  multithreading and post filter multithreading) performs better than the
+//  frame parallel model until we reach the threshold of |thread_count| >
+//  |tile_count| * kFrameParallelThresholdMultiplier.
+//
+//  It is a function of |tile_count| since tile threading and superblock row
+//  multithreading will scale only as a factor of |tile_count|. The threshold
+//  kFrameParallelThresholdMultiplier is arrived at based on empirical data.
+//  The general idea is that superblock row multithreading plateaus at 4 *
+//  |tile_count| because in most practical cases there aren't more than that
+//  many superblock rows and columns available to work on in parallel.
+int ComputeFrameThreadCount(int thread_count, int tile_count,
+                            int tile_columns) {
+  assert(thread_count > 0);
+  if (thread_count == 1) return 0;
+  return (thread_count <= tile_count * kFrameParallelThresholdMultiplier)
+             ? 0
+             : std::max(2, thread_count / (1 + tile_columns));
+}
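+
+// Worked examples of the heuristic above (the numbers mirror the unit tests
+// in threading_strategy_test.cc):
+//   * thread_count=6, tile_count=2: 6 <= 2 * kFrameParallelThresholdMultiplier,
+//     so 0 is returned and the in-frame threading model is used.
+//   * thread_count=8, tile_count=1, tile_columns=1: 8 > 1 * 3, so the result
+//     is std::max(2, 8 / (1 + 1)) = 4 frame threads, which leaves 4 threads
+//     to be distributed among the frame threads as worker threads.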
+
+}  // namespace
+
+bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
+                              int thread_count) {
+  assert(thread_count > 0);
+  frame_parallel_ = false;
+
+  if (thread_count == 1) {
+    thread_pool_.reset(nullptr);
+    tile_thread_count_ = 0;
+    max_tile_index_for_row_threads_ = 0;
+    return true;
+  }
+
+  // We do work in the current thread, so it is sufficient to create
+  // |thread_count|-1 threads in the threadpool.
+  thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)) - 1;
+
+  if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+    thread_pool_ = ThreadPool::Create("libgav1", thread_count);
+    if (thread_pool_ == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+                   thread_count);
+      tile_thread_count_ = 0;
+      max_tile_index_for_row_threads_ = 0;
+      return false;
+    }
+  }
+
+  // Prefer tile threads first (but only if there is more than one tile).
+  const int tile_count = frame_header.tile_info.tile_count;
+  if (tile_count > 1) {
+    // We want 1 + tile_thread_count_ <= tile_count because the current thread
+    // is also used to decode tiles. This is equivalent to
+    // tile_thread_count_ <= tile_count - 1.
+    tile_thread_count_ = std::min(thread_count, tile_count - 1);
+    thread_count -= tile_thread_count_;
+    if (thread_count == 0) {
+      max_tile_index_for_row_threads_ = 0;
+      return true;
+    }
+  } else {
+    tile_thread_count_ = 0;
+  }
+
+#if defined(__ANDROID__)
+  // Assign the remaining threads for each Tile. The heuristic used here is that
+  // we will assign two threads for each Tile. So for example, if |thread_count|
+  // is 2, for a stream with 2 tiles the first tile would get both the threads
+  // and the second tile would have row multi-threading turned off. This
+  // heuristic is based on the fact that row multi-threading is fast enough only
+  // when there are at least two threads to do the decoding (since one thread
+  // always does the parsing).
+  //
+  // This heuristic might stop working when SIMD optimizations make the decoding
+  // much faster and the parsing thread is only as fast as the decoding threads.
+  // So we will have to revisit this later to make sure that this is still
+  // optimal.
+  //
+  // Note that while this heuristic significantly improves performance on high
+  // end devices (like the Pixel 3), there are some performance regressions on
+  // lower end devices in some cases, and that needs to be revisited as we
+  // bring in more optimizations. Overall, the gains from this heuristic seem
+  // to be much larger than the regressions.
+  for (int i = 0; i < tile_count; ++i) {
+    max_tile_index_for_row_threads_ = i + 1;
+    thread_count -= 2;
+    if (thread_count <= 0) break;
+  }
+#else   // !defined(__ANDROID__)
+  // Assign the remaining threads to each Tile.
+  for (int i = 0; i < tile_count; ++i) {
+    const int count = thread_count / tile_count +
+                      static_cast<int>(i < thread_count % tile_count);
+    if (count == 0) {
+      // Once we see a 0 value, all subsequent values will also be 0 since the
+      // threads are assigned in a round-robin fashion.
+      break;
+    }
+    max_tile_index_for_row_threads_ = i + 1;
+  }
+#endif  // defined(__ANDROID__)
+  return true;
+}
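+
+// A worked example of the allocation above on the non-Android path (this
+// mirrors the RowThreads test in threading_strategy_test.cc): with
+// thread_count=8 and tile_count=2, a pool of 8 - 1 = 7 threads is created,
+// 1 of them becomes a tile thread (the calling thread decodes the other
+// tile), and the remaining 6 are assigned 3 per tile for superblock row
+// multi-threading, so row_thread_pool(0) and row_thread_pool(1) are both
+// non-null.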
+
+bool ThreadingStrategy::Reset(int thread_count) {
+  assert(thread_count > 0);
+  frame_parallel_ = true;
+
+  // In frame parallel mode, we simply access the underlying |thread_pool_|
+  // directly. So ensure all the other threadpool getter functions return
+  // nullptr. Also, superblock row multithreading is always disabled in frame
+  // parallel mode.
+  tile_thread_count_ = 0;
+  max_tile_index_for_row_threads_ = 0;
+
+  if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+    thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count);
+    if (thread_pool_ == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+                   thread_count);
+      return false;
+    }
+  }
+  return true;
+}
+
+bool InitializeThreadPoolsForFrameParallel(
+    int thread_count, int tile_count, int tile_columns,
+    std::unique_ptr<ThreadPool>* const frame_thread_pool,
+    FrameScratchBufferPool* const frame_scratch_buffer_pool) {
+  assert(*frame_thread_pool == nullptr);
+  thread_count = std::min(thread_count, static_cast<int>(kMaxThreads));
+  const int frame_threads =
+      ComputeFrameThreadCount(thread_count, tile_count, tile_columns);
+  if (frame_threads == 0) return true;
+  *frame_thread_pool = ThreadPool::Create(frame_threads);
+  if (*frame_thread_pool == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.",
+                 frame_threads);
+    return false;
+  }
+  int remaining_threads = thread_count - frame_threads;
+  if (remaining_threads == 0) return true;
+  int threads_per_frame = remaining_threads / frame_threads;
+  const int extra_threads = remaining_threads % frame_threads;
+  Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
+  if (!frame_scratch_buffers.reserve(frame_threads)) return false;
+  // Create the tile thread pools.
+  for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) {
+    std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+        frame_scratch_buffer_pool->Get();
+    if (frame_scratch_buffer == nullptr) {
+      return false;
+    }
+    // If the number of tile threads cannot be divided equally amongst all the
+    // frame threads, assign one extra thread to the first |extra_threads| frame
+    // threads.
+    const int current_frame_thread_count =
+        threads_per_frame + static_cast<int>(i < extra_threads);
+    if (!frame_scratch_buffer->threading_strategy.Reset(
+            current_frame_thread_count)) {
+      return false;
+    }
+    remaining_threads -= current_frame_thread_count;
+    frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer));
+  }
+  // We release the frame scratch buffers in reverse order so that the buffers
+  // with the extra threads end up at the top of the stack.
+  for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0;
+       --i) {
+    frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i]));
+  }
+  return true;
+}
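+
+// A worked example of the distribution above (mirroring the FrameParallel
+// test): thread_count=14, tile_count=2, tile_columns=2 gives frame_threads =
+// std::max(2, 14 / 3) = 4. The remaining 10 threads yield threads_per_frame =
+// 2 with extra_threads = 2, so the per-frame worker pools get {3, 3, 2, 2}
+// threads and the total 4 + 3 + 3 + 2 + 2 equals |thread_count|.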
+
+}  // namespace libgav1
diff --git a/src/threading_strategy.h b/src/threading_strategy.h
new file mode 100644 (file)
index 0000000..84b3589
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_THREADING_STRATEGY_H_
+#define LIBGAV1_SRC_THREADING_STRATEGY_H_
+
+#include <memory>
+
+#include "src/obu_parser.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+
+class FrameScratchBufferPool;
+
+// This class allocates and manages the worker threads among thread pools used
+// for multi-threaded decoding.
+class ThreadingStrategy {
+ public:
+  ThreadingStrategy() = default;
+
+  // Not copyable or movable.
+  ThreadingStrategy(const ThreadingStrategy&) = delete;
+  ThreadingStrategy& operator=(const ThreadingStrategy&) = delete;
+
+  // Creates or re-allocates the thread pools based on the |frame_header| and
+  // |thread_count|. This function is used only in non frame-parallel mode. This
+  // function is idempotent if the |frame_header| and |thread_count| don't
+  // change between calls (it will only create new threads on the first call and
+  // do nothing on the subsequent calls). This function also starts the worker
+  // threads whenever it creates new thread pools.
+  // The following strategy is used to allocate threads:
+  //   * One thread is allocated for decoding each Tile.
+  //   * Any remaining threads are allocated for superblock row multi-threading
+  //     within each of the tiles in a round-robin fashion.
+  // Note: During the lifetime of a ThreadingStrategy object, only one of the
+  // Reset() variants will be used.
+  LIBGAV1_MUST_USE_RESULT bool Reset(const ObuFrameHeader& frame_header,
+                                     int thread_count);
+
+  // Creates or re-allocates a thread pool with |thread_count| threads. This
+  // function is used only in frame parallel mode. This function is idempotent
+  // if the |thread_count| doesn't change between calls (it will only create new
+  // threads on the first call and do nothing on the subsequent calls).
+  // Note: During the lifetime of a ThreadingStrategy object, only one of the
+  // Reset() variants will be used.
+  LIBGAV1_MUST_USE_RESULT bool Reset(int thread_count);
+
+  // Returns a pointer to the ThreadPool that is to be used for Tile
+  // multi-threading.
+  ThreadPool* tile_thread_pool() const {
+    return (tile_thread_count_ != 0) ? thread_pool_.get() : nullptr;
+  }
+
+  int tile_thread_count() const { return tile_thread_count_; }
+
+  // Returns a pointer to the underlying ThreadPool.
+  // Note: Valid only when |frame_parallel_| is true. This is used for
+  // facilitating in-frame multi-threading in that case.
+  ThreadPool* thread_pool() const { return thread_pool_.get(); }
+
+  // Returns a pointer to the ThreadPool that is to be used within the Tile at
+  // index |tile_index| for superblock row multi-threading.
+  // Note: Valid only when |frame_parallel_| is false.
+  ThreadPool* row_thread_pool(int tile_index) const {
+    return tile_index < max_tile_index_for_row_threads_ ? thread_pool_.get()
+                                                        : nullptr;
+  }
+
+  // Returns a pointer to the ThreadPool that is to be used for post filter
+  // multi-threading.
+  // Note: Valid only when |frame_parallel_| is false.
+  ThreadPool* post_filter_thread_pool() const {
+    return frame_parallel_ ? nullptr : thread_pool_.get();
+  }
+
+  // Returns a pointer to the ThreadPool that is to be used for film grain
+  // synthesis and blending.
+  // Note: Valid only when |frame_parallel_| is false.
+  ThreadPool* film_grain_thread_pool() const { return thread_pool_.get(); }
+
+ private:
+  std::unique_ptr<ThreadPool> thread_pool_;
+  int tile_thread_count_ = 0;
+  int max_tile_index_for_row_threads_ = 0;
+  bool frame_parallel_ = false;
+};
+
+// Initializes the |frame_thread_pool| and the necessary worker threadpools (the
+// threading_strategy objects in each of the frame scratch buffers in
+// |frame_scratch_buffer_pool|) as follows:
+//  * frame_threads = ComputeFrameThreadCount();
+//  * For more details on how frame_threads is computed, see the function
+//    comment in ComputeFrameThreadCount().
+//  * |frame_thread_pool| is created with |frame_threads| threads.
+//  * The remaining threads are divided as evenly as possible among the frame
+//    threads, and a frame_scratch_buffer.threading_strategy is initialized
+//    for each frame thread.
+//  When this function is called, |frame_scratch_buffer_pool| must be empty. If
+//  this function returns true, it means the initialization was successful and
+//  one of the following is true:
+//    * |frame_thread_pool| has been successfully initialized and
+//      |frame_scratch_buffer_pool| has been successfully populated with
+//      |frame_threads| buffers to be used by each frame thread. The total
+//      number of threads that this function creates will always be equal to
+//      |thread_count|.
+//    * |frame_thread_pool| is nullptr. |frame_scratch_buffer_pool| is not
+//      modified. This means that frame threading will not be used and the
+//      decoder will continue to operate normally in non frame parallel mode.
+LIBGAV1_MUST_USE_RESULT bool InitializeThreadPoolsForFrameParallel(
+    int thread_count, int tile_count, int tile_columns,
+    std::unique_ptr<ThreadPool>* frame_thread_pool,
+    FrameScratchBufferPool* frame_scratch_buffer_pool);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_THREADING_STRATEGY_H_
diff --git a/src/threading_strategy_test.cc b/src/threading_strategy_test.cc
new file mode 100644 (file)
index 0000000..beea36f
--- /dev/null
@@ -0,0 +1,302 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/threading_strategy.h"
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "gtest/gtest.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/obu_parser.h"
+#include "src/utils/constants.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+class ThreadingStrategyTest : public testing::Test {
+ protected:
+  ThreadingStrategy strategy_;
+  ObuFrameHeader frame_header_ = {};
+};
+
+TEST_F(ThreadingStrategyTest, MaxThreadEnforced) {
+  frame_header_.tile_info.tile_count = 32;
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 32));
+  EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+  for (int i = 0; i < 32; ++i) {
+    EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+  }
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+TEST_F(ThreadingStrategyTest, UseAllThreadsForTiles) {
+  frame_header_.tile_info.tile_count = 8;
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 8));
+  EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+  }
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+TEST_F(ThreadingStrategyTest, RowThreads) {
+  frame_header_.tile_info.tile_count = 2;
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 8));
+  EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+  // Each tile should get 3 threads.
+  for (int i = 0; i < 2; ++i) {
+    EXPECT_NE(strategy_.row_thread_pool(i), nullptr);
+  }
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+TEST_F(ThreadingStrategyTest, RowThreadsUnequal) {
+  frame_header_.tile_info.tile_count = 2;
+
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 9));
+  EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+  EXPECT_NE(strategy_.row_thread_pool(0), nullptr);
+  EXPECT_NE(strategy_.row_thread_pool(1), nullptr);
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+// Tests various combinations of tile_count and thread_count.
+TEST_F(ThreadingStrategyTest, MultipleCalls) {
+  frame_header_.tile_info.tile_count = 2;
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 8));
+  EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+  for (int i = 0; i < 2; ++i) {
+    EXPECT_NE(strategy_.row_thread_pool(i), nullptr);
+  }
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+  frame_header_.tile_info.tile_count = 8;
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 8));
+  EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+  // Row threads must have been reset.
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+  }
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+  frame_header_.tile_info.tile_count = 8;
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 16));
+  EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+  for (int i = 0; i < 8; ++i) {
+    // See ThreadingStrategy::Reset().
+#if defined(__ANDROID__)
+    if (i >= 4) {
+      EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+      continue;
+    }
+#endif
+    EXPECT_NE(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+  }
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+  frame_header_.tile_info.tile_count = 4;
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 16));
+  EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_NE(strategy_.row_thread_pool(i), nullptr);
+  }
+  // All the other row threads must be reset.
+  for (int i = 4; i < 8; ++i) {
+    EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+  }
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+  frame_header_.tile_info.tile_count = 4;
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 6));
+  EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+  // First two tiles will get 1 thread each.
+  for (int i = 0; i < 2; ++i) {
+    // See ThreadingStrategy::Reset().
+#if defined(__ANDROID__)
+    if (i == 1) {
+      EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+      continue;
+    }
+#endif
+    EXPECT_NE(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+  }
+  // All the other row threads must be reset.
+  for (int i = 2; i < 8; ++i) {
+    EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+  }
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 1));
+  EXPECT_EQ(strategy_.tile_thread_pool(), nullptr);
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+  }
+  EXPECT_EQ(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+// Tests the following order of calls (with thread count fixed at 4):
+//  * 1 Tile - 2 Tiles - 1 Tile.
+TEST_F(ThreadingStrategyTest, MultipleCalls2) {
+  frame_header_.tile_info.tile_count = 1;
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 4));
+  // When there is only one tile, tile thread pool must be nullptr.
+  EXPECT_EQ(strategy_.tile_thread_pool(), nullptr);
+  EXPECT_NE(strategy_.row_thread_pool(0), nullptr);
+  for (int i = 1; i < 8; ++i) {
+    EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+  }
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+  frame_header_.tile_info.tile_count = 2;
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 4));
+  EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+  for (int i = 0; i < 2; ++i) {
+    // See ThreadingStrategy::Reset().
+#if defined(__ANDROID__)
+    if (i == 1) {
+      EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+      continue;
+    }
+#endif
+    EXPECT_NE(strategy_.row_thread_pool(i), nullptr);
+  }
+  for (int i = 2; i < 8; ++i) {
+    EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+  }
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+  frame_header_.tile_info.tile_count = 1;
+  ASSERT_TRUE(strategy_.Reset(frame_header_, 4));
+  EXPECT_EQ(strategy_.tile_thread_pool(), nullptr);
+  EXPECT_NE(strategy_.row_thread_pool(0), nullptr);
+  for (int i = 1; i < 8; ++i) {
+    EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+  }
+  EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+void VerifyFrameParallel(int thread_count, int tile_count, int tile_columns,
+                         int expected_frame_threads,
+                         const std::vector<int>& expected_tile_threads) {
+  ASSERT_EQ(expected_frame_threads, expected_tile_threads.size());
+  ASSERT_GT(thread_count, 1);
+  std::unique_ptr<ThreadPool> frame_thread_pool;
+  FrameScratchBufferPool frame_scratch_buffer_pool;
+  ASSERT_TRUE(InitializeThreadPoolsForFrameParallel(
+      thread_count, tile_count, tile_columns, &frame_thread_pool,
+      &frame_scratch_buffer_pool));
+  if (expected_frame_threads == 0) {
+    EXPECT_EQ(frame_thread_pool, nullptr);
+    return;
+  }
+  EXPECT_NE(frame_thread_pool.get(), nullptr);
+  EXPECT_EQ(frame_thread_pool->num_threads(), expected_frame_threads);
+  std::vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
+  int actual_thread_count = frame_thread_pool->num_threads();
+  for (int i = 0; i < expected_frame_threads; ++i) {
+    SCOPED_TRACE(absl::StrCat("i: ", i));
+    frame_scratch_buffers.push_back(frame_scratch_buffer_pool.Get());
+    ThreadPool* const thread_pool =
+        frame_scratch_buffers.back()->threading_strategy.thread_pool();
+    if (expected_tile_threads[i] > 0) {
+      EXPECT_NE(thread_pool, nullptr);
+      EXPECT_EQ(thread_pool->num_threads(), expected_tile_threads[i]);
+      actual_thread_count += thread_pool->num_threads();
+    } else {
+      EXPECT_EQ(thread_pool, nullptr);
+    }
+  }
+  EXPECT_EQ(thread_count, actual_thread_count);
+  for (auto& frame_scratch_buffer : frame_scratch_buffers) {
+    frame_scratch_buffer_pool.Release(std::move(frame_scratch_buffer));
+  }
+}
+
+TEST(FrameParallelStrategyTest, FrameParallel) {
+  // This loop has thread_count <= 3 * tile count. So there should be no frame
+  // threads irrespective of the number of tile columns.
+  for (int thread_count = 2; thread_count <= 6; ++thread_count) {
+    VerifyFrameParallel(thread_count, /*tile_count=*/2, /*tile_columns=*/1,
+                        /*expected_frame_threads=*/0,
+                        /*expected_tile_threads=*/{});
+    VerifyFrameParallel(thread_count, /*tile_count=*/2, /*tile_columns=*/2,
+                        /*expected_frame_threads=*/0,
+                        /*expected_tile_threads=*/{});
+  }
+
+  // Equal number of tile threads for each frame thread.
+  VerifyFrameParallel(
+      /*thread_count=*/8, /*tile_count=*/1, /*tile_columns=*/1,
+      /*expected_frame_threads=*/4, /*expected_tile_threads=*/{1, 1, 1, 1});
+  VerifyFrameParallel(
+      /*thread_count=*/12, /*tile_count=*/2, /*tile_columns=*/2,
+      /*expected_frame_threads=*/4, /*expected_tile_threads=*/{2, 2, 2, 2});
+  VerifyFrameParallel(
+      /*thread_count=*/18, /*tile_count=*/2, /*tile_columns=*/2,
+      /*expected_frame_threads=*/6,
+      /*expected_tile_threads=*/{2, 2, 2, 2, 2, 2});
+  VerifyFrameParallel(
+      /*thread_count=*/16, /*tile_count=*/3, /*tile_columns=*/3,
+      /*expected_frame_threads=*/4, /*expected_tile_threads=*/{3, 3, 3, 3});
+
+  // Unequal number of tile threads for each frame thread.
+  VerifyFrameParallel(
+      /*thread_count=*/7, /*tile_count=*/1, /*tile_columns=*/1,
+      /*expected_frame_threads=*/3, /*expected_tile_threads=*/{2, 1, 1});
+  VerifyFrameParallel(
+      /*thread_count=*/14, /*tile_count=*/2, /*tile_columns=*/2,
+      /*expected_frame_threads=*/4, /*expected_tile_threads=*/{3, 3, 2, 2});
+  VerifyFrameParallel(
+      /*thread_count=*/20, /*tile_count=*/2, /*tile_columns=*/2,
+      /*expected_frame_threads=*/6,
+      /*expected_tile_threads=*/{3, 3, 2, 2, 2, 2});
+  VerifyFrameParallel(
+      /*thread_count=*/17, /*tile_count=*/3, /*tile_columns=*/3,
+      /*expected_frame_threads=*/4, /*expected_tile_threads=*/{4, 3, 3, 3});
+}
+
+TEST(FrameParallelStrategyTest, ThreadCountDoesNotExceedkMaxThreads) {
+  std::unique_ptr<ThreadPool> frame_thread_pool;
+  FrameScratchBufferPool frame_scratch_buffer_pool;
+  ASSERT_TRUE(InitializeThreadPoolsForFrameParallel(
+      /*thread_count=*/kMaxThreads + 10, /*tile_count=*/2, /*tile_columns=*/2,
+      &frame_thread_pool, &frame_scratch_buffer_pool));
+  EXPECT_NE(frame_thread_pool.get(), nullptr);
+  std::vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
+  int actual_thread_count = frame_thread_pool->num_threads();
+  for (int i = 0; i < frame_thread_pool->num_threads(); ++i) {
+    SCOPED_TRACE(absl::StrCat("i: ", i));
+    frame_scratch_buffers.push_back(frame_scratch_buffer_pool.Get());
+    ThreadPool* const thread_pool =
+        frame_scratch_buffers.back()->threading_strategy.thread_pool();
+    if (thread_pool != nullptr) {
+      actual_thread_count += thread_pool->num_threads();
+    }
+  }
+  // In this case, the exact number of frame threads and tile threads depends
+  // on the value of kMaxThreads. So simply ensure that the total number of
+  // threads does not exceed kMaxThreads.
+  EXPECT_LE(actual_thread_count, kMaxThreads);
+  for (auto& frame_scratch_buffer : frame_scratch_buffers) {
+    frame_scratch_buffer_pool.Release(std::move(frame_scratch_buffer));
+  }
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/tile.h b/src/tile.h
new file mode 100644 (file)
index 0000000..fcab963
--- /dev/null
@@ -0,0 +1,954 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_TILE_H_
+#define LIBGAV1_SRC_TILE_H_
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/quantizer.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/memory.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Indicates what the ProcessSuperBlock() and TransformBlock() functions should
+// do. "Parse" refers to consuming the bitstream, reading the transform
+// coefficients and performing the dequantization. "Decode" refers to computing
+// the prediction, applying the inverse transforms and adding the residual.
+enum ProcessingMode {
+  kProcessingModeParseOnly,
+  kProcessingModeDecodeOnly,
+  kProcessingModeParseAndDecode,
+};
+
+// The alignment requirement is due to the SymbolDecoderContext member
+// symbol_decoder_context_.
+class Tile : public MaxAlignedAllocable {
+ public:
+  static std::unique_ptr<Tile> Create(
+      int tile_number, const uint8_t* const data, size_t size,
+      const ObuSequenceHeader& sequence_header,
+      const ObuFrameHeader& frame_header, RefCountedBuffer* const current_frame,
+      const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
+      const WedgeMaskArray& wedge_masks,
+      const QuantizerMatrix& quantizer_matrix,
+      SymbolDecoderContext* const saved_symbol_decoder_context,
+      const SegmentationMap* prev_segment_ids, PostFilter* const post_filter,
+      const dsp::Dsp* const dsp, ThreadPool* const thread_pool,
+      BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+      bool use_intra_prediction_buffer) {
+    std::unique_ptr<Tile> tile(new (std::nothrow) Tile(
+        tile_number, data, size, sequence_header, frame_header, current_frame,
+        state, frame_scratch_buffer, wedge_masks, quantizer_matrix,
+        saved_symbol_decoder_context, prev_segment_ids, post_filter, dsp,
+        thread_pool, pending_tiles, frame_parallel,
+        use_intra_prediction_buffer));
+    return (tile != nullptr && tile->Init()) ? std::move(tile) : nullptr;
+  }
+
+  // Move only.
+  Tile(Tile&& tile) noexcept;
+  Tile& operator=(Tile&& tile) noexcept;
+  Tile(const Tile&) = delete;
+  Tile& operator=(const Tile&) = delete;
+
+  struct Block;  // Defined after this class.
+
+  // Parses the entire tile.
+  bool Parse();
+  // Decodes the entire tile. |superblock_row_progress| and
+  // |superblock_row_progress_condvar| are arrays of size equal to the number of
+  // superblock rows in the frame. Increments |superblock_row_progress[i]| after
+  // each superblock row at index |i| is decoded. If the count reaches the
+  // number of tile columns, then it notifies
+  // |superblock_row_progress_condvar[i]|.
+  bool Decode(std::mutex* mutex, int* superblock_row_progress,
+              std::condition_variable* superblock_row_progress_condvar);
+  // Parses and decodes the entire tile. Depending on the configuration of this
+  // Tile, this function may do multithreaded decoding.
+  bool ParseAndDecode();  // 5.11.2.
+  // Processes all the columns of the superblock row at |row4x4| that are within
+  // this Tile. If |save_symbol_decoder_context| is true, then
+  // SaveSymbolDecoderContext() is invoked for the last superblock row.
+  template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+  bool ProcessSuperBlockRow(int row4x4, TileScratchBuffer* scratch_buffer);
+
+  const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+  const ObuFrameHeader& frame_header() const { return frame_header_; }
+  const RefCountedBuffer& current_frame() const { return current_frame_; }
+  const TemporalMotionField& motion_field() const { return motion_field_; }
+  const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias()
+      const {
+    return reference_frame_sign_bias_;
+  }
+
+  bool IsRow4x4Inside(int row4x4) const {
+    return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_;
+  }
+
+  // 5.11.51.
+  bool IsInside(int row4x4, int column4x4) const {
+    return IsRow4x4Inside(row4x4) && column4x4 >= column4x4_start_ &&
+           column4x4 < column4x4_end_;
+  }
+
+  bool IsLeftInside(int column4x4) const {
+    // We use "larger than" as the condition. Don't pass in the left column
+    // offset column4x4 - 1.
+    assert(column4x4 <= column4x4_end_);
+    return column4x4 > column4x4_start_;
+  }
+
+  bool IsTopInside(int row4x4) const {
+    // We use "larger than" as the condition. Don't pass in the top row offset
+    // row4x4 - 1.
+    assert(row4x4 <= row4x4_end_);
+    return row4x4 > row4x4_start_;
+  }
+
+  bool IsTopLeftInside(int row4x4, int column4x4) const {
+    // We use "larger than" as the condition. Don't pass in the top row offset
+    // row4x4 - 1 or the left column offset column4x4 - 1.
+    assert(row4x4 <= row4x4_end_);
+    assert(column4x4 <= column4x4_end_);
+    return row4x4 > row4x4_start_ && column4x4 > column4x4_start_;
+  }
+
+  bool IsBottomRightInside(int row4x4, int column4x4) const {
+    assert(row4x4 >= row4x4_start_);
+    assert(column4x4 >= column4x4_start_);
+    return row4x4 < row4x4_end_ && column4x4 < column4x4_end_;
+  }
+
+  BlockParameters** BlockParametersAddress(int row4x4, int column4x4) const {
+    return block_parameters_holder_.Address(row4x4, column4x4);
+  }
+
+  int BlockParametersStride() const {
+    return block_parameters_holder_.columns4x4();
+  }
+
+  // Returns true if Parameters() can be called with |row| and |column| as
+  // inputs, false otherwise.
+  bool HasParameters(int row, int column) const {
+    return block_parameters_holder_.Find(row, column) != nullptr;
+  }
+  const BlockParameters& Parameters(int row, int column) const {
+    return *block_parameters_holder_.Find(row, column);
+  }
+
+  int number() const { return number_; }
+  int superblock_rows() const { return superblock_rows_; }
+  int superblock_columns() const { return superblock_columns_; }
+  int row4x4_start() const { return row4x4_start_; }
+  int column4x4_start() const { return column4x4_start_; }
+  int column4x4_end() const { return column4x4_end_; }
+
+ private:
+  // Stores the transform tree state when reading variable size transform trees
+  // and when applying the transform tree. When applying the transform tree,
+  // |depth| is not used.
+  struct TransformTreeNode {
+    // The default constructor is invoked by the Stack<TransformTreeNode, n>
+    // constructor. Stack<> does not use the default-constructed elements, so it
+    // is safe for the default constructor to not initialize the members.
+    TransformTreeNode() = default;
+    TransformTreeNode(int x, int y, TransformSize tx_size, int depth = -1)
+        : x(x), y(y), tx_size(tx_size), depth(depth) {}
+
+    int x;
+    int y;
+    TransformSize tx_size;
+    int depth;
+  };
+
+  // Enum to track the processing state of a superblock.
+  enum SuperBlockState : uint8_t {
+    kSuperBlockStateNone,       // Not yet parsed or decoded.
+    kSuperBlockStateParsed,     // Parsed but not yet decoded.
+    kSuperBlockStateScheduled,  // Scheduled for decoding.
+    kSuperBlockStateDecoded     // Parsed and decoded.
+  };
+
+  // Parameters used to facilitate multi-threading within the Tile.
+  struct ThreadingParameters {
+    std::mutex mutex;
+    // 2d array of size |superblock_rows_| by |superblock_columns_| containing
+    // the processing state of each superblock.
+    Array2D<SuperBlockState> sb_state LIBGAV1_GUARDED_BY(mutex);
+    // Variable used to indicate either parse or decode failure.
+    bool abort LIBGAV1_GUARDED_BY(mutex) = false;
+    int pending_jobs LIBGAV1_GUARDED_BY(mutex) = 0;
+    std::condition_variable pending_jobs_zero_condvar;
+  };
+
+  // The residual pointer is used to traverse the |residual_buffer_|. It is
+  // used in two different ways.
+  // If |split_parse_and_decode_| is true:
+  //    The pointer points to the beginning of the |residual_buffer_| when the
+  //    "parse" and "decode" steps begin. It is then moved forward tx_size in
+  //    each iteration of the "parse" and the "decode" steps. In this case, the
+  //    ResidualPtr variable passed into various functions starting from
+  //    ProcessSuperBlock is used as an in/out parameter to keep track of the
+  //    residual pointer.
+  // If |split_parse_and_decode_| is false:
+  //    The pointer is reset to the beginning of the |residual_buffer_| for
+  //    every transform block.
+  using ResidualPtr = uint8_t*;
+
+  Tile(int tile_number, const uint8_t* data, size_t size,
+       const ObuSequenceHeader& sequence_header,
+       const ObuFrameHeader& frame_header, RefCountedBuffer* current_frame,
+       const DecoderState& state, FrameScratchBuffer* frame_scratch_buffer,
+       const WedgeMaskArray& wedge_masks,
+       const QuantizerMatrix& quantizer_matrix,
+       SymbolDecoderContext* saved_symbol_decoder_context,
+       const SegmentationMap* prev_segment_ids, PostFilter* post_filter,
+       const dsp::Dsp* dsp, ThreadPool* thread_pool,
+       BlockingCounterWithStatus* pending_tiles, bool frame_parallel,
+       bool use_intra_prediction_buffer);
+
+  // Performs member initializations that may fail. Helper function used by
+  // Create().
+  LIBGAV1_MUST_USE_RESULT bool Init();
+
+  // Saves the symbol decoder context of this tile into
+  // |saved_symbol_decoder_context_| if necessary.
+  void SaveSymbolDecoderContext();
+
+  // Entry point for multi-threaded decoding. This function performs the same
+  // functionality as ParseAndDecode(). The current thread does the "parse" step
+  // while the worker threads do the "decode" step.
+  bool ThreadedParseAndDecode();
+
+  // Returns whether or not the prerequisites for decoding the superblock at
+  // |row_index| and |column_index| are satisfied. |threading_.mutex| must be
+  // held when calling this function.
+  bool CanDecode(int row_index, int column_index) const;
+
+  // This function is run by the worker threads when multi-threaded decoding is
+  // enabled. Once a superblock is decoded, this function will set the
+  // corresponding |threading_.sb_state| entry to kSuperBlockStateDecoded. On
+  // failure, |threading_.abort| will be set to true. If at any point
+  // |threading_.abort| becomes true, this function will return as early as it
+  // can. If the decoding succeeds, this function will also schedule the
+  // decoding jobs for the superblock to the bottom-left and the superblock to
+  // the right of this superblock (if it is allowed).
+  void DecodeSuperBlock(int row_index, int column_index, int block_width4x4);
+
+  // If |use_intra_prediction_buffer_| is true, then this function copies the
+  // last row of the superblock row starting at |row4x4| into the
+  // |intra_prediction_buffer_| (which may be used by the intra prediction
+  // process for the next superblock row).
+  void PopulateIntraPredictionBuffer(int row4x4);
+
+  uint16_t* GetPartitionCdf(int row4x4, int column4x4, BlockSize block_size);
+  bool ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+                     bool has_rows, bool has_columns, Partition* partition);
+  // Processes the Partition starting at |row4x4_start|, |column4x4_start|
+  // iteratively. It performs a DFS traversal over the partition tree to process
+  // the blocks in the right order.
+  bool ProcessPartition(
+      int row4x4_start, int column4x4_start, TileScratchBuffer* scratch_buffer,
+      ResidualPtr* residual);  // Iterative implementation of 5.11.4.
+  bool ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+                    TileScratchBuffer* scratch_buffer,
+                    ResidualPtr* residual);   // 5.11.5.
+  void ResetCdef(int row4x4, int column4x4);  // 5.11.55.
+
+  // This function is used to decode a superblock when the parsing has already
+  // been done for that superblock.
+  bool DecodeSuperBlock(int sb_row_index, int sb_column_index,
+                        TileScratchBuffer* scratch_buffer);
+  // Helper function used by DecodeSuperBlock(). Note that the decode_block()
+  // function in the spec is equivalent to ProcessBlock() in the code.
+  bool DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
+                   TileScratchBuffer* scratch_buffer, ResidualPtr* residual);
+
+  void ClearBlockDecoded(TileScratchBuffer* scratch_buffer, int row4x4,
+                         int column4x4);  // 5.11.3.
+  bool ProcessSuperBlock(int row4x4, int column4x4,
+                         TileScratchBuffer* scratch_buffer,
+                         ProcessingMode mode);
+  void ResetLoopRestorationParams();
+  void ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+                                       BlockSize block_size);  // 5.11.57.
+
+  // Helper functions for DecodeBlock.
+  bool ReadSegmentId(const Block& block);       // 5.11.9.
+  bool ReadIntraSegmentId(const Block& block);  // 5.11.8.
+  void ReadSkip(const Block& block);            // 5.11.11.
+  bool ReadSkipMode(const Block& block);        // 5.11.10.
+  void ReadCdef(const Block& block);            // 5.11.56.
+  // Returns the new value. |cdf| is an array of size kDeltaSymbolCount + 1.
+  int ReadAndClipDelta(uint16_t* cdf, int delta_small, int scale, int min_value,
+                       int max_value, int value);
+  void ReadQuantizerIndexDelta(const Block& block);  // 5.11.12.
+  void ReadLoopFilterDelta(const Block& block);      // 5.11.13.
+  // Populates |BlockParameters::deblock_filter_level| for the given |block|
+  // using |deblock_filter_levels_|.
+  void PopulateDeblockFilterLevel(const Block& block);
+  void PopulateCdefSkip(const Block& block);
+  void ReadPredictionModeY(const Block& block, bool intra_y_mode);
+  void ReadIntraAngleInfo(const Block& block,
+                          PlaneType plane_type);  // 5.11.42 and 5.11.43.
+  void ReadPredictionModeUV(const Block& block);
+  void ReadCflAlpha(const Block& block);  // 5.11.45.
+  int GetPaletteCache(const Block& block, PlaneType plane_type,
+                      uint16_t* cache);
+  void ReadPaletteColors(const Block& block, Plane plane);
+  void ReadPaletteModeInfo(const Block& block);      // 5.11.46.
+  void ReadFilterIntraModeInfo(const Block& block);  // 5.11.24.
+  int ReadMotionVectorComponent(const Block& block,
+                                int component);                // 5.11.32.
+  void ReadMotionVector(const Block& block, int index);        // 5.11.31.
+  bool DecodeIntraModeInfo(const Block& block);                // 5.11.7.
+  int8_t ComputePredictedSegmentId(const Block& block) const;  // 5.11.21.
+  bool ReadInterSegmentId(const Block& block, bool pre_skip);  // 5.11.19.
+  void ReadIsInter(const Block& block, bool skip_mode);        // 5.11.20.
+  bool ReadIntraBlockModeInfo(const Block& block,
+                              bool intra_y_mode);  // 5.11.22.
+  CompoundReferenceType ReadCompoundReferenceType(const Block& block);
+  template <bool is_single, bool is_backward, int index>
+  uint16_t* GetReferenceCdf(const Block& block, CompoundReferenceType type =
+                                                    kNumCompoundReferenceTypes);
+  void ReadReferenceFrames(const Block& block, bool skip_mode);  // 5.11.25.
+  void ReadInterPredictionModeY(const Block& block,
+                                const MvContexts& mode_contexts,
+                                bool skip_mode);
+  void ReadRefMvIndex(const Block& block);
+  void ReadInterIntraMode(const Block& block, bool is_compound,
+                          bool skip_mode);        // 5.11.28.
+  bool IsScaled(ReferenceFrameType type) const {  // Part of 5.11.27.
+    const int index =
+        frame_header_.reference_frame_index[type - kReferenceFrameLast];
+    return reference_frames_[index]->upscaled_width() != frame_header_.width ||
+           reference_frames_[index]->frame_height() != frame_header_.height;
+  }
+  void ReadMotionMode(const Block& block, bool is_compound,
+                      bool skip_mode);  // 5.11.27.
+  uint16_t* GetIsExplicitCompoundTypeCdf(const Block& block);
+  uint16_t* GetIsCompoundTypeAverageCdf(const Block& block);
+  void ReadCompoundType(const Block& block, bool is_compound, bool skip_mode,
+                        bool* is_explicit_compound_type,
+                        bool* is_compound_type_average);  // 5.11.29.
+  uint16_t* GetInterpolationFilterCdf(const Block& block, int direction);
+  void ReadInterpolationFilter(const Block& block, bool skip_mode);
+  bool ReadInterBlockModeInfo(const Block& block, bool skip_mode);  // 5.11.23.
+  bool DecodeInterModeInfo(const Block& block);                     // 5.11.18.
+  bool DecodeModeInfo(const Block& block);                          // 5.11.6.
+  bool IsMvValid(const Block& block, bool is_compound) const;       // 6.10.25.
+  bool AssignInterMv(const Block& block, bool is_compound);         // 5.11.26.
+  bool AssignIntraMv(const Block& block);                           // 5.11.26.
+  int GetTopTransformWidth(const Block& block, int row4x4, int column4x4,
+                           bool ignore_skip);
+  int GetLeftTransformHeight(const Block& block, int row4x4, int column4x4,
+                             bool ignore_skip);
+  TransformSize ReadFixedTransformSize(const Block& block);  // 5.11.15.
+  // Iterative implementation of 5.11.17.
+  void ReadVariableTransformTree(const Block& block, int row4x4, int column4x4,
+                                 TransformSize tx_size);
+  void DecodeTransformSize(const Block& block);  // 5.11.16.
+  bool ComputePrediction(const Block& block);    // 5.11.33.
+  // |x4| and |y4| are the column and row positions of the 4x4 block. |w4| and
+  // |h4| are the width and height in 4x4 units of |tx_size|.
+  int GetTransformAllZeroContext(const Block& block, Plane plane,
+                                 TransformSize tx_size, int x4, int y4, int w4,
+                                 int h4);
+  TransformSet GetTransformSet(TransformSize tx_size,
+                               bool is_inter) const;  // 5.11.48.
+  TransformType ComputeTransformType(const Block& block, Plane plane,
+                                     TransformSize tx_size, int block_x,
+                                     int block_y);  // 5.11.40.
+  void ReadTransformType(const Block& block, int x4, int y4,
+                         TransformSize tx_size);  // 5.11.47.
+  template <typename ResidualType>
+  void ReadCoeffBase2D(
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
+      uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
+  template <typename ResidualType>
+  void ReadCoeffBaseHorizontal(
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
+      uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
+  template <typename ResidualType>
+  void ReadCoeffBaseVertical(
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
+      uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
+  int GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane);
+  void SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+                          uint8_t coefficient_level, int8_t dc_category);
+  void InterIntraPrediction(
+      uint16_t* prediction_0, const uint8_t* prediction_mask,
+      ptrdiff_t prediction_mask_stride,
+      const PredictionParameters& prediction_parameters, int prediction_width,
+      int prediction_height, int subsampling_x, int subsampling_y,
+      uint8_t* dest,
+      ptrdiff_t dest_stride);  // Part of section 7.11.3.1 in the spec.
+  void CompoundInterPrediction(
+      const Block& block, const uint8_t* prediction_mask,
+      ptrdiff_t prediction_mask_stride, int prediction_width,
+      int prediction_height, int subsampling_x, int subsampling_y,
+      int candidate_row, int candidate_column, uint8_t* dest,
+      ptrdiff_t dest_stride);  // Part of section 7.11.3.1 in the spec.
+  GlobalMotion* GetWarpParams(const Block& block, Plane plane,
+                              int prediction_width, int prediction_height,
+                              const PredictionParameters& prediction_parameters,
+                              ReferenceFrameType reference_type,
+                              bool* is_local_valid,
+                              GlobalMotion* global_motion_params,
+                              GlobalMotion* local_warp_params)
+      const;  // Part of section 7.11.3.1 in the spec.
+  bool InterPrediction(const Block& block, Plane plane, int x, int y,
+                       int prediction_width, int prediction_height,
+                       int candidate_row, int candidate_column,
+                       bool* is_local_valid,
+                       GlobalMotion* local_warp_params);  // 7.11.3.1.
+  void ScaleMotionVector(const MotionVector& mv, Plane plane,
+                         int reference_frame_index, int x, int y, int* start_x,
+                         int* start_y, int* step_x, int* step_y);  // 7.11.3.3.
+  // If the method returns false, the caller only uses the output parameters
+  // *ref_block_start_x and *ref_block_start_y. If the method returns true, the
+  // caller uses all four output parameters.
+  static bool GetReferenceBlockPosition(
+      int reference_frame_index, bool is_scaled, int width, int height,
+      int ref_start_x, int ref_last_x, int ref_start_y, int ref_last_y,
+      int start_x, int start_y, int step_x, int step_y, int left_border,
+      int right_border, int top_border, int bottom_border,
+      int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x,
+      int* ref_block_end_y);
+
+  template <typename Pixel>
+  void BuildConvolveBlock(Plane plane, int reference_frame_index,
+                          bool is_scaled, int height, int ref_start_x,
+                          int ref_last_x, int ref_start_y, int ref_last_y,
+                          int step_y, int ref_block_start_x,
+                          int ref_block_end_x, int ref_block_start_y,
+                          uint8_t* block_buffer,
+                          ptrdiff_t convolve_buffer_stride,
+                          ptrdiff_t block_extended_width);
+  bool BlockInterPrediction(const Block& block, Plane plane,
+                            int reference_frame_index, const MotionVector& mv,
+                            int x, int y, int width, int height,
+                            int candidate_row, int candidate_column,
+                            uint16_t* prediction, bool is_compound,
+                            bool is_inter_intra, uint8_t* dest,
+                            ptrdiff_t dest_stride);  // 7.11.3.4.
+  bool BlockWarpProcess(const Block& block, Plane plane, int index,
+                        int block_start_x, int block_start_y, int width,
+                        int height, GlobalMotion* warp_params, bool is_compound,
+                        bool is_inter_intra, uint8_t* dest,
+                        ptrdiff_t dest_stride);  // 7.11.3.5.
+  bool ObmcBlockPrediction(const Block& block, const MotionVector& mv,
+                           Plane plane, int reference_frame_index, int width,
+                           int height, int x, int y, int candidate_row,
+                           int candidate_column,
+                           ObmcDirection blending_direction);
+  bool ObmcPrediction(const Block& block, Plane plane, int width,
+                      int height);  // 7.11.3.9.
+  void DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
+                                  int width, int height, int candidate_row,
+                                  int candidate_column, uint8_t* dest,
+                                  ptrdiff_t dest_stride);  // 7.11.3.15.
+  // This function specializes the parsing of the DC coefficient by removing
+  // some of the branches when i == 0 (since scan[0] is always 0 and scan[i] is
+  // always non-zero for all other possible values of i). |dc_category| is an
+  // output parameter that is populated when |is_dc_coefficient| is true.
+  // |coefficient_level| is an output parameter which accumulates the
+  // coefficient level.
+  template <typename ResidualType, bool is_dc_coefficient>
+  LIBGAV1_ALWAYS_INLINE bool ReadSignAndApplyDequantization(
+      const uint16_t* scan, int i, int q_value, const uint8_t* quantizer_matrix,
+      int shift, int max_value, uint16_t* dc_sign_cdf, int8_t* dc_category,
+      int* coefficient_level,
+      ResidualType* residual_buffer);     // Part of 5.11.39.
+  int ReadCoeffBaseRange(uint16_t* cdf);  // Part of 5.11.39.
+  // Returns the number of non-zero coefficients that were read. |tx_type| is an
+  // output parameter that stores the computed transform type for the plane
+  // whose coefficients were read. Returns -1 on failure.
+  template <typename ResidualType>
+  int ReadTransformCoefficients(const Block& block, Plane plane, int start_x,
+                                int start_y, TransformSize tx_size,
+                                TransformType* tx_type);  // 5.11.39.
+  bool TransformBlock(const Block& block, Plane plane, int base_x, int base_y,
+                      TransformSize tx_size, int x, int y,
+                      ProcessingMode mode);  // 5.11.35.
+  // Iterative implementation of 5.11.36.
+  bool TransformTree(const Block& block, int start_x, int start_y,
+                     BlockSize plane_size, ProcessingMode mode);
+  void ReconstructBlock(const Block& block, Plane plane, int start_x,
+                        int start_y, TransformSize tx_size,
+                        TransformType tx_type,
+                        int non_zero_coeff_count);         // Part of 7.12.3.
+  bool Residual(const Block& block, ProcessingMode mode);  // 5.11.34.
+  // Part of 5.11.5 (reset_block_context() in the spec).
+  void ResetEntropyContext(const Block& block);
+  // Populates the |color_context| and |color_order| for the |i|th iteration
+  // with entries counting down from |start| to |end| (|start| > |end|).
+  void PopulatePaletteColorContexts(
+      const Block& block, PlaneType plane_type, int i, int start, int end,
+      uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize],
+      uint8_t color_context[kMaxPaletteSquare]);  // 5.11.50.
+  bool ReadPaletteTokens(const Block& block);     // 5.11.49.
+  template <typename Pixel>
+  void IntraPrediction(const Block& block, Plane plane, int x, int y,
+                       bool has_left, bool has_top, bool has_top_right,
+                       bool has_bottom_left, PredictionMode mode,
+                       TransformSize tx_size);
+  int GetIntraEdgeFilterType(const Block& block,
+                             Plane plane) const;  // 7.11.2.8.
+  template <typename Pixel>
+  void DirectionalPrediction(const Block& block, Plane plane, int x, int y,
+                             bool has_left, bool has_top, bool needs_left,
+                             bool needs_top, int prediction_angle, int width,
+                             int height, int max_x, int max_y,
+                             TransformSize tx_size, Pixel* top_row,
+                             Pixel* left_column);  // 7.11.2.4.
+  template <typename Pixel>
+  void PalettePrediction(const Block& block, Plane plane, int start_x,
+                         int start_y, int x, int y,
+                         TransformSize tx_size);  // 7.11.4.
+  template <typename Pixel>
+  void ChromaFromLumaPrediction(const Block& block, Plane plane, int start_x,
+                                int start_y,
+                                TransformSize tx_size);  // 7.11.5.
+  // Section 7.19. Applies some filtering and reordering to the motion vectors
+  // for the given |block| and stores them into |current_frame_|.
+  void StoreMotionFieldMvsIntoCurrentFrame(const Block& block);
+
+  // SetCdfContext*() functions will populate the |left_context_| and
+  // |top_context_| for the |block|.
+  void SetCdfContextUsePredictedSegmentId(const Block& block,
+                                          bool use_predicted_segment_id);
+  void SetCdfContextCompoundType(const Block& block,
+                                 bool is_explicit_compound_type,
+                                 bool is_compound_type_average);
+  void SetCdfContextSkipMode(const Block& block, bool skip_mode);
+  void SetCdfContextPaletteSize(const Block& block);
+  void SetCdfContextUVMode(const Block& block);
+
+  // Returns the zero-based index of the super block that contains |row4x4|
+  // relative to the start of this tile.
+  int SuperBlockRowIndex(int row4x4) const {
+    return (row4x4 - row4x4_start_) >>
+           (sequence_header_.use_128x128_superblock ? 5 : 4);
+  }
+
+  // Returns the zero-based index of the super block that contains |column4x4|
+  // relative to the start of this tile.
+  int SuperBlockColumnIndex(int column4x4) const {
+    return (column4x4 - column4x4_start_) >>
+           (sequence_header_.use_128x128_superblock ? 5 : 4);
+  }
+
+  // Returns the zero-based index of the block that starts at row4x4 or
+  // column4x4 relative to the start of the superblock that contains the block.
+  // This is used to index into the members of |left_context_| and
+  // |top_context_|.
+  int CdfContextIndex(int row_or_column4x4) const {
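+    // This is equivalent to row_or_column4x4 & 31 (or & 15). For example, with
+    // 128x128 superblocks, row4x4 = 37 maps to context index 37 & 31 = 5.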
+    return row_or_column4x4 -
+           (row_or_column4x4 &
+            (sequence_header_.use_128x128_superblock ? ~31 : ~15));
+  }
+
+  BlockSize SuperBlockSize() const {
+    return sequence_header_.use_128x128_superblock ? kBlock128x128
+                                                   : kBlock64x64;
+  }
+  int PlaneCount() const {
+    return sequence_header_.color_config.is_monochrome ? kMaxPlanesMonochrome
+                                                       : kMaxPlanes;
+  }
+
+  const int number_;
+  const int row_;
+  const int column_;
+  const uint8_t* const data_;
+  size_t size_;
+  int row4x4_start_;
+  int row4x4_end_;
+  int column4x4_start_;
+  int column4x4_end_;
+  int superblock_rows_;
+  int superblock_columns_;
+  bool read_deltas_;
+  const int8_t subsampling_x_[kMaxPlanes];
+  const int8_t subsampling_y_[kMaxPlanes];
+
+  // The dimensions (in order) are: segment_id, level_index (based on plane and
+  // direction), reference_frame and mode_id.
+  uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+                                [kNumReferenceFrameTypes][2];
+
+  // current_quantizer_index_ is in the range [0, 255].
+  uint8_t current_quantizer_index_;
+  // These two arrays (|coefficient_levels_| and |dc_categories_|) are used to
+  // store the entropy context. Their dimensions are as follows: First -
+  // left/top; Second - plane; Third - row4x4 (if first dimension is
+  // left)/column4x4 (if first dimension is top).
+  //
+  // This is equivalent to the LeftLevelContext and AboveLevelContext arrays in
+  // the spec. In the spec, it stores values from 0 through 63 (inclusive). The
+  // stored values are used to compute the left and top contexts in
+  // GetTransformAllZeroContext. In that function, we only care about the
+  // following values: 0, 1, 2, 3 and >= 4. So instead of clamping to 63, we
+  // clamp to 4 (i.e., all the values greater than 4 are stored as 4).
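+  // For example, a raw level of 9 is stored as 4, since all values >= 4 select
+  // the same context in GetTransformAllZeroContext().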
+  std::array<Array2D<uint8_t>, 2> coefficient_levels_;
+  // This is equivalent to the LeftDcContext and AboveDcContext arrays in the
+  // spec. In the spec, it can store 3 possible values: 0, 1 and 2 (where 1
+  // means the value is < 0, 2 means the value is > 0 and 0 means the value is
+  // equal to 0).
+  //
+  // The stored values are used in two places:
+  //  * GetTransformAllZeroContext: Here, we only care about whether the
+  //  value is 0 or not (whether it is 1 or 2 is irrelevant).
+  //  * GetDcSignContext: Here, we do the following computation: if the
+  //  stored value is 1, we decrement a counter. If the stored value is 2
+  //  we increment a counter.
+  //
+  // Based on this usage, we can simply replace 1 with -1 and 2 with 1 and
+  // use that value to compute the counter.
+  //
+  // The usage on GetTransformAllZeroContext is unaffected since there we
+  // only care about whether it is 0 or not.
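+  //
+  // For example, stored values of {-1, 1, 1, 0} contribute -1 + 1 + 1 + 0 = 1
+  // to the counter in GetDcSignContext(), i.e. one more positive than negative
+  // DC value among the neighboring blocks.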
+  std::array<Array2D<int8_t>, 2> dc_categories_;
+  const ObuSequenceHeader& sequence_header_;
+  const ObuFrameHeader& frame_header_;
+  const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias_;
+  const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+      reference_frames_;
+  TemporalMotionField& motion_field_;
+  const std::array<uint8_t, kNumReferenceFrameTypes>& reference_order_hint_;
+  const WedgeMaskArray& wedge_masks_;
+  const QuantizerMatrix& quantizer_matrix_;
+  EntropyDecoder reader_;
+  SymbolDecoderContext symbol_decoder_context_;
+  SymbolDecoderContext* const saved_symbol_decoder_context_;
+  const SegmentationMap* prev_segment_ids_;
+  const dsp::Dsp& dsp_;
+  PostFilter& post_filter_;
+  BlockParametersHolder& block_parameters_holder_;
+  Quantizer quantizer_;
+  // When there is no multi-threading within the Tile, |residual_buffer_| is
+  // used. When there is multi-threading within the Tile,
+  // |residual_buffer_threaded_| is used. In the following comment,
+  // |residual_buffer| refers to either |residual_buffer_| or
+  // |residual_buffer_threaded_| depending on whether multi-threading is enabled
+  // within the Tile or not.
+  // The |residual_buffer| is used to help with the dequantization and the
+  // inverse transform processes. It is declared as a uint8_t, but is always
+  // accessed either as an int16_t or int32_t depending on |bitdepth|. Here is
+  // what it stores at various stages of the decoding process (in the order
+  // which they happen):
+  //   1) In ReadTransformCoefficients(), this buffer is used to store the
+  //   dequantized values.
+  //   2) In Reconstruct(), this buffer is used as the input to the row
+  //   transform process.
+  // The size of this buffer would be:
+  //    For |residual_buffer_|: (4096 + 32 * |kResidualPaddingVertical|) *
+  //        |residual_size_|. Where 4096 = 64x64 which is the maximum transform
+  //        size, and 32 * |kResidualPaddingVertical| is the padding to avoid
+  //        bottom boundary checks when parsing quantized coefficients. This
+  //        memory is allocated and owned by the Tile class.
+  //    For |residual_buffer_threaded_|: See the comment below. This memory is
+  //        not allocated or owned by the Tile class.
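+  //    For example, with bitdepth 8, |residual_size_| is sizeof(int16_t) == 2
+  //        and, assuming |kResidualPaddingVertical| is 4, |residual_buffer_|
+  //        holds (4096 + 32 * 4) * 2 = 8448 bytes.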
+  AlignedUniquePtr<uint8_t> residual_buffer_;
+  // This is a 2d array of pointers of size |superblock_rows_| by
+  // |superblock_columns_| where each pointer points to a ResidualBuffer for a
+  // single super block. The array is populated when the parsing process begins
+  // by calling |residual_buffer_pool_->Get()| and the memory is released back
+  // to the pool by calling |residual_buffer_pool_->Release()| when the decoding
+  // process is complete.
+  Array2D<std::unique_ptr<ResidualBuffer>> residual_buffer_threaded_;
+  // sizeof(int16_t or int32_t) depending on |bitdepth|.
+  const size_t residual_size_;
+  // Number of superblocks on the top-right that will have to be decoded before
+  // the current superblock can be decoded. This will be 1 if allow_intrabc is
+  // false. If allow_intrabc is true, then this value will be
+  // use_128x128_superblock ? 3 : 5. This is the allowed range of reference for
+  // the top rows for intrabc.
+  const int intra_block_copy_lag_;
+
+  // In the Tile class, we use the "current_frame" in two ways:
+  //   1) To write the decoded output into (using the |buffer_| view).
+  //   2) To read the pixels for intra block copy (using the |current_frame_|
+  //      reference).
+  //
+  // When intra block copy is off, |buffer_| and |current_frame_| may or may not
+  // point to the same plane pointers. But it is okay since |current_frame_| is
+  // never used in this case.
+  //
+  // When intra block copy is on, |buffer_| and |current_frame_| always point to
+  // the same plane pointers (since post filtering is disabled). So the usage in
+  // both case 1 and case 2 remains valid.
+  Array2DView<uint8_t> buffer_[kMaxPlanes];
+  RefCountedBuffer& current_frame_;
+
+  Array2D<int8_t>& cdef_index_;
+  Array2D<uint8_t>& cdef_skip_;
+  Array2D<TransformSize>& inter_transform_sizes_;
+  std::array<RestorationUnitInfo, kMaxPlanes> reference_unit_info_;
+  // If |thread_pool_| is nullptr, the calling thread will do the parsing and
+  // the decoding in one pass. If |thread_pool_| is not nullptr, then the main
+  // thread will do the parsing while the thread pool workers will do the
+  // decoding.
+  ThreadPool* const thread_pool_;
+  ThreadingParameters threading_;
+  ResidualBufferPool* const residual_buffer_pool_;
+  TileScratchBufferPool* const tile_scratch_buffer_pool_;
+  BlockingCounterWithStatus* const pending_tiles_;
+  bool split_parse_and_decode_;
+  // This is used only when |split_parse_and_decode_| is false.
+  std::unique_ptr<PredictionParameters> prediction_parameters_ = nullptr;
+  // Stores the |transform_type| for the super block being decoded at a 4x4
+  // granularity. The spec uses absolute indices for this array but it is
+  // sufficient to use indices relative to the super block being decoded.
+  TransformType transform_types_[32][32];
+  // delta_lf_[i] is in the range [-63, 63].
+  int8_t delta_lf_[kFrameLfCount];
+  // True if all the values in |delta_lf_| are zero. False otherwise.
+  bool delta_lf_all_zero_;
+  const bool frame_parallel_;
+  const bool use_intra_prediction_buffer_;
+  // Buffer used to store the unfiltered pixels that are necessary for decoding
+  // the next superblock row (for the intra prediction process). Used only if
+  // |use_intra_prediction_buffer_| is true. The |frame_scratch_buffer| contains
+  // one row buffer for each tile row. This tile will have to use the buffer
+  // corresponding to this tile's row.
+  IntraPredictionBuffer* const intra_prediction_buffer_;
+  // Stores the progress of the reference frames. This will be used to avoid
+  // unnecessary calls into RefCountedBuffer::WaitUntil().
+  std::array<int, kNumReferenceFrameTypes> reference_frame_progress_cache_;
+  // Stores the CDF contexts necessary for the "left" block.
+  BlockCdfContext left_context_;
+  // Stores the CDF contexts necessary for the "top" block. The size of this
+  // buffer is the number of superblock columns in this tile. Each block
+  // accesses the entry at its corresponding SuperBlockColumnIndex().
+  DynamicBuffer<BlockCdfContext> top_context_;
+};
+
+struct Tile::Block {
+  Block(Tile* tile_ptr, BlockSize size, int row4x4, int column4x4,
+        TileScratchBuffer* const scratch_buffer, ResidualPtr* residual)
+      : tile(*tile_ptr),
+        size(size),
+        row4x4(row4x4),
+        column4x4(column4x4),
+        width(kBlockWidthPixels[size]),
+        height(kBlockHeightPixels[size]),
+        width4x4(width >> 2),
+        height4x4(height >> 2),
+        scratch_buffer(scratch_buffer),
+        residual(residual),
+        top_context(tile.top_context_.get() +
+                    tile.SuperBlockColumnIndex(column4x4)),
+        top_context_index(tile.CdfContextIndex(column4x4)),
+        left_context_index(tile.CdfContextIndex(row4x4)) {
+    assert(size != kBlockInvalid);
+    residual_size[kPlaneY] = kPlaneResidualSize[size][0][0];
+    residual_size[kPlaneU] = residual_size[kPlaneV] =
+        kPlaneResidualSize[size][tile.subsampling_x_[kPlaneU]]
+                          [tile.subsampling_y_[kPlaneU]];
+    assert(residual_size[kPlaneY] != kBlockInvalid);
+    if (tile.PlaneCount() > 1) {
+      assert(residual_size[kPlaneU] != kBlockInvalid);
+    }
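+    // Note: the subsampling values are 0 or 1 and |width4x4| / |height4x4| are
+    // powers of two, so (subsampling & dimension4x4) == 1 holds only when the
+    // plane is subsampled and the block is a single 4x4 unit in that
+    // dimension; such a block at an even row/column carries no chroma of its
+    // own.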
+    if ((row4x4 & 1) == 0 &&
+        (tile.sequence_header_.color_config.subsampling_y & height4x4) == 1) {
+      has_chroma = false;
+    } else if ((column4x4 & 1) == 0 &&
+               (tile.sequence_header_.color_config.subsampling_x & width4x4) ==
+                   1) {
+      has_chroma = false;
+    } else {
+      has_chroma = !tile.sequence_header_.color_config.is_monochrome;
+    }
+    top_available[kPlaneY] = tile.IsTopInside(row4x4);
+    left_available[kPlaneY] = tile.IsLeftInside(column4x4);
+    if (has_chroma) {
+      // top_available[kPlaneU] and top_available[kPlaneV] are valid only if
+      // has_chroma is true.
+      // The next 3 lines are equivalent to:
+      // top_available[kPlaneU] = top_available[kPlaneV] =
+      //     top_available[kPlaneY] &&
+      //     ((tile.sequence_header_.color_config.subsampling_y & height4x4) ==
+      //     0 || tile.IsTopInside(row4x4 - 1));
+      top_available[kPlaneU] = top_available[kPlaneV] = tile.IsTopInside(
+          row4x4 -
+          (tile.sequence_header_.color_config.subsampling_y & height4x4));
+      // left_available[kPlaneU] and left_available[kPlaneV] are valid only if
+      // has_chroma is true.
+      // The next 3 lines are equivalent to:
+      // left_available[kPlaneU] = left_available[kPlaneV] =
+      //     left_available[kPlaneY] &&
+      //     ((tile.sequence_header_.color_config.subsampling_x & width4x4) == 0
+      //      || tile.IsLeftInside(column4x4 - 1));
+      left_available[kPlaneU] = left_available[kPlaneV] = tile.IsLeftInside(
+          column4x4 -
+          (tile.sequence_header_.color_config.subsampling_x & width4x4));
+    }
+    const ptrdiff_t stride = tile.BlockParametersStride();
+    BlockParameters** const bps =
+        tile.BlockParametersAddress(row4x4, column4x4);
+    bp = *bps;
+    // bp_top is valid only if top_available[kPlaneY] is true.
+    if (top_available[kPlaneY]) {
+      bp_top = *(bps - stride);
+    }
+    // bp_left is valid only if left_available[kPlaneY] is true.
+    if (left_available[kPlaneY]) {
+      bp_left = *(bps - 1);
+    }
+  }
+
+  bool HasChroma() const { return has_chroma; }
+
+  // The return values of this group of functions are valid only if the
+  // corresponding top_available or left_available is true.
+  ReferenceFrameType TopReference(int index) const {
+    return bp_top->reference_frame[index];
+  }
+
+  ReferenceFrameType LeftReference(int index) const {
+    return bp_left->reference_frame[index];
+  }
+
+  bool IsTopIntra() const { return TopReference(0) <= kReferenceFrameIntra; }
+  bool IsLeftIntra() const { return LeftReference(0) <= kReferenceFrameIntra; }
+
+  bool IsTopSingle() const { return TopReference(1) <= kReferenceFrameIntra; }
+  bool IsLeftSingle() const { return LeftReference(1) <= kReferenceFrameIntra; }
+
+  int CountReferences(ReferenceFrameType type) const {
+    return static_cast<int>(top_available[kPlaneY] &&
+                            bp_top->reference_frame[0] == type) +
+           static_cast<int>(top_available[kPlaneY] &&
+                            bp_top->reference_frame[1] == type) +
+           static_cast<int>(left_available[kPlaneY] &&
+                            bp_left->reference_frame[0] == type) +
+           static_cast<int>(left_available[kPlaneY] &&
+                            bp_left->reference_frame[1] == type);
+  }
+
+  // 7.10.3.
+  // Checks if there are any inter blocks to the left or above. If so, it
+  // returns true indicating that the block has neighbors that are suitable for
+  // use by overlapped motion compensation.
+  bool HasOverlappableCandidates() const {
+    const ptrdiff_t stride = tile.BlockParametersStride();
+    BlockParameters** const bps = tile.BlockParametersAddress(0, 0);
+    if (top_available[kPlaneY]) {
+      BlockParameters** bps_top = bps + (row4x4 - 1) * stride + (column4x4 | 1);
+      const int columns = std::min(tile.frame_header_.columns4x4 - column4x4,
+                                   static_cast<int>(width4x4));
+      BlockParameters** const bps_top_end = bps_top + columns;
+      do {
+        if ((*bps_top)->reference_frame[0] > kReferenceFrameIntra) {
+          return true;
+        }
+        bps_top += 2;
+      } while (bps_top < bps_top_end);
+    }
+    if (left_available[kPlaneY]) {
+      BlockParameters** bps_left = bps + (row4x4 | 1) * stride + column4x4 - 1;
+      const int rows = std::min(tile.frame_header_.rows4x4 - row4x4,
+                                static_cast<int>(height4x4));
+      BlockParameters** const bps_left_end = bps_left + rows * stride;
+      do {
+        if ((*bps_left)->reference_frame[0] > kReferenceFrameIntra) {
+          return true;
+        }
+        bps_left += 2 * stride;
+      } while (bps_left < bps_left_end);
+    }
+    return false;
+  }
+
+  Tile& tile;
+  bool has_chroma;
+  const BlockSize size;
+  bool top_available[kMaxPlanes];
+  bool left_available[kMaxPlanes];
+  BlockSize residual_size[kMaxPlanes];
+  const int row4x4;
+  const int column4x4;
+  const int width;
+  const int height;
+  const int width4x4;
+  const int height4x4;
+  const BlockParameters* bp_top;
+  const BlockParameters* bp_left;
+  BlockParameters* bp;
+  TileScratchBuffer* const scratch_buffer;
+  ResidualPtr* const residual;
+  BlockCdfContext* const top_context;
+  const int top_context_index;
+  const int left_context_index;
+};
+
+extern template bool
+Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+extern template bool
+Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_TILE_H_
diff --git a/src/tile/bitstream/mode_info.cc b/src/tile/bitstream/mode_info.cc
new file mode 100644
index 0000000..ffbbf64
--- /dev/null
@@ -0,0 +1,1446 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kDeltaQSmall = 3;
+constexpr int kDeltaLfSmall = 3;
+
+constexpr uint8_t kIntraYModeContext[kIntraPredictionModesY] = {
+    0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0};
+
+constexpr uint8_t kSizeGroup[kMaxBlockSizes] = {
+    0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 3, 3};
+
+constexpr int kCompoundModeNewMvContexts = 5;
+constexpr uint8_t kCompoundModeContextMap[3][kCompoundModeNewMvContexts] = {
+    {0, 1, 1, 1, 1}, {1, 2, 3, 4, 4}, {4, 4, 5, 6, 7}};
+
+enum CflSign : uint8_t {
+  kCflSignZero = 0,
+  kCflSignNegative = 1,
+  kCflSignPositive = 2
+};
+
+// For each possible value of the combined signs (which is read from the
+// bitstream), this array stores the following: sign_u, sign_v, alpha_u_context,
+// alpha_v_context. Only positive entries are used. Entry at index i is computed
+// as follows:
+// sign_u = (i + 1) / 3
+// sign_v = (i + 1) % 3
+// alpha_u_context = i - 2
+// alpha_v_context = (sign_v - 1) * 3 + sign_u
+constexpr int8_t kCflAlphaLookup[kCflAlphaSignsSymbolCount][4] = {
+    {0, 1, -2, 0}, {0, 2, -1, 3}, {1, 0, 0, -2}, {1, 1, 1, 1},
+    {1, 2, 2, 4},  {2, 0, 3, -1}, {2, 1, 4, 2},  {2, 2, 5, 5},
+};
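+// For example, the entry at index i = 4 is {1, 2, 2, 4}: sign_u = 5 / 3 = 1
+// (negative), sign_v = 5 % 3 = 2 (positive), alpha_u_context = 4 - 2 = 2 and
+// alpha_v_context = (2 - 1) * 3 + 1 = 4.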
+
+constexpr BitMaskSet kPredictionModeHasNearMvMask(kPredictionModeNearMv,
+                                                  kPredictionModeNearNearMv,
+                                                  kPredictionModeNearNewMv,
+                                                  kPredictionModeNewNearMv);
+
+constexpr BitMaskSet kIsInterIntraModeAllowedMask(kBlock8x8, kBlock8x16,
+                                                  kBlock16x8, kBlock16x16,
+                                                  kBlock16x32, kBlock32x16,
+                                                  kBlock32x32);
+
+bool IsBackwardReference(ReferenceFrameType type) {
+  return type >= kReferenceFrameBackward && type <= kReferenceFrameAlternate;
+}
+
+bool IsSameDirectionReferencePair(ReferenceFrameType type1,
+                                  ReferenceFrameType type2) {
+  return (type1 >= kReferenceFrameBackward) ==
+         (type2 >= kReferenceFrameBackward);
+}
+
+// This is called neg_deinterleave() in the spec.
+int DecodeSegmentId(int diff, int reference, int max) {
+  if (reference == 0) return diff;
+  if (reference >= max - 1) return max - diff - 1;
+  const int value = ((diff & 1) != 0) ? reference + ((diff + 1) >> 1)
+                                      : reference - (diff >> 1);
+  const int reference2 = (reference << 1);
+  if (reference2 < max) {
+    return (diff <= reference2) ? value : diff;
+  }
+  return (diff <= ((max - reference - 1) << 1)) ? value : max - (diff + 1);
+}
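+// For example, with |reference| = 3 and |max| = 8, |diff| values 0 through 7
+// decode to segment ids 3, 4, 2, 5, 1, 6, 0 and 7: segment ids closest to
+// |reference| are coded with the smallest values of |diff|.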
+
+// This is called DrlCtxStack in section 7.10.2.14 of the spec.
+// In the spec, the weights of all the nearest mvs are incremented by a bonus
+// weight which is larger than any natural weight, and the weights of the mvs
+// are compared with this bonus weight to determine their contexts. We replace
+// this procedure by introducing |nearest_mv_count| in PredictionParameters,
+// which records the count of the nearest mvs. Since all the nearest mvs are in
+// the beginning of the mv stack, the |index| of a mv in the mv stack can be
+// compared with |nearest_mv_count| to get that mv's context.
+int GetRefMvIndexContext(int nearest_mv_count, int index) {
+  if (index + 1 < nearest_mv_count) {
+    return 0;
+  }
+  if (index + 1 == nearest_mv_count) {
+    return 1;
+  }
+  return 2;
+}
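+// For example, with |nearest_mv_count| = 2, the mvs at indices 0, 1, 2 and 3
+// get contexts 0, 1, 2 and 2 respectively.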
+
+// Returns true if both the width and height of the block are less than 64.
+bool IsBlockDimensionLessThan64(BlockSize size) {
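+  // kBlock16x64 is the only block size ordered before kBlock32x32 in the
+  // BlockSize enum that has a dimension equal to 64, hence the extra check.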
+  return size <= kBlock32x32 && size != kBlock16x64;
+}
+
+int GetUseCompoundReferenceContext(const Tile::Block& block) {
+  if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+    if (block.IsTopSingle() && block.IsLeftSingle()) {
+      return static_cast<int>(IsBackwardReference(block.TopReference(0))) ^
+             static_cast<int>(IsBackwardReference(block.LeftReference(0)));
+    }
+    if (block.IsTopSingle()) {
+      return 2 + static_cast<int>(IsBackwardReference(block.TopReference(0)) ||
+                                  block.IsTopIntra());
+    }
+    if (block.IsLeftSingle()) {
+      return 2 + static_cast<int>(IsBackwardReference(block.LeftReference(0)) ||
+                                  block.IsLeftIntra());
+    }
+    return 4;
+  }
+  if (block.top_available[kPlaneY]) {
+    return block.IsTopSingle()
+               ? static_cast<int>(IsBackwardReference(block.TopReference(0)))
+               : 3;
+  }
+  if (block.left_available[kPlaneY]) {
+    return block.IsLeftSingle()
+               ? static_cast<int>(IsBackwardReference(block.LeftReference(0)))
+               : 3;
+  }
+  return 1;
+}
+
+// Calculates count0 by calling block.CountReferences() on the frame types from
+// type0_start to type0_end, inclusive, and summing the results.
+// Calculates count1 by calling block.CountReferences() on the frame types from
+// type1_start to type1_end, inclusive, and summing the results.
+// Compares count0 with count1 and returns 0, 1 or 2.
+//
+// See count_refs and ref_count_ctx in 8.3.2.
+int GetReferenceContext(const Tile::Block& block,
+                        ReferenceFrameType type0_start,
+                        ReferenceFrameType type0_end,
+                        ReferenceFrameType type1_start,
+                        ReferenceFrameType type1_end) {
+  int count0 = 0;
+  int count1 = 0;
+  for (int type = type0_start; type <= type0_end; ++type) {
+    count0 += block.CountReferences(static_cast<ReferenceFrameType>(type));
+  }
+  for (int type = type1_start; type <= type1_end; ++type) {
+    count1 += block.CountReferences(static_cast<ReferenceFrameType>(type));
+  }
+  return (count0 < count1) ? 0 : (count0 == count1 ? 1 : 2);
+}
+
+}  // namespace
+
+bool Tile::ReadSegmentId(const Block& block) {
+  // These two asserts ensure that current_frame_.segmentation_map() is not
+  // nullptr.
+  assert(frame_header_.segmentation.enabled);
+  assert(frame_header_.segmentation.update_map);
+  const SegmentationMap& map = *current_frame_.segmentation_map();
+  int top_left = -1;
+  if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+    top_left = map.segment_id(block.row4x4 - 1, block.column4x4 - 1);
+  }
+  int top = -1;
+  if (block.top_available[kPlaneY]) {
+    top = map.segment_id(block.row4x4 - 1, block.column4x4);
+  }
+  int left = -1;
+  if (block.left_available[kPlaneY]) {
+    left = map.segment_id(block.row4x4, block.column4x4 - 1);
+  }
+  int pred;
+  if (top == -1) {
+    pred = (left == -1) ? 0 : left;
+  } else if (left == -1) {
+    pred = top;
+  } else {
+    pred = (top_left == top) ? top : left;
+  }
+  BlockParameters& bp = *block.bp;
+  if (bp.skip) {
+    bp.prediction_parameters->segment_id = pred;
+    return true;
+  }
+  int context = 0;
+  if (top_left < 0) {
+    context = 0;
+  } else if (top_left == top && top_left == left) {
+    context = 2;
+  } else if (top_left == top || top_left == left || top == left) {
+    context = 1;
+  }
+  uint16_t* const segment_id_cdf =
+      symbol_decoder_context_.segment_id_cdf[context];
+  const int encoded_segment_id =
+      reader_.ReadSymbol<kMaxSegments>(segment_id_cdf);
+  bp.prediction_parameters->segment_id =
+      DecodeSegmentId(encoded_segment_id, pred,
+                      frame_header_.segmentation.last_active_segment_id + 1);
+  // Check the bitstream conformance requirement in Section 6.10.8 of the spec.
+  if (bp.prediction_parameters->segment_id < 0 ||
+      bp.prediction_parameters->segment_id >
+          frame_header_.segmentation.last_active_segment_id) {
+    LIBGAV1_DLOG(
+        ERROR,
+        "Corrupted segment_ids: encoded %d, last active %d, postprocessed %d",
+        encoded_segment_id, frame_header_.segmentation.last_active_segment_id,
+        bp.prediction_parameters->segment_id);
+    return false;
+  }
+  return true;
+}
+
+bool Tile::ReadIntraSegmentId(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (!frame_header_.segmentation.enabled) {
+    bp.prediction_parameters->segment_id = 0;
+    return true;
+  }
+  return ReadSegmentId(block);
+}
+
+void Tile::ReadSkip(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (frame_header_.segmentation.segment_id_pre_skip &&
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureSkip)) {
+    bp.skip = true;
+    return;
+  }
+  int context = 0;
+  if (block.top_available[kPlaneY] && block.bp_top->skip) {
+    ++context;
+  }
+  if (block.left_available[kPlaneY] && block.bp_left->skip) {
+    ++context;
+  }
+  uint16_t* const skip_cdf = symbol_decoder_context_.skip_cdf[context];
+  bp.skip = reader_.ReadSymbol(skip_cdf);
+}
+
+bool Tile::ReadSkipMode(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (!frame_header_.skip_mode_present ||
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureSkip) ||
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id,
+          kSegmentFeatureReferenceFrame) ||
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv) ||
+      IsBlockDimension4(block.size)) {
+    return false;
+  }
+  const int context =
+      (block.left_available[kPlaneY]
+           ? static_cast<int>(left_context_.skip_mode[block.left_context_index])
+           : 0) +
+      (block.top_available[kPlaneY]
+           ? static_cast<int>(
+                 block.top_context->skip_mode[block.top_context_index])
+           : 0);
+  return reader_.ReadSymbol(symbol_decoder_context_.skip_mode_cdf[context]);
+}
+
+void Tile::ReadCdef(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (bp.skip || frame_header_.coded_lossless ||
+      !sequence_header_.enable_cdef || frame_header_.allow_intrabc ||
+      frame_header_.cdef.bits == 0) {
+    return;
+  }
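+  // cdef parameters are signaled once per 64x64 unit, i.e. once per 16 4x4
+  // blocks in each dimension, hence the DivideBy16() indexing below.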
+  int8_t* const cdef_index =
+      &cdef_index_[DivideBy16(block.row4x4)][DivideBy16(block.column4x4)];
+  const int stride = cdef_index_.columns();
+  if (cdef_index[0] == -1) {
+    cdef_index[0] =
+        static_cast<int8_t>(reader_.ReadLiteral(frame_header_.cdef.bits));
+    if (block.size == kBlock128x128) {
+      // This condition is shorthand for block.width4x4 > 16 && block.height4x4
+      // > 16.
+      cdef_index[1] = cdef_index[0];
+      cdef_index[stride] = cdef_index[0];
+      cdef_index[stride + 1] = cdef_index[0];
+    } else if (block.width4x4 > 16) {
+      cdef_index[1] = cdef_index[0];
+    } else if (block.height4x4 > 16) {
+      cdef_index[stride] = cdef_index[0];
+    }
+  }
+}
+
+int Tile::ReadAndClipDelta(uint16_t* const cdf, int delta_small, int scale,
+                           int min_value, int max_value, int value) {
+  int abs = reader_.ReadSymbol<kDeltaSymbolCount>(cdf);
+  if (abs == delta_small) {
+    const int remaining_bit_count =
+        static_cast<int>(reader_.ReadLiteral(3)) + 1;
+    const int abs_remaining_bits =
+        static_cast<int>(reader_.ReadLiteral(remaining_bit_count));
+    abs = abs_remaining_bits + (1 << remaining_bit_count) + 1;
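+    // For example, a 3-bit literal of 0 gives |remaining_bit_count| = 1 and an
+    // |abs| of 3 or 4; a literal of 7 extends the representable range up to
+    // 255 + 256 + 1 = 512.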
+  }
+  if (abs != 0) {
+    const bool sign = reader_.ReadBit() != 0;
+    const int scaled_abs = abs << scale;
+    const int reduced_delta = sign ? -scaled_abs : scaled_abs;
+    value += reduced_delta;
+    value = Clip3(value, min_value, max_value);
+  }
+  return value;
+}
+
+void Tile::ReadQuantizerIndexDelta(const Block& block) {
+  assert(read_deltas_);
+  BlockParameters& bp = *block.bp;
+  if (block.size == SuperBlockSize() && bp.skip) {
+    return;
+  }
+  current_quantizer_index_ =
+      ReadAndClipDelta(symbol_decoder_context_.delta_q_cdf, kDeltaQSmall,
+                       frame_header_.delta_q.scale, kMinLossyQuantizer,
+                       kMaxQuantizer, current_quantizer_index_);
+}
+
+void Tile::ReadLoopFilterDelta(const Block& block) {
+  assert(read_deltas_);
+  BlockParameters& bp = *block.bp;
+  if (!frame_header_.delta_lf.present ||
+      (block.size == SuperBlockSize() && bp.skip)) {
+    return;
+  }
+  int frame_lf_count = 1;
+  if (frame_header_.delta_lf.multi) {
+    frame_lf_count = kFrameLfCount - (PlaneCount() > 1 ? 0 : 2);
+  }
+  bool recompute_deblock_filter_levels = false;
+  for (int i = 0; i < frame_lf_count; ++i) {
+    uint16_t* const delta_lf_abs_cdf =
+        frame_header_.delta_lf.multi
+            ? symbol_decoder_context_.delta_lf_multi_cdf[i]
+            : symbol_decoder_context_.delta_lf_cdf;
+    const int8_t old_delta_lf = delta_lf_[i];
+    delta_lf_[i] = ReadAndClipDelta(
+        delta_lf_abs_cdf, kDeltaLfSmall, frame_header_.delta_lf.scale,
+        -kMaxLoopFilterValue, kMaxLoopFilterValue, delta_lf_[i]);
+    recompute_deblock_filter_levels =
+        recompute_deblock_filter_levels || (old_delta_lf != delta_lf_[i]);
+  }
+  delta_lf_all_zero_ =
+      (delta_lf_[0] | delta_lf_[1] | delta_lf_[2] | delta_lf_[3]) == 0;
+  if (!delta_lf_all_zero_ && recompute_deblock_filter_levels) {
+    post_filter_.ComputeDeblockFilterLevels(delta_lf_, deblock_filter_levels_);
+  }
+}
+
+void Tile::ReadPredictionModeY(const Block& block, bool intra_y_mode) {
+  uint16_t* cdf;
+  if (intra_y_mode) {
+    const PredictionMode top_mode =
+        block.top_available[kPlaneY] ? block.bp_top->y_mode : kPredictionModeDc;
+    const PredictionMode left_mode = block.left_available[kPlaneY]
+                                         ? block.bp_left->y_mode
+                                         : kPredictionModeDc;
+    const int top_context = kIntraYModeContext[top_mode];
+    const int left_context = kIntraYModeContext[left_mode];
+    cdf = symbol_decoder_context_
+              .intra_frame_y_mode_cdf[top_context][left_context];
+  } else {
+    cdf = symbol_decoder_context_.y_mode_cdf[kSizeGroup[block.size]];
+  }
+  block.bp->y_mode = static_cast<PredictionMode>(
+      reader_.ReadSymbol<kIntraPredictionModesY>(cdf));
+}
+
+void Tile::ReadIntraAngleInfo(const Block& block, PlaneType plane_type) {
+  BlockParameters& bp = *block.bp;
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  prediction_parameters.angle_delta[plane_type] = 0;
+  const PredictionMode mode = (plane_type == kPlaneTypeY)
+                                  ? bp.y_mode
+                                  : bp.prediction_parameters->uv_mode;
+  if (IsBlockSmallerThan8x8(block.size) || !IsDirectionalMode(mode)) return;
+  uint16_t* const cdf =
+      symbol_decoder_context_.angle_delta_cdf[mode - kPredictionModeVertical];
+  prediction_parameters.angle_delta[plane_type] =
+      reader_.ReadSymbol<kAngleDeltaSymbolCount>(cdf);
+  prediction_parameters.angle_delta[plane_type] -= kMaxAngleDelta;
+}
+
+void Tile::ReadCflAlpha(const Block& block) {
+  const int signs = reader_.ReadSymbol<kCflAlphaSignsSymbolCount>(
+      symbol_decoder_context_.cfl_alpha_signs_cdf);
+  const int8_t* const cfl_lookup = kCflAlphaLookup[signs];
+  const auto sign_u = static_cast<CflSign>(cfl_lookup[0]);
+  const auto sign_v = static_cast<CflSign>(cfl_lookup[1]);
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  prediction_parameters.cfl_alpha_u = 0;
+  if (sign_u != kCflSignZero) {
+    assert(cfl_lookup[2] >= 0);
+    prediction_parameters.cfl_alpha_u =
+        reader_.ReadSymbol<kCflAlphaSymbolCount>(
+            symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[2]]) +
+        1;
+    if (sign_u == kCflSignNegative) prediction_parameters.cfl_alpha_u *= -1;
+  }
+  prediction_parameters.cfl_alpha_v = 0;
+  if (sign_v != kCflSignZero) {
+    assert(cfl_lookup[3] >= 0);
+    prediction_parameters.cfl_alpha_v =
+        reader_.ReadSymbol<kCflAlphaSymbolCount>(
+            symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[3]]) +
+        1;
+    if (sign_v == kCflSignNegative) prediction_parameters.cfl_alpha_v *= -1;
+  }
+}
+
+void Tile::ReadPredictionModeUV(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  bool chroma_from_luma_allowed;
+  if (frame_header_.segmentation
+          .lossless[bp.prediction_parameters->segment_id]) {
+    chroma_from_luma_allowed = block.residual_size[kPlaneU] == kBlock4x4;
+  } else {
+    chroma_from_luma_allowed = IsBlockDimensionLessThan64(block.size);
+  }
+  uint16_t* const cdf =
+      symbol_decoder_context_
+          .uv_mode_cdf[static_cast<int>(chroma_from_luma_allowed)][bp.y_mode];
+  if (chroma_from_luma_allowed) {
+    bp.prediction_parameters->uv_mode = static_cast<PredictionMode>(
+        reader_.ReadSymbol<kIntraPredictionModesUV>(cdf));
+  } else {
+    bp.prediction_parameters->uv_mode = static_cast<PredictionMode>(
+        reader_.ReadSymbol<kIntraPredictionModesUV - 1>(cdf));
+  }
+}
+
+int Tile::ReadMotionVectorComponent(const Block& block, const int component) {
+  const int context =
+      static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
+  const bool sign = reader_.ReadSymbol(
+      symbol_decoder_context_.mv_sign_cdf[component][context]);
+  const int mv_class = reader_.ReadSymbol<kMvClassSymbolCount>(
+      symbol_decoder_context_.mv_class_cdf[component][context]);
+  int magnitude = 1;
+  int value;
+  uint16_t* fraction_cdf;
+  uint16_t* precision_cdf;
+  if (mv_class == 0) {
+    value = static_cast<int>(reader_.ReadSymbol(
+        symbol_decoder_context_.mv_class0_bit_cdf[component][context]));
+    fraction_cdf = symbol_decoder_context_
+                       .mv_class0_fraction_cdf[component][context][value];
+    precision_cdf = symbol_decoder_context_
+                        .mv_class0_high_precision_cdf[component][context];
+  } else {
+    assert(mv_class <= kMvBitSymbolCount);
+    value = 0;
+    for (int i = 0; i < mv_class; ++i) {
+      const int bit = static_cast<int>(reader_.ReadSymbol(
+          symbol_decoder_context_.mv_bit_cdf[component][context][i]));
+      value |= bit << i;
+    }
+    magnitude += 2 << (mv_class + 2);
+    fraction_cdf = symbol_decoder_context_.mv_fraction_cdf[component][context];
+    precision_cdf =
+        symbol_decoder_context_.mv_high_precision_cdf[component][context];
+  }
+  const int fraction =
+      (frame_header_.force_integer_mv == 0)
+          ? reader_.ReadSymbol<kMvFractionSymbolCount>(fraction_cdf)
+          : 3;
+  const int precision =
+      frame_header_.allow_high_precision_mv
+          ? static_cast<int>(reader_.ReadSymbol(precision_cdf))
+          : 1;
+  magnitude += (value << 3) | (fraction << 1) | precision;
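+  // |magnitude| is in units of 1/8 pel: |value| carries the integer pel part,
+  // |fraction| the half/quarter pel bits and |precision| the 1/8 pel bit.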
+  return sign ? -magnitude : magnitude;
+}
+
+void Tile::ReadMotionVector(const Block& block, int index) {
+  BlockParameters& bp = *block.bp;
+  const int context =
+      static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
+  const auto mv_joint =
+      static_cast<MvJointType>(reader_.ReadSymbol<kNumMvJointTypes>(
+          symbol_decoder_context_.mv_joint_cdf[context]));
+  if (mv_joint == kMvJointTypeHorizontalZeroVerticalNonZero ||
+      mv_joint == kMvJointTypeNonZero) {
+    bp.mv.mv[index].mv[0] = ReadMotionVectorComponent(block, 0);
+  }
+  if (mv_joint == kMvJointTypeHorizontalNonZeroVerticalZero ||
+      mv_joint == kMvJointTypeNonZero) {
+    bp.mv.mv[index].mv[1] = ReadMotionVectorComponent(block, 1);
+  }
+}
+
+void Tile::ReadFilterIntraModeInfo(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  prediction_parameters.use_filter_intra = false;
+  if (!sequence_header_.enable_filter_intra || bp.y_mode != kPredictionModeDc ||
+      bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] != 0 ||
+      !IsBlockDimensionLessThan64(block.size)) {
+    return;
+  }
+  prediction_parameters.use_filter_intra = reader_.ReadSymbol(
+      symbol_decoder_context_.use_filter_intra_cdf[block.size]);
+  if (prediction_parameters.use_filter_intra) {
+    prediction_parameters.filter_intra_mode = static_cast<FilterIntraPredictor>(
+        reader_.ReadSymbol<kNumFilterIntraPredictors>(
+            symbol_decoder_context_.filter_intra_mode_cdf));
+  }
+}
+
+bool Tile::DecodeIntraModeInfo(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  bp.skip = false;
+  if (frame_header_.segmentation.segment_id_pre_skip &&
+      !ReadIntraSegmentId(block)) {
+    return false;
+  }
+  SetCdfContextSkipMode(block, false);
+  ReadSkip(block);
+  if (!frame_header_.segmentation.segment_id_pre_skip &&
+      !ReadIntraSegmentId(block)) {
+    return false;
+  }
+  ReadCdef(block);
+  if (read_deltas_) {
+    ReadQuantizerIndexDelta(block);
+    ReadLoopFilterDelta(block);
+    read_deltas_ = false;
+  }
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  prediction_parameters.use_intra_block_copy = false;
+  if (frame_header_.allow_intrabc) {
+    prediction_parameters.use_intra_block_copy =
+        reader_.ReadSymbol(symbol_decoder_context_.intra_block_copy_cdf);
+  }
+  if (prediction_parameters.use_intra_block_copy) {
+    bp.is_inter = true;
+    bp.reference_frame[0] = kReferenceFrameIntra;
+    bp.reference_frame[1] = kReferenceFrameNone;
+    bp.y_mode = kPredictionModeDc;
+    bp.prediction_parameters->uv_mode = kPredictionModeDc;
+    SetCdfContextUVMode(block);
+    prediction_parameters.motion_mode = kMotionModeSimple;
+    prediction_parameters.compound_prediction_type =
+        kCompoundPredictionTypeAverage;
+    bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] = 0;
+    bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] = 0;
+    SetCdfContextPaletteSize(block);
+    bp.interpolation_filter[0] = kInterpolationFilterBilinear;
+    bp.interpolation_filter[1] = kInterpolationFilterBilinear;
+    MvContexts dummy_mode_contexts;
+    FindMvStack(block, /*is_compound=*/false, &dummy_mode_contexts);
+    return AssignIntraMv(block);
+  }
+  bp.is_inter = false;
+  return ReadIntraBlockModeInfo(block, /*intra_y_mode=*/true);
+}
+
+int8_t Tile::ComputePredictedSegmentId(const Block& block) const {
+  // If prev_segment_ids_ is null, treat it as if it pointed to a segmentation
+  // map containing all 0s.
+  if (prev_segment_ids_ == nullptr) return 0;
+
+  const int x_limit = std::min(frame_header_.columns4x4 - block.column4x4,
+                               static_cast<int>(block.width4x4));
+  const int y_limit = std::min(frame_header_.rows4x4 - block.row4x4,
+                               static_cast<int>(block.height4x4));
+  int8_t id = 7;
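+  // 7 is the largest possible segment id (kMaxSegments - 1). The loops below
+  // take the minimum over all 4x4 blocks covered by this block in the previous
+  // frame's segmentation map.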
+  for (int y = 0; y < y_limit; ++y) {
+    for (int x = 0; x < x_limit; ++x) {
+      const int8_t prev_segment_id =
+          prev_segment_ids_->segment_id(block.row4x4 + y, block.column4x4 + x);
+      id = std::min(id, prev_segment_id);
+    }
+  }
+  return id;
+}
+
+void Tile::SetCdfContextUsePredictedSegmentId(const Block& block,
+                                              bool use_predicted_segment_id) {
+  memset(left_context_.use_predicted_segment_id + block.left_context_index,
+         static_cast<int>(use_predicted_segment_id), block.height4x4);
+  memset(block.top_context->use_predicted_segment_id + block.top_context_index,
+         static_cast<int>(use_predicted_segment_id), block.width4x4);
+}
+
+bool Tile::ReadInterSegmentId(const Block& block, bool pre_skip) {
+  BlockParameters& bp = *block.bp;
+  if (!frame_header_.segmentation.enabled) {
+    bp.prediction_parameters->segment_id = 0;
+    return true;
+  }
+  if (!frame_header_.segmentation.update_map) {
+    bp.prediction_parameters->segment_id = ComputePredictedSegmentId(block);
+    return true;
+  }
+  if (pre_skip) {
+    if (!frame_header_.segmentation.segment_id_pre_skip) {
+      bp.prediction_parameters->segment_id = 0;
+      return true;
+    }
+  } else if (bp.skip) {
+    SetCdfContextUsePredictedSegmentId(block, false);
+    return ReadSegmentId(block);
+  }
+  if (frame_header_.segmentation.temporal_update) {
+    const int context =
+        (block.left_available[kPlaneY]
+             ? static_cast<int>(
+                   left_context_
+                       .use_predicted_segment_id[block.left_context_index])
+             : 0) +
+        (block.top_available[kPlaneY]
+             ? static_cast<int>(
+                   block.top_context
+                       ->use_predicted_segment_id[block.top_context_index])
+             : 0);
+    const bool use_predicted_segment_id = reader_.ReadSymbol(
+        symbol_decoder_context_.use_predicted_segment_id_cdf[context]);
+    SetCdfContextUsePredictedSegmentId(block, use_predicted_segment_id);
+    if (use_predicted_segment_id) {
+      bp.prediction_parameters->segment_id = ComputePredictedSegmentId(block);
+      return true;
+    }
+  }
+  return ReadSegmentId(block);
+}
+
+void Tile::ReadIsInter(const Block& block, bool skip_mode) {
+  BlockParameters& bp = *block.bp;
+  if (skip_mode) {
+    bp.is_inter = true;
+    return;
+  }
+  if (frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id,
+          kSegmentFeatureReferenceFrame)) {
+    bp.is_inter = frame_header_.segmentation
+                      .feature_data[bp.prediction_parameters->segment_id]
+                                   [kSegmentFeatureReferenceFrame] !=
+                  kReferenceFrameIntra;
+    return;
+  }
+  if (frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv)) {
+    bp.is_inter = true;
+    return;
+  }
+  int context = 0;
+  if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+    context = (block.IsTopIntra() && block.IsLeftIntra())
+                  ? 3
+                  : static_cast<int>(block.IsTopIntra() || block.IsLeftIntra());
+  } else if (block.top_available[kPlaneY] || block.left_available[kPlaneY]) {
+    context = 2 * static_cast<int>(block.top_available[kPlaneY]
+                                       ? block.IsTopIntra()
+                                       : block.IsLeftIntra());
+  }
+  bp.is_inter =
+      reader_.ReadSymbol(symbol_decoder_context_.is_inter_cdf[context]);
+}
+
+void Tile::SetCdfContextPaletteSize(const Block& block) {
+  const PaletteModeInfo& palette_mode_info =
+      block.bp->prediction_parameters->palette_mode_info;
+  for (int plane_type = kPlaneTypeY; plane_type <= kPlaneTypeUV; ++plane_type) {
+    memset(left_context_.palette_size[plane_type] + block.left_context_index,
+           palette_mode_info.size[plane_type], block.height4x4);
+    memset(
+        block.top_context->palette_size[plane_type] + block.top_context_index,
+        palette_mode_info.size[plane_type], block.width4x4);
+    if (palette_mode_info.size[plane_type] == 0) continue;
+    for (int i = block.left_context_index;
+         i < block.left_context_index + block.height4x4; ++i) {
+      memcpy(left_context_.palette_color[i][plane_type],
+             palette_mode_info.color[plane_type],
+             kMaxPaletteSize * sizeof(palette_mode_info.color[0][0]));
+    }
+    for (int i = block.top_context_index;
+         i < block.top_context_index + block.width4x4; ++i) {
+      memcpy(block.top_context->palette_color[i][plane_type],
+             palette_mode_info.color[plane_type],
+             kMaxPaletteSize * sizeof(palette_mode_info.color[0][0]));
+    }
+  }
+}
+
+void Tile::SetCdfContextUVMode(const Block& block) {
+  // BlockCdfContext.uv_mode is only used to compute is_smooth_prediction for
+  // the intra edge upsamplers in subsequent blocks, which follow special
+  // rules for subsampled UV planes: update the left context only if the
+  // current block contains the last odd column, and update the top context
+  // only if the current block contains the last odd row.
+  if (subsampling_x_[kPlaneU] == 0 || (block.column4x4 & 1) == 1 ||
+      block.width4x4 > 1) {
+    memset(left_context_.uv_mode + block.left_context_index,
+           block.bp->prediction_parameters->uv_mode, block.height4x4);
+  }
+  if (subsampling_y_[kPlaneU] == 0 || (block.row4x4 & 1) == 1 ||
+      block.height4x4 > 1) {
+    memset(block.top_context->uv_mode + block.top_context_index,
+           block.bp->prediction_parameters->uv_mode, block.width4x4);
+  }
+}
+
+bool Tile::ReadIntraBlockModeInfo(const Block& block, bool intra_y_mode) {
+  BlockParameters& bp = *block.bp;
+  bp.reference_frame[0] = kReferenceFrameIntra;
+  bp.reference_frame[1] = kReferenceFrameNone;
+  ReadPredictionModeY(block, intra_y_mode);
+  ReadIntraAngleInfo(block, kPlaneTypeY);
+  if (block.HasChroma()) {
+    ReadPredictionModeUV(block);
+    if (bp.prediction_parameters->uv_mode == kPredictionModeChromaFromLuma) {
+      ReadCflAlpha(block);
+    }
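+    // For subsampled UV planes, the uv_mode context is tracked at the odd
+    // 4x4 positions (see SetCdfContextUVMode()). The computations below round
+    // the position to the chroma-aligned row/column of the left and top
+    // neighbors.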
+    if (block.left_available[kPlaneU]) {
+      const int smooth_row =
+          block.row4x4 + (~block.row4x4 & subsampling_y_[kPlaneU]);
+      const int smooth_column =
+          block.column4x4 - 1 - (block.column4x4 & subsampling_x_[kPlaneU]);
+      const BlockParameters& bp_left =
+          *block_parameters_holder_.Find(smooth_row, smooth_column);
+      bp.prediction_parameters->chroma_left_uses_smooth_prediction =
+          (bp_left.reference_frame[0] <= kReferenceFrameIntra) &&
+          kPredictionModeSmoothMask.Contains(
+              left_context_.uv_mode[CdfContextIndex(smooth_row)]);
+    }
+    if (block.top_available[kPlaneU]) {
+      const int smooth_row =
+          block.row4x4 - 1 - (block.row4x4 & subsampling_y_[kPlaneU]);
+      const int smooth_column =
+          block.column4x4 + (~block.column4x4 & subsampling_x_[kPlaneU]);
+      const BlockParameters& bp_top =
+          *block_parameters_holder_.Find(smooth_row, smooth_column);
+      bp.prediction_parameters->chroma_top_uses_smooth_prediction =
+          (bp_top.reference_frame[0] <= kReferenceFrameIntra) &&
+          kPredictionModeSmoothMask.Contains(
+              top_context_.get()[SuperBlockColumnIndex(smooth_column)]
+                  .uv_mode[CdfContextIndex(smooth_column)]);
+    }
+    SetCdfContextUVMode(block);
+    ReadIntraAngleInfo(block, kPlaneTypeUV);
+  }
+  ReadPaletteModeInfo(block);
+  SetCdfContextPaletteSize(block);
+  ReadFilterIntraModeInfo(block);
+  return true;
+}
+
+CompoundReferenceType Tile::ReadCompoundReferenceType(const Block& block) {
+  // compound and inter.
+  const bool top_comp_inter = block.top_available[kPlaneY] &&
+                              !block.IsTopIntra() && !block.IsTopSingle();
+  const bool left_comp_inter = block.left_available[kPlaneY] &&
+                               !block.IsLeftIntra() && !block.IsLeftSingle();
+  // unidirectional compound.
+  const bool top_uni_comp =
+      top_comp_inter && IsSameDirectionReferencePair(block.TopReference(0),
+                                                     block.TopReference(1));
+  const bool left_uni_comp =
+      left_comp_inter && IsSameDirectionReferencePair(block.LeftReference(0),
+                                                      block.LeftReference(1));
+  int context;
+  if (block.top_available[kPlaneY] && !block.IsTopIntra() &&
+      block.left_available[kPlaneY] && !block.IsLeftIntra()) {
+    const int same_direction = static_cast<int>(IsSameDirectionReferencePair(
+        block.TopReference(0), block.LeftReference(0)));
+    if (!top_comp_inter && !left_comp_inter) {
+      context = 1 + MultiplyBy2(same_direction);
+    } else if (!top_comp_inter) {
+      context = left_uni_comp ? 3 + same_direction : 1;
+    } else if (!left_comp_inter) {
+      context = top_uni_comp ? 3 + same_direction : 1;
+    } else {
+      if (!top_uni_comp && !left_uni_comp) {
+        context = 0;
+      } else if (!top_uni_comp || !left_uni_comp) {
+        context = 2;
+      } else {
+        context = 3 + static_cast<int>(
+                          (block.TopReference(0) == kReferenceFrameBackward) ==
+                          (block.LeftReference(0) == kReferenceFrameBackward));
+      }
+    }
+  } else if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+    if (top_comp_inter) {
+      context = 1 + MultiplyBy2(static_cast<int>(top_uni_comp));
+    } else if (left_comp_inter) {
+      context = 1 + MultiplyBy2(static_cast<int>(left_uni_comp));
+    } else {
+      context = 2;
+    }
+  } else if (top_comp_inter) {
+    context = MultiplyBy4(static_cast<int>(top_uni_comp));
+  } else if (left_comp_inter) {
+    context = MultiplyBy4(static_cast<int>(left_uni_comp));
+  } else {
+    context = 2;
+  }
+  return static_cast<CompoundReferenceType>(reader_.ReadSymbol(
+      symbol_decoder_context_.compound_reference_type_cdf[context]));
+}
+
+template <bool is_single, bool is_backward, int index>
+uint16_t* Tile::GetReferenceCdf(
+    const Block& block,
+    CompoundReferenceType type /*= kNumCompoundReferenceTypes*/) {
+  int context = 0;
+  if ((type == kCompoundReferenceUnidirectional && index == 0) ||
+      (is_single && index == 1)) {
+    // uni_comp_ref and single_ref_p1.
+    context =
+        GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameGolden,
+                            kReferenceFrameBackward, kReferenceFrameAlternate);
+  } else if (type == kCompoundReferenceUnidirectional && index == 1) {
+    // uni_comp_ref_p1.
+    context =
+        GetReferenceContext(block, kReferenceFrameLast2, kReferenceFrameLast2,
+                            kReferenceFrameLast3, kReferenceFrameGolden);
+  } else if ((type == kCompoundReferenceUnidirectional && index == 2) ||
+             (type == kCompoundReferenceBidirectional && index == 2) ||
+             (is_single && index == 5)) {
+    // uni_comp_ref_p2, comp_ref_p2 and single_ref_p5.
+    context =
+        GetReferenceContext(block, kReferenceFrameLast3, kReferenceFrameLast3,
+                            kReferenceFrameGolden, kReferenceFrameGolden);
+  } else if ((type == kCompoundReferenceBidirectional && index == 0) ||
+             (is_single && index == 3)) {
+    // comp_ref and single_ref_p3.
+    context =
+        GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast2,
+                            kReferenceFrameLast3, kReferenceFrameGolden);
+  } else if ((type == kCompoundReferenceBidirectional && index == 1) ||
+             (is_single && index == 4)) {
+    // comp_ref_p1 and single_ref_p4.
+    context =
+        GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast,
+                            kReferenceFrameLast2, kReferenceFrameLast2);
+  } else if ((is_single && index == 2) || (is_backward && index == 0)) {
+    // single_ref_p2 and comp_bwdref.
+    context = GetReferenceContext(
+        block, kReferenceFrameBackward, kReferenceFrameAlternate2,
+        kReferenceFrameAlternate, kReferenceFrameAlternate);
+  } else if ((is_single && index == 6) || (is_backward && index == 1)) {
+    // single_ref_p6 and comp_bwdref_p1.
+    context = GetReferenceContext(
+        block, kReferenceFrameBackward, kReferenceFrameBackward,
+        kReferenceFrameAlternate2, kReferenceFrameAlternate2);
+  }
+  // When using GCC 12.x for some targets the compiler reports a false positive
+  // with the context subscript when is_single=false, is_backward=false and
+  // index=0. GetReferenceContext() can only return values between 0 and 2.
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+  assert(context >= 0 && context <= 2);
+  if (is_single) {
+    // The index parameter for single references is offset by one since the
+    // spec uses a 1-based index for these elements.
+    return symbol_decoder_context_.single_reference_cdf[context][index - 1];
+  }
+  if (is_backward) {
+    return symbol_decoder_context_
+        .compound_backward_reference_cdf[context][index];
+  }
+  return symbol_decoder_context_.compound_reference_cdf[type][context][index];
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+}
+
+void Tile::ReadReferenceFrames(const Block& block, bool skip_mode) {
+  BlockParameters& bp = *block.bp;
+  if (skip_mode) {
+    bp.reference_frame[0] = frame_header_.skip_mode_frame[0];
+    bp.reference_frame[1] = frame_header_.skip_mode_frame[1];
+    return;
+  }
+  if (frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id,
+          kSegmentFeatureReferenceFrame)) {
+    bp.reference_frame[0] = static_cast<ReferenceFrameType>(
+        frame_header_.segmentation
+            .feature_data[bp.prediction_parameters->segment_id]
+                         [kSegmentFeatureReferenceFrame]);
+    bp.reference_frame[1] = kReferenceFrameNone;
+    return;
+  }
+  if (frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureSkip) ||
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv)) {
+    bp.reference_frame[0] = kReferenceFrameLast;
+    bp.reference_frame[1] = kReferenceFrameNone;
+    return;
+  }
+  const bool use_compound_reference =
+      frame_header_.reference_mode_select &&
+      std::min(block.width4x4, block.height4x4) >= 2 &&
+      reader_.ReadSymbol(symbol_decoder_context_.use_compound_reference_cdf
+                             [GetUseCompoundReferenceContext(block)]);
+  if (use_compound_reference) {
+    CompoundReferenceType reference_type = ReadCompoundReferenceType(block);
+    if (reference_type == kCompoundReferenceUnidirectional) {
+      // uni_comp_ref.
+      if (reader_.ReadSymbol(
+              GetReferenceCdf<false, false, 0>(block, reference_type))) {
+        bp.reference_frame[0] = kReferenceFrameBackward;
+        bp.reference_frame[1] = kReferenceFrameAlternate;
+        return;
+      }
+      // uni_comp_ref_p1.
+      if (!reader_.ReadSymbol(
+              GetReferenceCdf<false, false, 1>(block, reference_type))) {
+        bp.reference_frame[0] = kReferenceFrameLast;
+        bp.reference_frame[1] = kReferenceFrameLast2;
+        return;
+      }
+      // uni_comp_ref_p2.
+      if (reader_.ReadSymbol(
+              GetReferenceCdf<false, false, 2>(block, reference_type))) {
+        bp.reference_frame[0] = kReferenceFrameLast;
+        bp.reference_frame[1] = kReferenceFrameGolden;
+        return;
+      }
+      bp.reference_frame[0] = kReferenceFrameLast;
+      bp.reference_frame[1] = kReferenceFrameLast3;
+      return;
+    }
+    assert(reference_type == kCompoundReferenceBidirectional);
+    // comp_ref.
+    if (reader_.ReadSymbol(
+            GetReferenceCdf<false, false, 0>(block, reference_type))) {
+      // comp_ref_p2.
+      bp.reference_frame[0] =
+          reader_.ReadSymbol(
+              GetReferenceCdf<false, false, 2>(block, reference_type))
+              ? kReferenceFrameGolden
+              : kReferenceFrameLast3;
+    } else {
+      // comp_ref_p1.
+      bp.reference_frame[0] =
+          reader_.ReadSymbol(
+              GetReferenceCdf<false, false, 1>(block, reference_type))
+              ? kReferenceFrameLast2
+              : kReferenceFrameLast;
+    }
+    // comp_bwdref.
+    if (reader_.ReadSymbol(GetReferenceCdf<false, true, 0>(block))) {
+      bp.reference_frame[1] = kReferenceFrameAlternate;
+    } else {
+      // comp_bwdref_p1.
+      bp.reference_frame[1] =
+          reader_.ReadSymbol(GetReferenceCdf<false, true, 1>(block))
+              ? kReferenceFrameAlternate2
+              : kReferenceFrameBackward;
+    }
+    return;
+  }
+  assert(!use_compound_reference);
+  bp.reference_frame[1] = kReferenceFrameNone;
+  // single_ref_p1.
+  if (reader_.ReadSymbol(GetReferenceCdf<true, false, 1>(block))) {
+    // single_ref_p2.
+    if (reader_.ReadSymbol(GetReferenceCdf<true, false, 2>(block))) {
+      bp.reference_frame[0] = kReferenceFrameAlternate;
+      return;
+    }
+    // single_ref_p6.
+    bp.reference_frame[0] =
+        reader_.ReadSymbol(GetReferenceCdf<true, false, 6>(block))
+            ? kReferenceFrameAlternate2
+            : kReferenceFrameBackward;
+    return;
+  }
+  // single_ref_p3.
+  if (reader_.ReadSymbol(GetReferenceCdf<true, false, 3>(block))) {
+    // single_ref_p5.
+    bp.reference_frame[0] =
+        reader_.ReadSymbol(GetReferenceCdf<true, false, 5>(block))
+            ? kReferenceFrameGolden
+            : kReferenceFrameLast3;
+    return;
+  }
+  // single_ref_p4.
+  bp.reference_frame[0] =
+      reader_.ReadSymbol(GetReferenceCdf<true, false, 4>(block))
+          ? kReferenceFrameLast2
+          : kReferenceFrameLast;
+}
+
+void Tile::ReadInterPredictionModeY(const Block& block,
+                                    const MvContexts& mode_contexts,
+                                    bool skip_mode) {
+  BlockParameters& bp = *block.bp;
+  if (skip_mode) {
+    bp.y_mode = kPredictionModeNearestNearestMv;
+    return;
+  }
+  if (frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureSkip) ||
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv)) {
+    bp.y_mode = kPredictionModeGlobalMv;
+    return;
+  }
+  if (bp.reference_frame[1] > kReferenceFrameIntra) {
+    const int idx0 = mode_contexts.reference_mv >> 1;
+    const int idx1 =
+        std::min(mode_contexts.new_mv, kCompoundModeNewMvContexts - 1);
+    const int context = kCompoundModeContextMap[idx0][idx1];
+    const int offset = reader_.ReadSymbol<kNumCompoundInterPredictionModes>(
+        symbol_decoder_context_.compound_prediction_mode_cdf[context]);
+    bp.y_mode =
+        static_cast<PredictionMode>(kPredictionModeNearestNearestMv + offset);
+    return;
+  }
+  // new_mv.
+  if (!reader_.ReadSymbol(
+          symbol_decoder_context_.new_mv_cdf[mode_contexts.new_mv])) {
+    bp.y_mode = kPredictionModeNewMv;
+    return;
+  }
+  // zero_mv.
+  if (!reader_.ReadSymbol(
+          symbol_decoder_context_.zero_mv_cdf[mode_contexts.zero_mv])) {
+    bp.y_mode = kPredictionModeGlobalMv;
+    return;
+  }
+  // ref_mv.
+  bp.y_mode =
+      reader_.ReadSymbol(
+          symbol_decoder_context_.reference_mv_cdf[mode_contexts.reference_mv])
+          ? kPredictionModeNearMv
+          : kPredictionModeNearestMv;
+}
+
+void Tile::ReadRefMvIndex(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  prediction_parameters.ref_mv_index = 0;
+  if (bp.y_mode != kPredictionModeNewMv &&
+      bp.y_mode != kPredictionModeNewNewMv &&
+      !kPredictionModeHasNearMvMask.Contains(bp.y_mode)) {
+    return;
+  }
+  const int start =
+      static_cast<int>(kPredictionModeHasNearMvMask.Contains(bp.y_mode));
+  prediction_parameters.ref_mv_index = start;
+  for (int i = start; i < start + 2; ++i) {
+    if (prediction_parameters.ref_mv_count <= i + 1) break;
+    // drl_mode in the spec.
+    const bool ref_mv_index_bit = reader_.ReadSymbol(
+        symbol_decoder_context_.ref_mv_index_cdf[GetRefMvIndexContext(
+            prediction_parameters.nearest_mv_count, i)]);
+    prediction_parameters.ref_mv_index = i + static_cast<int>(ref_mv_index_bit);
+    if (!ref_mv_index_bit) return;
+  }
+}
+
+void Tile::ReadInterIntraMode(const Block& block, bool is_compound,
+                              bool skip_mode) {
+  BlockParameters& bp = *block.bp;
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  prediction_parameters.inter_intra_mode = kNumInterIntraModes;
+  prediction_parameters.is_wedge_inter_intra = false;
+  if (skip_mode || !sequence_header_.enable_interintra_compound ||
+      is_compound || !kIsInterIntraModeAllowedMask.Contains(block.size)) {
+    return;
+  }
+  // kSizeGroup[block.size] is guaranteed to be non-zero because of the block
+  // size constraint enforced in the above condition.
+  assert(kSizeGroup[block.size] - 1 >= 0);
+  if (!reader_.ReadSymbol(
+          symbol_decoder_context_
+              .is_inter_intra_cdf[kSizeGroup[block.size] - 1])) {
+    prediction_parameters.inter_intra_mode = kNumInterIntraModes;
+    return;
+  }
+  prediction_parameters.inter_intra_mode =
+      static_cast<InterIntraMode>(reader_.ReadSymbol<kNumInterIntraModes>(
+          symbol_decoder_context_
+              .inter_intra_mode_cdf[kSizeGroup[block.size] - 1]));
+  bp.reference_frame[1] = kReferenceFrameIntra;
+  prediction_parameters.angle_delta[kPlaneTypeY] = 0;
+  prediction_parameters.angle_delta[kPlaneTypeUV] = 0;
+  prediction_parameters.use_filter_intra = false;
+  prediction_parameters.is_wedge_inter_intra = reader_.ReadSymbol(
+      symbol_decoder_context_.is_wedge_inter_intra_cdf[block.size]);
+  if (!prediction_parameters.is_wedge_inter_intra) return;
+  prediction_parameters.wedge_index =
+      reader_.ReadSymbol<kWedgeIndexSymbolCount>(
+          symbol_decoder_context_.wedge_index_cdf[block.size]);
+  prediction_parameters.wedge_sign = 0;
+}
+
+void Tile::ReadMotionMode(const Block& block, bool is_compound,
+                          bool skip_mode) {
+  BlockParameters& bp = *block.bp;
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  const auto global_motion_type =
+      frame_header_.global_motion[bp.reference_frame[0]].type;
+  if (skip_mode || !frame_header_.is_motion_mode_switchable ||
+      IsBlockDimension4(block.size) ||
+      (frame_header_.force_integer_mv == 0 &&
+       (bp.y_mode == kPredictionModeGlobalMv ||
+        bp.y_mode == kPredictionModeGlobalGlobalMv) &&
+       global_motion_type > kGlobalMotionTransformationTypeTranslation) ||
+      is_compound || bp.reference_frame[1] == kReferenceFrameIntra ||
+      !block.HasOverlappableCandidates()) {
+    prediction_parameters.motion_mode = kMotionModeSimple;
+    return;
+  }
+  prediction_parameters.num_warp_samples = 0;
+  int num_samples_scanned = 0;
+  memset(prediction_parameters.warp_estimate_candidates, 0,
+         sizeof(prediction_parameters.warp_estimate_candidates));
+  FindWarpSamples(block, &prediction_parameters.num_warp_samples,
+                  &num_samples_scanned,
+                  prediction_parameters.warp_estimate_candidates);
+  if (frame_header_.force_integer_mv != 0 ||
+      prediction_parameters.num_warp_samples == 0 ||
+      !frame_header_.allow_warped_motion || IsScaled(bp.reference_frame[0])) {
+    prediction_parameters.motion_mode =
+        reader_.ReadSymbol(symbol_decoder_context_.use_obmc_cdf[block.size])
+            ? kMotionModeObmc
+            : kMotionModeSimple;
+    return;
+  }
+  prediction_parameters.motion_mode =
+      static_cast<MotionMode>(reader_.ReadSymbol<kNumMotionModes>(
+          symbol_decoder_context_.motion_mode_cdf[block.size]));
+}
+
+uint16_t* Tile::GetIsExplicitCompoundTypeCdf(const Block& block) {
+  int context = 0;
+  if (block.top_available[kPlaneY]) {
+    if (!block.IsTopSingle()) {
+      context += static_cast<int>(
+          block.top_context
+              ->is_explicit_compound_type[block.top_context_index]);
+    } else if (block.TopReference(0) == kReferenceFrameAlternate) {
+      context += 3;
+    }
+  }
+  if (block.left_available[kPlaneY]) {
+    if (!block.IsLeftSingle()) {
+      context += static_cast<int>(
+          left_context_.is_explicit_compound_type[block.left_context_index]);
+    } else if (block.LeftReference(0) == kReferenceFrameAlternate) {
+      context += 3;
+    }
+  }
+  return symbol_decoder_context_.is_explicit_compound_type_cdf[std::min(
+      context, kIsExplicitCompoundTypeContexts - 1)];
+}
+
+uint16_t* Tile::GetIsCompoundTypeAverageCdf(const Block& block) {
+  const BlockParameters& bp = *block.bp;
+  const ReferenceInfo& reference_info = *current_frame_.reference_info();
+  const int forward =
+      std::abs(reference_info.relative_distance_from[bp.reference_frame[0]]);
+  const int backward =
+      std::abs(reference_info.relative_distance_from[bp.reference_frame[1]]);
+  int context = (forward == backward) ? 3 : 0;
+  if (block.top_available[kPlaneY]) {
+    if (!block.IsTopSingle()) {
+      context += static_cast<int>(
+          block.top_context->is_compound_type_average[block.top_context_index]);
+    } else if (block.TopReference(0) == kReferenceFrameAlternate) {
+      ++context;
+    }
+  }
+  if (block.left_available[kPlaneY]) {
+    if (!block.IsLeftSingle()) {
+      context += static_cast<int>(
+          left_context_.is_compound_type_average[block.left_context_index]);
+    } else if (block.LeftReference(0) == kReferenceFrameAlternate) {
+      ++context;
+    }
+  }
+  return symbol_decoder_context_.is_compound_type_average_cdf[context];
+}
+
+void Tile::ReadCompoundType(const Block& block, bool is_compound,
+                            bool skip_mode,
+                            bool* const is_explicit_compound_type,
+                            bool* const is_compound_type_average) {
+  *is_explicit_compound_type = false;
+  *is_compound_type_average = true;
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  if (skip_mode) {
+    prediction_parameters.compound_prediction_type =
+        kCompoundPredictionTypeAverage;
+    return;
+  }
+  if (is_compound) {
+    if (sequence_header_.enable_masked_compound) {
+      *is_explicit_compound_type =
+          reader_.ReadSymbol(GetIsExplicitCompoundTypeCdf(block));
+    }
+    if (*is_explicit_compound_type) {
+      if (kIsWedgeCompoundModeAllowed.Contains(block.size)) {
+        // Only kCompoundPredictionTypeWedge and
+        // kCompoundPredictionTypeDiffWeighted are signaled explicitly.
+        prediction_parameters.compound_prediction_type =
+            static_cast<CompoundPredictionType>(reader_.ReadSymbol(
+                symbol_decoder_context_.compound_type_cdf[block.size]));
+      } else {
+        prediction_parameters.compound_prediction_type =
+            kCompoundPredictionTypeDiffWeighted;
+      }
+    } else {
+      if (sequence_header_.enable_jnt_comp) {
+        *is_compound_type_average =
+            reader_.ReadSymbol(GetIsCompoundTypeAverageCdf(block));
+        prediction_parameters.compound_prediction_type =
+            *is_compound_type_average ? kCompoundPredictionTypeAverage
+                                      : kCompoundPredictionTypeDistance;
+      } else {
+        prediction_parameters.compound_prediction_type =
+            kCompoundPredictionTypeAverage;
+        return;
+      }
+    }
+    if (prediction_parameters.compound_prediction_type ==
+        kCompoundPredictionTypeWedge) {
+      prediction_parameters.wedge_index =
+          reader_.ReadSymbol<kWedgeIndexSymbolCount>(
+              symbol_decoder_context_.wedge_index_cdf[block.size]);
+      prediction_parameters.wedge_sign = static_cast<int>(reader_.ReadBit());
+    } else if (prediction_parameters.compound_prediction_type ==
+               kCompoundPredictionTypeDiffWeighted) {
+      prediction_parameters.mask_is_inverse = reader_.ReadBit() != 0;
+    }
+    return;
+  }
+  if (prediction_parameters.inter_intra_mode != kNumInterIntraModes) {
+    prediction_parameters.compound_prediction_type =
+        prediction_parameters.is_wedge_inter_intra
+            ? kCompoundPredictionTypeWedge
+            : kCompoundPredictionTypeIntra;
+    return;
+  }
+  prediction_parameters.compound_prediction_type =
+      kCompoundPredictionTypeAverage;
+}
+
+uint16_t* Tile::GetInterpolationFilterCdf(const Block& block, int direction) {
+  const BlockParameters& bp = *block.bp;
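+  // As computed below, context = 8 * direction + 4 * (the block is compound)
+  // + the neighbors' shared filter type, with
+  // kNumExplicitInterpolationFilters standing in for "no matching neighbor"
+  // or for disagreeing top/left types.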
+  int context = MultiplyBy8(direction) +
+                MultiplyBy4(static_cast<int>(bp.reference_frame[1] >
+                                             kReferenceFrameIntra));
+  int top_type = kNumExplicitInterpolationFilters;
+  if (block.top_available[kPlaneY]) {
+    if (block.bp_top->reference_frame[0] == bp.reference_frame[0] ||
+        block.bp_top->reference_frame[1] == bp.reference_frame[0]) {
+      top_type = block.bp_top->interpolation_filter[direction];
+    }
+  }
+  int left_type = kNumExplicitInterpolationFilters;
+  if (block.left_available[kPlaneY]) {
+    if (block.bp_left->reference_frame[0] == bp.reference_frame[0] ||
+        block.bp_left->reference_frame[1] == bp.reference_frame[0]) {
+      left_type = block.bp_left->interpolation_filter[direction];
+    }
+  }
+  if (left_type == top_type) {
+    context += left_type;
+  } else if (left_type == kNumExplicitInterpolationFilters) {
+    context += top_type;
+  } else if (top_type == kNumExplicitInterpolationFilters) {
+    context += left_type;
+  } else {
+    context += kNumExplicitInterpolationFilters;
+  }
+  return symbol_decoder_context_.interpolation_filter_cdf[context];
+}
+
+void Tile::ReadInterpolationFilter(const Block& block, bool skip_mode) {
+  BlockParameters& bp = *block.bp;
+  if (frame_header_.interpolation_filter != kInterpolationFilterSwitchable) {
+    static_assert(
+        sizeof(bp.interpolation_filter) / sizeof(bp.interpolation_filter[0]) ==
+            2,
+        "Interpolation filter array size is not 2");
+    for (auto& interpolation_filter : bp.interpolation_filter) {
+      interpolation_filter = frame_header_.interpolation_filter;
+    }
+    return;
+  }
+  bool interpolation_filter_present = true;
+  if (skip_mode ||
+      block.bp->prediction_parameters->motion_mode == kMotionModeLocalWarp) {
+    interpolation_filter_present = false;
+  } else if (!IsBlockDimension4(block.size) &&
+             bp.y_mode == kPredictionModeGlobalMv) {
+    interpolation_filter_present =
+        frame_header_.global_motion[bp.reference_frame[0]].type ==
+        kGlobalMotionTransformationTypeTranslation;
+  } else if (!IsBlockDimension4(block.size) &&
+             bp.y_mode == kPredictionModeGlobalGlobalMv) {
+    interpolation_filter_present =
+        frame_header_.global_motion[bp.reference_frame[0]].type ==
+            kGlobalMotionTransformationTypeTranslation ||
+        frame_header_.global_motion[bp.reference_frame[1]].type ==
+            kGlobalMotionTransformationTypeTranslation;
+  }
+  for (int i = 0; i < (sequence_header_.enable_dual_filter ? 2 : 1); ++i) {
+    bp.interpolation_filter[i] =
+        interpolation_filter_present
+            ? static_cast<InterpolationFilter>(
+                  reader_.ReadSymbol<kNumExplicitInterpolationFilters>(
+                      GetInterpolationFilterCdf(block, i)))
+            : kInterpolationFilterEightTap;
+  }
+  if (!sequence_header_.enable_dual_filter) {
+    bp.interpolation_filter[1] = bp.interpolation_filter[0];
+  }
+}
+
+void Tile::SetCdfContextCompoundType(const Block& block,
+                                     bool is_explicit_compound_type,
+                                     bool is_compound_type_average) {
+  memset(left_context_.is_explicit_compound_type + block.left_context_index,
+         static_cast<int>(is_explicit_compound_type), block.height4x4);
+  memset(left_context_.is_compound_type_average + block.left_context_index,
+         static_cast<int>(is_compound_type_average), block.height4x4);
+  memset(block.top_context->is_explicit_compound_type + block.top_context_index,
+         static_cast<int>(is_explicit_compound_type), block.width4x4);
+  memset(block.top_context->is_compound_type_average + block.top_context_index,
+         static_cast<int>(is_compound_type_average), block.width4x4);
+}
+
+bool Tile::ReadInterBlockModeInfo(const Block& block, bool skip_mode) {
+  BlockParameters& bp = *block.bp;
+  bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] = 0;
+  bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] = 0;
+  SetCdfContextPaletteSize(block);
+  ReadReferenceFrames(block, skip_mode);
+  const bool is_compound = bp.reference_frame[1] > kReferenceFrameIntra;
+  MvContexts mode_contexts;
+  FindMvStack(block, is_compound, &mode_contexts);
+  ReadInterPredictionModeY(block, mode_contexts, skip_mode);
+  ReadRefMvIndex(block);
+  if (!AssignInterMv(block, is_compound)) return false;
+  ReadInterIntraMode(block, is_compound, skip_mode);
+  ReadMotionMode(block, is_compound, skip_mode);
+  bool is_explicit_compound_type;
+  bool is_compound_type_average;
+  ReadCompoundType(block, is_compound, skip_mode, &is_explicit_compound_type,
+                   &is_compound_type_average);
+  SetCdfContextCompoundType(block, is_explicit_compound_type,
+                            is_compound_type_average);
+  ReadInterpolationFilter(block, skip_mode);
+  return true;
+}
+
+void Tile::SetCdfContextSkipMode(const Block& block, bool skip_mode) {
+  memset(left_context_.skip_mode + block.left_context_index,
+         static_cast<int>(skip_mode), block.height4x4);
+  memset(block.top_context->skip_mode + block.top_context_index,
+         static_cast<int>(skip_mode), block.width4x4);
+}
+
+bool Tile::DecodeInterModeInfo(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  block.bp->prediction_parameters->use_intra_block_copy = false;
+  bp.skip = false;
+  if (!ReadInterSegmentId(block, /*pre_skip=*/true)) return false;
+  bool skip_mode = ReadSkipMode(block);
+  SetCdfContextSkipMode(block, skip_mode);
+  if (skip_mode) {
+    bp.skip = true;
+  } else {
+    ReadSkip(block);
+  }
+  if (!frame_header_.segmentation.segment_id_pre_skip &&
+      !ReadInterSegmentId(block, /*pre_skip=*/false)) {
+    return false;
+  }
+  ReadCdef(block);
+  if (read_deltas_) {
+    ReadQuantizerIndexDelta(block);
+    ReadLoopFilterDelta(block);
+    read_deltas_ = false;
+  }
+  ReadIsInter(block, skip_mode);
+  return bp.is_inter ? ReadInterBlockModeInfo(block, skip_mode)
+                     : ReadIntraBlockModeInfo(block, /*intra_y_mode=*/false);
+}
+
+bool Tile::DecodeModeInfo(const Block& block) {
+  return IsIntraFrame(frame_header_.frame_type) ? DecodeIntraModeInfo(block)
+                                                : DecodeInterModeInfo(block);
+}
+
+}  // namespace libgav1
diff --git a/src/tile/bitstream/palette.cc b/src/tile/bitstream/palette.cc
new file mode 100644 (file)
index 0000000..27e5110
--- /dev/null
@@ -0,0 +1,329 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <memory>
+
+#include "src/obu_parser.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+int Tile::GetPaletteCache(const Block& block, PlaneType plane_type,
+                          uint16_t* const cache) {
+  const int top_size =
+      (block.top_available[kPlaneY] && Mod64(MultiplyBy4(block.row4x4)) != 0)
+          ? block.top_context->palette_size[plane_type][block.top_context_index]
+          : 0;
+  const int left_size =
+      block.left_available[kPlaneY]
+          ? left_context_.palette_size[plane_type][block.left_context_index]
+          : 0;
+  if (left_size == 0 && top_size == 0) return 0;
+  // Merge the left and top colors in sorted order and store them in |cache|.
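+  // For example, with top colors {20, 50} and left colors {20, 90}, the merge
+  // yields {20, 20, 50, 90} and the std::unique() pass below returns 3 unique
+  // colors: {20, 50, 90}.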
+  uint16_t empty_palette[1];
+  const uint16_t* top =
+      (top_size > 0) ? block.top_context
+                           ->palette_color[block.top_context_index][plane_type]
+                     : empty_palette;
+  const uint16_t* left =
+      (left_size > 0)
+          ? left_context_.palette_color[block.left_context_index][plane_type]
+          : empty_palette;
+  std::merge(top, top + top_size, left, left + left_size, cache);
+  // Deduplicate the entries in |cache| and return the number of unique
+  // entries.
+  return static_cast<int>(
+      std::distance(cache, std::unique(cache, cache + left_size + top_size)));
+}
+
+void Tile::ReadPaletteColors(const Block& block, Plane plane) {
+  const PlaneType plane_type = GetPlaneType(plane);
+  uint16_t cache[2 * kMaxPaletteSize];
+  const int n = GetPaletteCache(block, plane_type, cache);
+  BlockParameters& bp = *block.bp;
+  const uint8_t palette_size =
+      bp.prediction_parameters->palette_mode_info.size[plane_type];
+  uint16_t* const palette_color =
+      bp.prediction_parameters->palette_mode_info.color[plane];
+  const int8_t bitdepth = sequence_header_.color_config.bitdepth;
+  int index = 0;
+  for (int i = 0; i < n && index < palette_size; ++i) {
+    if (reader_.ReadBit() != 0) {  // use_palette_color_cache.
+      palette_color[index++] = cache[i];
+    }
+  }
+  const int merge_pivot = index;
+  if (index < palette_size) {
+    palette_color[index++] =
+        static_cast<uint16_t>(reader_.ReadLiteral(bitdepth));
+  }
+  const int max_value = (1 << bitdepth) - 1;
+  if (index < palette_size) {
+    int bits = bitdepth - 3 + static_cast<int>(reader_.ReadLiteral(2));
+    do {
+      const int delta = static_cast<int>(reader_.ReadLiteral(bits)) +
+                        (plane_type == kPlaneTypeY ? 1 : 0);
+      palette_color[index] =
+          std::min(palette_color[index - 1] + delta, max_value);
+      if (palette_color[index] + (plane_type == kPlaneTypeY ? 1 : 0) >=
+          max_value) {
+        // Once the color reaches the top of its range, all remaining entries
+        // can be set to max_value (each is computed as a delta on top of the
+        // previous color and then clipped).
+        Memset(&palette_color[index + 1], max_value, palette_size - index - 1);
+        break;
+      }
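+      // The colors are ascending, so each remaining color lies in the
+      // shrinking range above the current one; narrow the delta bits
+      // accordingly.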
+      const int range = (1 << bitdepth) - palette_color[index] -
+                        (plane_type == kPlaneTypeY ? 1 : 0);
+      bits = std::min(bits, CeilLog2(range));
+    } while (++index < palette_size);
+  }
+  // Palette colors are generated using two ascending arrays. So sorting them is
+  // simply a matter of merging the two sorted portions of the array.
+  std::inplace_merge(palette_color, palette_color + merge_pivot,
+                     palette_color + palette_size);
+  if (plane_type == kPlaneTypeUV) {
+    uint16_t* const palette_color_v =
+        bp.prediction_parameters->palette_mode_info.color[kPlaneV];
+    if (reader_.ReadBit() != 0) {  // delta_encode_palette_colors_v.
+      const int bits = bitdepth - 4 + static_cast<int>(reader_.ReadLiteral(2));
+      palette_color_v[0] = reader_.ReadLiteral(bitdepth);
+      for (int i = 1; i < palette_size; ++i) {
+        int delta = static_cast<int>(reader_.ReadLiteral(bits));
+        if (delta != 0 && reader_.ReadBit() != 0) delta = -delta;
+        // This line is equivalent to the following lines in the spec:
+        // val = palette_colors_v[ idx - 1 ] + palette_delta_v
+        // if ( val < 0 ) val += maxVal
+        // if ( val >= maxVal ) val -= maxVal
+        // palette_colors_v[ idx ] = Clip1( val )
+        //
+        // The difference is that in the code, max_value is (1 << bitdepth) - 1.
+        // So "& max_value" has the desired effect of computing both the "if"
+        // conditions and the Clip.
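+        //
+        // For example, with bitdepth 8 (max_value = 255, maxVal = 256):
+        // 250 + 10 = 260 and 260 & 255 = 4, matching val -= maxVal; and
+        // 3 + (-10) = -7, where -7 & 255 = 249 in two's complement, matching
+        // val += maxVal.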
+        palette_color_v[i] = (palette_color_v[i - 1] + delta) & max_value;
+      }
+    } else {
+      for (int i = 0; i < palette_size; ++i) {
+        palette_color_v[i] =
+            static_cast<uint16_t>(reader_.ReadLiteral(bitdepth));
+      }
+    }
+  }
+}
+
+void Tile::ReadPaletteModeInfo(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] = 0;
+  bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] = 0;
+  if (IsBlockSmallerThan8x8(block.size) || block.size > kBlock64x64 ||
+      !frame_header_.allow_screen_content_tools) {
+    return;
+  }
+  const int block_size_context =
+      k4x4WidthLog2[block.size] + k4x4HeightLog2[block.size] - 2;
+  if (bp.y_mode == kPredictionModeDc) {
+    const int context =
+        static_cast<int>(
+            block.top_available[kPlaneY] &&
+            block.top_context
+                    ->palette_size[kPlaneTypeY][block.top_context_index] > 0) +
+        static_cast<int>(
+            block.left_available[kPlaneY] &&
+            left_context_.palette_size[kPlaneTypeY][block.left_context_index] >
+                0);
+    const bool has_palette_y = reader_.ReadSymbol(
+        symbol_decoder_context_.has_palette_y_cdf[block_size_context][context]);
+    if (has_palette_y) {
+      bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] =
+          kMinPaletteSize +
+          reader_.ReadSymbol<kPaletteSizeSymbolCount>(
+              symbol_decoder_context_.palette_y_size_cdf[block_size_context]);
+      ReadPaletteColors(block, kPlaneY);
+    }
+  }
+  if (block.HasChroma() &&
+      bp.prediction_parameters->uv_mode == kPredictionModeDc) {
+    const int context = static_cast<int>(
+        bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] > 0);
+    const bool has_palette_uv =
+        reader_.ReadSymbol(symbol_decoder_context_.has_palette_uv_cdf[context]);
+    if (has_palette_uv) {
+      bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] =
+          kMinPaletteSize +
+          reader_.ReadSymbol<kPaletteSizeSymbolCount>(
+              symbol_decoder_context_.palette_uv_size_cdf[block_size_context]);
+      ReadPaletteColors(block, kPlaneU);
+    }
+  }
+}
+
+void Tile::PopulatePaletteColorContexts(
+    const Block& block, PlaneType plane_type, int i, int start, int end,
+    uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize],
+    uint8_t color_context[kMaxPaletteSquare]) {
+  const PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
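+  // color_context summarizes how the top, left and top-left color indices
+  // relate: 4 when all three match, 3 when only top and left match, 2 when
+  // exactly one of them matches the top-left, 1 when all three differ, and 0
+  // along the block border where a single neighbor exists.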
+  for (int column = start, counter = 0; column >= end; --column, ++counter) {
+    const int row = i - column;
+    assert(row > 0 || column > 0);
+    const uint8_t top =
+        (row > 0)
+            ? prediction_parameters.color_index_map[plane_type][row - 1][column]
+            : 0;
+    const uint8_t left =
+        (column > 0)
+            ? prediction_parameters.color_index_map[plane_type][row][column - 1]
+            : 0;
+    uint8_t index_mask;
+    static_assert(kMaxPaletteSize <= 8, "");
+    int index;
+    if (column <= 0) {
+      color_context[counter] = 0;
+      color_order[counter][0] = top;
+      index_mask = 1 << top;
+      index = 1;
+    } else if (row <= 0) {
+      color_context[counter] = 0;
+      color_order[counter][0] = left;
+      index_mask = 1 << left;
+      index = 1;
+    } else {
+      const uint8_t top_left =
+          prediction_parameters
+              .color_index_map[plane_type][row - 1][column - 1];
+      index_mask = (1 << top) | (1 << left) | (1 << top_left);
+      if (top == left && top == top_left) {
+        color_context[counter] = 4;
+        color_order[counter][0] = top;
+        index = 1;
+      } else if (top == left) {
+        color_context[counter] = 3;
+        color_order[counter][0] = top;
+        color_order[counter][1] = top_left;
+        index = 2;
+      } else if (top == top_left) {
+        color_context[counter] = 2;
+        color_order[counter][0] = top_left;
+        color_order[counter][1] = left;
+        index = 2;
+      } else if (left == top_left) {
+        color_context[counter] = 2;
+        color_order[counter][0] = top_left;
+        color_order[counter][1] = top;
+        index = 2;
+      } else {
+        color_context[counter] = 1;
+        color_order[counter][0] = std::min(top, left);
+        color_order[counter][1] = std::max(top, left);
+        color_order[counter][2] = top_left;
+        index = 3;
+      }
+    }
+    // Even though only the first |palette_size| entries of this array are ever
+    // used, it is faster to populate all 8 because of the vectorization of the
+    // constant-sized loop.
+    for (uint8_t j = 0; j < kMaxPaletteSize; ++j) {
+      if (BitMaskSet::MaskContainsValue(index_mask, j)) continue;
+      color_order[counter][index++] = j;
+    }
+  }
+}
+
+bool Tile::ReadPaletteTokens(const Block& block) {
+  const PaletteModeInfo& palette_mode_info =
+      block.bp->prediction_parameters->palette_mode_info;
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  for (int plane_type = kPlaneTypeY;
+       plane_type < (block.HasChroma() ? kNumPlaneTypes : kPlaneTypeUV);
+       ++plane_type) {
+    const int palette_size = palette_mode_info.size[plane_type];
+    if (palette_size == 0) continue;
+    int block_height = block.height;
+    int block_width = block.width;
+    int screen_height = std::min(
+        block_height, MultiplyBy4(frame_header_.rows4x4 - block.row4x4));
+    int screen_width = std::min(
+        block_width, MultiplyBy4(frame_header_.columns4x4 - block.column4x4));
+    if (plane_type == kPlaneTypeUV) {
+      block_height >>= sequence_header_.color_config.subsampling_y;
+      block_width >>= sequence_header_.color_config.subsampling_x;
+      screen_height >>= sequence_header_.color_config.subsampling_y;
+      screen_width >>= sequence_header_.color_config.subsampling_x;
+      if (block_height < 4) {
+        block_height += 2;
+        screen_height += 2;
+      }
+      if (block_width < 4) {
+        block_width += 2;
+        screen_width += 2;
+      }
+    }
+    if (!prediction_parameters.color_index_map[plane_type].Reset(
+            block_height, block_width, /*zero_initialize=*/false)) {
+      return false;
+    }
+    int first_value = 0;
+    reader_.DecodeUniform(palette_size, &first_value);
+    prediction_parameters.color_index_map[plane_type][0][0] = first_value;
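+    // The remaining indices are decoded along anti-diagonals (i is the
+    // diagonal, j walks its columns): each index depends only on its top,
+    // left and top-left neighbors, which lie on earlier diagonals.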
+    for (int i = 1; i < screen_height + screen_width - 1; ++i) {
+      const int start = std::min(i, screen_width - 1);
+      const int end = std::max(0, i - screen_height + 1);
+      uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize];
+      uint8_t color_context[kMaxPaletteSquare];
+      PopulatePaletteColorContexts(block, static_cast<PlaneType>(plane_type), i,
+                                   start, end, color_order, color_context);
+      for (int j = start, counter = 0; j >= end; --j, ++counter) {
+        uint16_t* const cdf =
+            symbol_decoder_context_
+                .palette_color_index_cdf[plane_type]
+                                        [palette_size - kMinPaletteSize]
+                                        [color_context[counter]];
+        const int color_order_index = reader_.ReadSymbol(cdf, palette_size);
+        prediction_parameters.color_index_map[plane_type][i - j][j] =
+            color_order[counter][color_order_index];
+      }
+    }
+    if (screen_width < block_width) {
+      for (int i = 0; i < screen_height; ++i) {
+        memset(
+            &prediction_parameters.color_index_map[plane_type][i][screen_width],
+            prediction_parameters
+                .color_index_map[plane_type][i][screen_width - 1],
+            block_width - screen_width);
+      }
+    }
+    for (int i = screen_height; i < block_height; ++i) {
+      memcpy(
+          prediction_parameters.color_index_map[plane_type][i],
+          prediction_parameters.color_index_map[plane_type][screen_height - 1],
+          block_width);
+    }
+  }
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/tile/bitstream/partition.cc b/src/tile/bitstream/partition.cc
new file mode 100644 (file)
index 0000000..f3dbbb0
--- /dev/null
@@ -0,0 +1,148 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+uint16_t PartitionCdfGatherHorizontalAlike(const uint16_t* const partition_cdf,
+                                           BlockSize block_size) {
+  // The spec computes the cdf value using the following formula (not writing
+  // partition_cdf[] and using short forms for partition names for clarity):
+  //   cdf = None - H + V - S + S - HTS + HTS - HBS + HBS - VLS;
+  //   if (block_size != 128x128) {
+  //     cdf += VRS - H4;
+  //   }
+  // After canceling out the repeated terms with opposite signs, we have:
+  //   cdf = None - H + V - VLS;
+  //   if (block_size != 128x128) {
+  //     cdf += VRS - H4;
+  //   }
+  uint16_t cdf = partition_cdf[kPartitionNone] -
+                 partition_cdf[kPartitionHorizontal] +
+                 partition_cdf[kPartitionVertical] -
+                 partition_cdf[kPartitionVerticalWithLeftSplit];
+  if (block_size != kBlock128x128) {
+    cdf += partition_cdf[kPartitionVerticalWithRightSplit] -
+           partition_cdf[kPartitionHorizontal4];
+  }
+  return cdf;
+}
+
+uint16_t PartitionCdfGatherVerticalAlike(const uint16_t* const partition_cdf,
+                                         BlockSize block_size) {
+  // The spec computes the cdf value using the following formula (not writing
+  // partition_cdf[] and using short forms for partition names for clarity):
+  //   cdf = H - V + V - S + HBS - VLS + VLS - VRS + S - HTS;
+  //   if (block_size != 128x128) {
+  //     cdf += H4 - V4;
+  //   }
+  // V4 is always zero. So, after canceling out the repeated terms with opposite
+  // signs, we have:
+  //   cdf = H + HBS - VRS - HTS;
+  //   if (block_size != 128x128) {
+  //     cdf += H4;
+  //   }
+  // VRS is zero for 128x128 blocks. So, further simplifying we have:
+  //   cdf = H + HBS - HTS;
+  //   if (block_size != 128x128) {
+  //     cdf += H4 - VRS;
+  //   }
+  uint16_t cdf = partition_cdf[kPartitionHorizontal] +
+                 partition_cdf[kPartitionHorizontalWithBottomSplit] -
+                 partition_cdf[kPartitionHorizontalWithTopSplit];
+  if (block_size != kBlock128x128) {
+    cdf += partition_cdf[kPartitionHorizontal4] -
+           partition_cdf[kPartitionVerticalWithRightSplit];
+  }
+  return cdf;
+}
+
+}  // namespace
+
+uint16_t* Tile::GetPartitionCdf(int row4x4, int column4x4,
+                                BlockSize block_size) {
+  const int block_size_log2 = k4x4WidthLog2[block_size];
+  int top = 0;
+  if (IsTopInside(row4x4)) {
+    top = static_cast<int>(
+        k4x4WidthLog2[block_parameters_holder_.Find(row4x4 - 1, column4x4)
+                          ->size] < block_size_log2);
+  }
+  int left = 0;
+  if (IsLeftInside(column4x4)) {
+    left = static_cast<int>(
+        k4x4HeightLog2[block_parameters_holder_.Find(row4x4, column4x4 - 1)
+                           ->size] < block_size_log2);
+  }
+  const int context = left * 2 + top;
+  return symbol_decoder_context_.partition_cdf[block_size_log2 - 1][context];
+}
+
+bool Tile::ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+                         bool has_rows, bool has_columns,
+                         Partition* const partition) {
+  if (IsBlockSmallerThan8x8(block_size)) {
+    *partition = kPartitionNone;
+    return true;
+  }
+  if (!has_rows && !has_columns) {
+    *partition = kPartitionSplit;
+    return true;
+  }
+  uint16_t* const partition_cdf =
+      GetPartitionCdf(row4x4, column4x4, block_size);
+  if (partition_cdf == nullptr) {
+    return false;
+  }
+  if (has_rows && has_columns) {
+    const int bsize_log2 = k4x4WidthLog2[block_size];
+    // The partition block size should be 8x8 or above.
+    assert(bsize_log2 > 0);
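+    // 8x8 blocks allow only kPartitionNone through kPartitionSplit; 128x128
+    // blocks do not allow the 4-way kPartitionHorizontal4/kPartitionVertical4
+    // partitions.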
+    if (bsize_log2 == 1) {
+      *partition = static_cast<Partition>(
+          reader_.ReadSymbol<kPartitionSplit + 1>(partition_cdf));
+    } else if (bsize_log2 == 5) {
+      *partition = static_cast<Partition>(
+          reader_.ReadSymbol<kPartitionVerticalWithRightSplit + 1>(
+              partition_cdf));
+    } else {
+      *partition = static_cast<Partition>(
+          reader_.ReadSymbol<kMaxPartitionTypes>(partition_cdf));
+    }
+  } else if (has_columns) {
+    const uint16_t cdf =
+        PartitionCdfGatherVerticalAlike(partition_cdf, block_size);
+    *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+                                                         : kPartitionHorizontal;
+  } else {
+    const uint16_t cdf =
+        PartitionCdfGatherHorizontalAlike(partition_cdf, block_size);
+    *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+                                                         : kPartitionVertical;
+  }
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/tile/bitstream/transform_size.cc b/src/tile/bitstream/transform_size.cc
new file mode 100644 (file)
index 0000000..7197400
--- /dev/null
@@ -0,0 +1,222 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/obu_parser.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr uint8_t kMaxVariableTransformTreeDepth = 2;
+// Max_Tx_Depth array from section 5.11.5 in the spec with the following
+// modification: If the element is not zero, it is subtracted by one. That is
+// the only way in which this array is being used.
+constexpr int kTxDepthCdfIndex[kMaxBlockSizes] = {
+    0, 0, 1, 0, 0, 1, 2, 1, 1, 1, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3};
+
+constexpr TransformSize kMaxTransformSizeRectangle[kMaxBlockSizes] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+    kTransformSize64x64, kTransformSize64x64, kTransformSize64x64,
+    kTransformSize64x64};
+
+TransformSize GetSquareTransformSize(uint8_t pixels) {
+  switch (pixels) {
+    case 128:
+    case 64:
+      return kTransformSize64x64;
+    case 32:
+      return kTransformSize32x32;
+    case 16:
+      return kTransformSize16x16;
+    case 8:
+      return kTransformSize8x8;
+    default:
+      return kTransformSize4x4;
+  }
+}
+
+}  // namespace
+
+int Tile::GetTopTransformWidth(const Block& block, int row4x4, int column4x4,
+                               bool ignore_skip) {
+  if (row4x4 == block.row4x4) {
+    if (!block.top_available[kPlaneY]) return 64;
+    const BlockParameters& bp_top =
+        *block_parameters_holder_.Find(row4x4 - 1, column4x4);
+    if ((ignore_skip || bp_top.skip) && bp_top.is_inter) {
+      return kBlockWidthPixels[bp_top.size];
+    }
+  }
+  return kTransformWidth[inter_transform_sizes_[row4x4 - 1][column4x4]];
+}
+
+int Tile::GetLeftTransformHeight(const Block& block, int row4x4, int column4x4,
+                                 bool ignore_skip) {
+  if (column4x4 == block.column4x4) {
+    if (!block.left_available[kPlaneY]) return 64;
+    const BlockParameters& bp_left =
+        *block_parameters_holder_.Find(row4x4, column4x4 - 1);
+    if ((ignore_skip || bp_left.skip) && bp_left.is_inter) {
+      return kBlockHeightPixels[bp_left.size];
+    }
+  }
+  return kTransformHeight[inter_transform_sizes_[row4x4][column4x4 - 1]];
+}
+
+TransformSize Tile::ReadFixedTransformSize(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (frame_header_.segmentation
+          .lossless[bp.prediction_parameters->segment_id]) {
+    return kTransformSize4x4;
+  }
+  const TransformSize max_rect_tx_size = kMaxTransformSizeRectangle[block.size];
+  const bool allow_select = !bp.skip || !bp.is_inter;
+  if (block.size == kBlock4x4 || !allow_select ||
+      frame_header_.tx_mode != kTxModeSelect) {
+    return max_rect_tx_size;
+  }
+  const int max_tx_width = kTransformWidth[max_rect_tx_size];
+  const int max_tx_height = kTransformHeight[max_rect_tx_size];
+  const int top_width =
+      block.top_available[kPlaneY]
+          ? GetTopTransformWidth(block, block.row4x4, block.column4x4, true)
+          : 0;
+  const int left_height =
+      block.left_available[kPlaneY]
+          ? GetLeftTransformHeight(block, block.row4x4, block.column4x4, true)
+          : 0;
+  const auto context = static_cast<int>(top_width >= max_tx_width) +
+                       static_cast<int>(left_height >= max_tx_height);
+  const int cdf_index = kTxDepthCdfIndex[block.size];
+  uint16_t* const cdf =
+      symbol_decoder_context_.tx_depth_cdf[cdf_index][context];
+  const int tx_depth = (cdf_index == 0)
+                           ? static_cast<int>(reader_.ReadSymbol(cdf))
+                           : reader_.ReadSymbol<3>(cdf);
+  assert(tx_depth < 3);
+  TransformSize tx_size = max_rect_tx_size;
+  if (tx_depth == 0) return tx_size;
+  tx_size = kSplitTransformSize[tx_size];
+  if (tx_depth == 1) return tx_size;
+  return kSplitTransformSize[tx_size];
+}
+
+void Tile::ReadVariableTransformTree(const Block& block, int row4x4,
+                                     int column4x4, TransformSize tx_size) {
+  const uint8_t pixels = std::max(block.width, block.height);
+  const TransformSize max_tx_size = GetSquareTransformSize(pixels);
+  const int context_delta = (kNumSquareTransformSizes - 1 -
+                             TransformSizeToSquareTransformIndex(max_tx_size)) *
+                            6;
+
+  // The branching factor is 4 and the maximum depth is 2, so the maximum
+  // stack size necessary is (4 - 1) + 4 = 7: up to 3 unexpanded siblings at
+  // depth 1 plus the 4 children of the depth 1 node being expanded.
+  Stack<TransformTreeNode, 7> stack;
+  stack.Push(TransformTreeNode(column4x4, row4x4, tx_size, 0));
+
+  do {
+    TransformTreeNode node = stack.Pop();
+    const int tx_width4x4 = kTransformWidth4x4[node.tx_size];
+    const int tx_height4x4 = kTransformHeight4x4[node.tx_size];
+    if (node.tx_size != kTransformSize4x4 &&
+        node.depth != kMaxVariableTransformTreeDepth) {
+      const auto top =
+          static_cast<int>(GetTopTransformWidth(block, node.y, node.x, false) <
+                           kTransformWidth[node.tx_size]);
+      const auto left = static_cast<int>(
+          GetLeftTransformHeight(block, node.y, node.x, false) <
+          kTransformHeight[node.tx_size]);
+      const int context =
+          static_cast<int>(max_tx_size > kTransformSize8x8 &&
+                           kTransformSizeSquareMax[node.tx_size] !=
+                               max_tx_size) *
+              3 +
+          context_delta + top + left;
+      // tx_split.
+      if (reader_.ReadSymbol(symbol_decoder_context_.tx_split_cdf[context])) {
+        const TransformSize sub_tx_size = kSplitTransformSize[node.tx_size];
+        const int step_width4x4 = kTransformWidth4x4[sub_tx_size];
+        const int step_height4x4 = kTransformHeight4x4[sub_tx_size];
+        // The loops have to run in reverse order because we use a stack for
+        // DFS: pushing in reverse makes the sub-blocks pop in raster order.
+        for (int i = tx_height4x4 - step_height4x4; i >= 0;
+             i -= step_height4x4) {
+          for (int j = tx_width4x4 - step_width4x4; j >= 0;
+               j -= step_width4x4) {
+            if (node.y + i >= frame_header_.rows4x4 ||
+                node.x + j >= frame_header_.columns4x4) {
+              continue;
+            }
+            stack.Push(TransformTreeNode(node.x + j, node.y + i, sub_tx_size,
+                                         node.depth + 1));
+          }
+        }
+        continue;
+      }
+    }
+    // tx_split is false.
+    for (int i = 0; i < tx_height4x4; ++i) {
+      static_assert(sizeof(TransformSize) == 1, "");
+      memset(&inter_transform_sizes_[node.y + i][node.x], node.tx_size,
+             tx_width4x4);
+    }
+  } while (!stack.Empty());
+}
+
+void Tile::DecodeTransformSize(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (frame_header_.tx_mode == kTxModeSelect && block.size > kBlock4x4 &&
+      bp.is_inter && !bp.skip &&
+      !frame_header_.segmentation
+           .lossless[bp.prediction_parameters->segment_id]) {
+    const TransformSize max_tx_size = kMaxTransformSizeRectangle[block.size];
+    const int tx_width4x4 = kTransformWidth4x4[max_tx_size];
+    const int tx_height4x4 = kTransformHeight4x4[max_tx_size];
+    for (int row = block.row4x4; row < block.row4x4 + block.height4x4;
+         row += tx_height4x4) {
+      for (int column = block.column4x4;
+           column < block.column4x4 + block.width4x4; column += tx_width4x4) {
+        ReadVariableTransformTree(block, row, column, max_tx_size);
+      }
+    }
+  } else {
+    const TransformSize transform_size = ReadFixedTransformSize(block);
+    for (int row = block.row4x4; row < block.row4x4 + block.height4x4; ++row) {
+      static_assert(sizeof(TransformSize) == 1, "");
+      memset(&inter_transform_sizes_[row][block.column4x4], transform_size,
+             block.width4x4);
+    }
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/tile/prediction.cc b/src/tile/prediction.cc
new file mode 100644 (file)
index 0000000..4348548
--- /dev/null
@@ -0,0 +1,1341 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+#include "src/warp_prediction.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/inter_intra_masks.inc"
+
+// Precision bits when scaling reference frames.
+constexpr int kReferenceScaleShift = 14;
+constexpr int kAngleStep = 3;
+constexpr int kPredictionModeToAngle[kIntraPredictionModesUV] = {
+    0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0};
+
+// The following modes need both the left_column and top_row for intra
+// prediction. For directional modes, the left/top requirement is inferred
+// from the prediction angle. For Dc modes, it is inferred from whether
+// left/top is available.
+constexpr BitMaskSet kNeedsLeftAndTop(kPredictionModeSmooth,
+                                      kPredictionModeSmoothHorizontal,
+                                      kPredictionModeSmoothVertical,
+                                      kPredictionModePaeth);
+
+int16_t GetDirectionalIntraPredictorDerivative(const int angle) {
+  assert(angle >= 3);
+  assert(angle <= 87);
+  return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1];
+}
+
+// Maps the block_size to an index as follows:
+//  kBlock8x8 => 0.
+//  kBlock8x16 => 1.
+//  kBlock8x32 => 2.
+//  kBlock16x8 => 3.
+//  kBlock16x16 => 4.
+//  kBlock16x32 => 5.
+//  kBlock32x8 => 6.
+//  kBlock32x16 => 7.
+//  kBlock32x32 => 8.
+int GetWedgeBlockSizeIndex(BlockSize block_size) {
+  assert(block_size >= kBlock8x8);
+  return block_size - kBlock8x8 - static_cast<int>(block_size >= kBlock16x8) -
+         static_cast<int>(block_size >= kBlock32x8);
+}
+
+// Maps a dimension of 4, 8, 16 and 32 to indices 0, 1, 2 and 3 respectively.
+int GetInterIntraMaskLookupIndex(int dimension) {
+  assert(dimension == 4 || dimension == 8 || dimension == 16 ||
+         dimension == 32);
+  return FloorLog2(dimension) - 2;
+}
+
+// 7.11.2.9.
+int GetIntraEdgeFilterStrength(int width, int height, int filter_type,
+                               int delta) {
+  const int sum = width + height;
+  delta = std::abs(delta);
+  if (filter_type == 0) {
+    if (sum <= 8) {
+      if (delta >= 56) return 1;
+    } else if (sum <= 16) {
+      if (delta >= 40) return 1;
+    } else if (sum <= 24) {
+      if (delta >= 32) return 3;
+      if (delta >= 16) return 2;
+      if (delta >= 8) return 1;
+    } else if (sum <= 32) {
+      if (delta >= 32) return 3;
+      if (delta >= 4) return 2;
+      return 1;
+    } else {
+      return 3;
+    }
+  } else {
+    if (sum <= 8) {
+      if (delta >= 64) return 2;
+      if (delta >= 40) return 1;
+    } else if (sum <= 16) {
+      if (delta >= 48) return 2;
+      if (delta >= 20) return 1;
+    } else if (sum <= 24) {
+      if (delta >= 4) return 3;
+    } else {
+      return 3;
+    }
+  }
+  return 0;
+}
+
+// 7.11.2.10.
+bool DoIntraEdgeUpsampling(int width, int height, int filter_type, int delta) {
+  const int sum = width + height;
+  delta = std::abs(delta);
+  // This function should not be called when the prediction angle is 90 or 180.
+  assert(delta != 0);
+  if (delta >= 40) return false;
+  return (filter_type == 1) ? sum <= 8 : sum <= 16;
+}
+
+constexpr uint8_t kQuantizedDistanceWeight[4][2] = {
+    {2, 3}, {2, 5}, {2, 7}, {1, kMaxFrameDistance}};
+
+constexpr uint8_t kQuantizedDistanceLookup[4][2] = {
+    {9, 7}, {11, 5}, {12, 4}, {13, 3}};
+
+void GetDistanceWeights(const int distance[2], int weight[2]) {
+  // Note: distance[0] and distance[1] correspond to the relative distances
+  // between the current frame and reference frames [1] and [0], respectively.
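+  // For example, using the tables above: distance = {2, 6} gives order = 1,
+  // the loop breaks at i = 2 (since 2 * 7 > 6 * 2), and weight = {4, 12}, so
+  // the prediction whose reference frame is closer receives the larger
+  // weight.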
+  const int order = static_cast<int>(distance[0] <= distance[1]);
+  if (distance[0] == 0 || distance[1] == 0) {
+    weight[0] = kQuantizedDistanceLookup[3][order];
+    weight[1] = kQuantizedDistanceLookup[3][1 - order];
+  } else {
+    int i;
+    for (i = 0; i < 3; ++i) {
+      const int weight_0 = kQuantizedDistanceWeight[i][order];
+      const int weight_1 = kQuantizedDistanceWeight[i][1 - order];
+      if (order == 0) {
+        if (distance[0] * weight_0 < distance[1] * weight_1) break;
+      } else {
+        if (distance[0] * weight_0 > distance[1] * weight_1) break;
+      }
+    }
+    weight[0] = kQuantizedDistanceLookup[i][order];
+    weight[1] = kQuantizedDistanceLookup[i][1 - order];
+  }
+}
+
+dsp::IntraPredictor GetIntraPredictor(PredictionMode mode, bool has_left,
+                                      bool has_top) {
+  if (mode == kPredictionModeDc) {
+    if (has_left && has_top) {
+      return dsp::kIntraPredictorDc;
+    }
+    if (has_left) {
+      return dsp::kIntraPredictorDcLeft;
+    }
+    if (has_top) {
+      return dsp::kIntraPredictorDcTop;
+    }
+    return dsp::kIntraPredictorDcFill;
+  }
+  switch (mode) {
+    case kPredictionModePaeth:
+      return dsp::kIntraPredictorPaeth;
+    case kPredictionModeSmooth:
+      return dsp::kIntraPredictorSmooth;
+    case kPredictionModeSmoothVertical:
+      return dsp::kIntraPredictorSmoothVertical;
+    case kPredictionModeSmoothHorizontal:
+      return dsp::kIntraPredictorSmoothHorizontal;
+    default:
+      return dsp::kNumIntraPredictors;
+  }
+}
+
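+// Returns a pointer to the pixel at column |x|, row |y| of |plane| in
+// |buffer|, treating the rows as uint16_t when |bitdepth| is greater than 8.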
+uint8_t* GetStartPoint(Array2DView<uint8_t>* const buffer, const int plane,
+                       const int x, const int y, const int bitdepth) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth > 8) {
+    Array2DView<uint16_t> buffer16(
+        buffer[plane].rows(), buffer[plane].columns() / sizeof(uint16_t),
+        reinterpret_cast<uint16_t*>(&buffer[plane][0][0]));
+    return reinterpret_cast<uint8_t*>(&buffer16[y][x]);
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+  static_cast<void>(bitdepth);
+  return &buffer[plane][y][x];
+}
+
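+// Converts a position in high-precision (kScaleSubPixelBits) sub-pixel units
+// into a whole-pixel position.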
+int GetPixelPositionFromHighScale(int start, int step, int offset) {
+  return (start + step * offset) >> kScaleSubPixelBits;
+}
+
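+// Non-wedge inter-intra blending always uses the mask blend function for no
+// subsampling with is_inter_intra set to true; every other case is selected
+// by the combined chroma subsampling.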
+dsp::MaskBlendFunc GetMaskBlendFunc(const dsp::Dsp& dsp, bool is_inter_intra,
+                                    bool is_wedge_inter_intra,
+                                    int subsampling_x, int subsampling_y) {
+  return (is_inter_intra && !is_wedge_inter_intra)
+             ? dsp.mask_blend[0][/*is_inter_intra=*/true]
+             : dsp.mask_blend[subsampling_x + subsampling_y][is_inter_intra];
+}
+
+}  // namespace
+
+template <typename Pixel>
+void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y,
+                           bool has_left, bool has_top, bool has_top_right,
+                           bool has_bottom_left, PredictionMode mode,
+                           TransformSize tx_size) {
+  const int width = kTransformWidth[tx_size];
+  const int height = kTransformHeight[tx_size];
+  const int x_shift = subsampling_x_[plane];
+  const int y_shift = subsampling_y_[plane];
+  const int max_x = (MultiplyBy4(frame_header_.columns4x4) >> x_shift) - 1;
+  const int max_y = (MultiplyBy4(frame_header_.rows4x4) >> y_shift) - 1;
+  // For performance reasons, do not initialize the following two buffers.
+  alignas(kMaxAlignment) Pixel top_row_data[160];
+  alignas(kMaxAlignment) Pixel left_column_data[160];
+#if LIBGAV1_MSAN
+  if (IsDirectionalMode(mode)) {
+    memset(top_row_data, 0, sizeof(top_row_data));
+    memset(left_column_data, 0, sizeof(left_column_data));
+  }
+#endif
+  // Some predictors use |top_row_data| and |left_column_data| with a negative
+  // offset to access pixels to the top-left of the current block, so reserve
+  // some space before the arrays so that those pixels can be populated
+  // without moving the rest of the array.
+  Pixel* const top_row = top_row_data + 16;
+  Pixel* const left_column = left_column_data + 16;
+  const int bitdepth = sequence_header_.color_config.bitdepth;
+  const int top_and_left_size = width + height;
+  const bool is_directional_mode = IsDirectionalMode(mode);
+  const PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  const bool use_filter_intra =
+      (plane == kPlaneY && prediction_parameters.use_filter_intra);
+  const int prediction_angle =
+      is_directional_mode
+          ? kPredictionModeToAngle[mode] +
+                prediction_parameters.angle_delta[GetPlaneType(plane)] *
+                    kAngleStep
+          : 0;
+  // Directional prediction requires buffers larger than the width or height.
+  const int top_size = is_directional_mode ? top_and_left_size : width;
+  const int left_size = is_directional_mode ? top_and_left_size : height;
+  const int top_right_size =
+      is_directional_mode ? (has_top_right ? 2 : 1) * width : width;
+  const int bottom_left_size =
+      is_directional_mode ? (has_bottom_left ? 2 : 1) * height : height;
+
+  Array2DView<Pixel> buffer(buffer_[plane].rows(),
+                            buffer_[plane].columns() / sizeof(Pixel),
+                            reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+  const bool needs_top = use_filter_intra || kNeedsLeftAndTop.Contains(mode) ||
+                         (is_directional_mode && prediction_angle < 180) ||
+                         (mode == kPredictionModeDc && has_top);
+  const bool needs_left = use_filter_intra || kNeedsLeftAndTop.Contains(mode) ||
+                          (is_directional_mode && prediction_angle > 90) ||
+                          (mode == kPredictionModeDc && has_left);
+
+  const Pixel* top_row_src = buffer[y - 1];
+
+  // Determine if we need to retrieve the top row from
+  // |intra_prediction_buffer_|.
+  if ((needs_top || needs_left) && use_intra_prediction_buffer_) {
+    // Superblock index of block.row4x4. block.row4x4 is always in luma
+    // dimension (no subsampling).
+    const int current_superblock_index =
+        block.row4x4 >> (sequence_header_.use_128x128_superblock ? 5 : 4);
+    // Superblock index of y - 1. y is in the plane dimension (chroma planes
+    // could be subsampled).
+    const int plane_shift = (sequence_header_.use_128x128_superblock ? 7 : 6) -
+                            subsampling_y_[plane];
+    const int top_row_superblock_index = (y - 1) >> plane_shift;
+    // If the superblock index of y - 1 is not that of the current superblock,
+    // then we will have to retrieve the top row from the
+    // |intra_prediction_buffer_|.
+    if (current_superblock_index != top_row_superblock_index) {
+      top_row_src = reinterpret_cast<const Pixel*>(
+          (*intra_prediction_buffer_)[plane].get());
+    }
+  }
+
+  if (needs_top) {
+    // Compute top_row.
+    if (has_top || has_left) {
+      const int left_index = has_left ? x - 1 : x;
+      top_row[-1] = has_top ? top_row_src[left_index] : buffer[y][left_index];
+    } else {
+      top_row[-1] = 1 << (bitdepth - 1);
+    }
+    if (!has_top && has_left) {
+      Memset(top_row, buffer[y][x - 1], top_size);
+    } else if (!has_top && !has_left) {
+      Memset(top_row, (1 << (bitdepth - 1)) - 1, top_size);
+    } else {
+      const int top_limit = std::min(max_x - x + 1, top_right_size);
+      memcpy(top_row, &top_row_src[x], top_limit * sizeof(Pixel));
+      // Even though it is safe to call Memset with a size of 0, accessing
+      // top_row_src[top_limit + x - 1] is not allowed when this condition is
+      // false.
+      if (top_size - top_limit > 0) {
+        Memset(top_row + top_limit, top_row_src[top_limit + x - 1],
+               top_size - top_limit);
+      }
+    }
+  }
+  if (needs_left) {
+    // Compute left_column.
+    if (has_top || has_left) {
+      const int left_index = has_left ? x - 1 : x;
+      left_column[-1] =
+          has_top ? top_row_src[left_index] : buffer[y][left_index];
+    } else {
+      left_column[-1] = 1 << (bitdepth - 1);
+    }
+    if (!has_left && has_top) {
+      Memset(left_column, top_row_src[x], left_size);
+    } else if (!has_left && !has_top) {
+      Memset(left_column, (1 << (bitdepth - 1)) + 1, left_size);
+    } else {
+      const int left_limit = std::min(max_y - y + 1, bottom_left_size);
+      for (int i = 0; i < left_limit; ++i) {
+        left_column[i] = buffer[y + i][x - 1];
+      }
+      // Even though it is safe to call Memset with a size of 0, accessing
+      // buffer[left_limit + y - 1][x - 1] is not allowed when this condition
+      // is false.
+      if (left_size - left_limit > 0) {
+        Memset(left_column + left_limit, buffer[left_limit + y - 1][x - 1],
+               left_size - left_limit);
+      }
+    }
+  }
+  Pixel* const dest = &buffer[y][x];
+  const ptrdiff_t dest_stride = buffer_[plane].columns();
+  if (use_filter_intra) {
+    dsp_.filter_intra_predictor(dest, dest_stride, top_row, left_column,
+                                prediction_parameters.filter_intra_mode, width,
+                                height);
+  } else if (is_directional_mode) {
+    DirectionalPrediction(block, plane, x, y, has_left, has_top, needs_left,
+                          needs_top, prediction_angle, width, height, max_x,
+                          max_y, tx_size, top_row, left_column);
+  } else {
+    const dsp::IntraPredictor predictor =
+        GetIntraPredictor(mode, has_left, has_top);
+    assert(predictor != dsp::kNumIntraPredictors);
+    dsp_.intra_predictors[tx_size][predictor](dest, dest_stride, top_row,
+                                              left_column);
+  }
+}
+
+template void Tile::IntraPrediction<uint8_t>(const Block& block, Plane plane,
+                                             int x, int y, bool has_left,
+                                             bool has_top, bool has_top_right,
+                                             bool has_bottom_left,
+                                             PredictionMode mode,
+                                             TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::IntraPrediction<uint16_t>(const Block& block, Plane plane,
+                                              int x, int y, bool has_left,
+                                              bool has_top, bool has_top_right,
+                                              bool has_bottom_left,
+                                              PredictionMode mode,
+                                              TransformSize tx_size);
+#endif
+
+int Tile::GetIntraEdgeFilterType(const Block& block, Plane plane) const {
+  bool top;
+  bool left;
+  if (plane == kPlaneY) {
+    top = block.top_available[kPlaneY] &&
+          kPredictionModeSmoothMask.Contains(block.bp_top->y_mode);
+    left = block.left_available[kPlaneY] &&
+           kPredictionModeSmoothMask.Contains(block.bp_left->y_mode);
+  } else {
+    top = block.top_available[plane] &&
+          block.bp->prediction_parameters->chroma_top_uses_smooth_prediction;
+    left = block.left_available[plane] &&
+           block.bp->prediction_parameters->chroma_left_uses_smooth_prediction;
+  }
+  return static_cast<int>(top || left);
+}
+
+template <typename Pixel>
+void Tile::DirectionalPrediction(const Block& block, Plane plane, int x, int y,
+                                 bool has_left, bool has_top, bool needs_left,
+                                 bool needs_top, int prediction_angle,
+                                 int width, int height, int max_x, int max_y,
+                                 TransformSize tx_size, Pixel* const top_row,
+                                 Pixel* const left_column) {
+  Array2DView<Pixel> buffer(buffer_[plane].rows(),
+                            buffer_[plane].columns() / sizeof(Pixel),
+                            reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+  Pixel* const dest = &buffer[y][x];
+  const ptrdiff_t stride = buffer_[plane].columns();
+  if (prediction_angle == 90) {
+    dsp_.intra_predictors[tx_size][dsp::kIntraPredictorVertical](
+        dest, stride, top_row, left_column);
+    return;
+  }
+  if (prediction_angle == 180) {
+    dsp_.intra_predictors[tx_size][dsp::kIntraPredictorHorizontal](
+        dest, stride, top_row, left_column);
+    return;
+  }
+
+  bool upsampled_top = false;
+  bool upsampled_left = false;
+  if (sequence_header_.enable_intra_edge_filter) {
+    const int filter_type = GetIntraEdgeFilterType(block, plane);
+    if (prediction_angle > 90 && prediction_angle < 180 &&
+        (width + height) >= 24) {
+      // 7.11.2.7.
+      left_column[-1] = top_row[-1] = RightShiftWithRounding(
+          left_column[0] * 5 + top_row[-1] * 6 + top_row[0] * 5, 4);
+    }
+    if (has_top && needs_top) {
+      const int strength = GetIntraEdgeFilterStrength(
+          width, height, filter_type, prediction_angle - 90);
+      if (strength > 0) {
+        const int num_pixels = std::min(width, max_x - x + 1) +
+                               ((prediction_angle < 90) ? height : 0) + 1;
+        dsp_.intra_edge_filter(top_row - 1, num_pixels, strength);
+      }
+    }
+    if (has_left && needs_left) {
+      const int strength = GetIntraEdgeFilterStrength(
+          width, height, filter_type, prediction_angle - 180);
+      if (strength > 0) {
+        const int num_pixels = std::min(height, max_y - y + 1) +
+                               ((prediction_angle > 180) ? width : 0) + 1;
+        dsp_.intra_edge_filter(left_column - 1, num_pixels, strength);
+      }
+    }
+    upsampled_top = DoIntraEdgeUpsampling(width, height, filter_type,
+                                          prediction_angle - 90);
+    if (upsampled_top && needs_top) {
+      const int num_pixels = width + ((prediction_angle < 90) ? height : 0);
+      dsp_.intra_edge_upsampler(top_row, num_pixels);
+    }
+    upsampled_left = DoIntraEdgeUpsampling(width, height, filter_type,
+                                           prediction_angle - 180);
+    if (upsampled_left && needs_left) {
+      const int num_pixels = height + ((prediction_angle > 180) ? width : 0);
+      dsp_.intra_edge_upsampler(left_column, num_pixels);
+    }
+  }
+
+  if (prediction_angle < 90) {
+    const int dx = GetDirectionalIntraPredictorDerivative(prediction_angle);
+    dsp_.directional_intra_predictor_zone1(dest, stride, top_row, width, height,
+                                           dx, upsampled_top);
+  } else if (prediction_angle < 180) {
+    const int dx =
+        GetDirectionalIntraPredictorDerivative(180 - prediction_angle);
+    const int dy =
+        GetDirectionalIntraPredictorDerivative(prediction_angle - 90);
+    dsp_.directional_intra_predictor_zone2(dest, stride, top_row, left_column,
+                                           width, height, dx, dy, upsampled_top,
+                                           upsampled_left);
+  } else {
+    assert(prediction_angle < 270);
+    const int dy =
+        GetDirectionalIntraPredictorDerivative(270 - prediction_angle);
+    dsp_.directional_intra_predictor_zone3(dest, stride, left_column, width,
+                                           height, dy, upsampled_left);
+  }
+}
+
+template <typename Pixel>
+void Tile::PalettePrediction(const Block& block, const Plane plane,
+                             const int start_x, const int start_y, const int x,
+                             const int y, const TransformSize tx_size) {
+  const int tx_width = kTransformWidth[tx_size];
+  const int tx_height = kTransformHeight[tx_size];
+  const uint16_t* const palette =
+      block.bp->prediction_parameters->palette_mode_info.color[plane];
+  const PlaneType plane_type = GetPlaneType(plane);
+  const int x4 = MultiplyBy4(x);
+  const int y4 = MultiplyBy4(y);
+  Array2DView<Pixel> buffer(buffer_[plane].rows(),
+                            buffer_[plane].columns() / sizeof(Pixel),
+                            reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+  for (int row = 0; row < tx_height; ++row) {
+    assert(block.bp->prediction_parameters
+               ->color_index_map[plane_type][y4 + row] != nullptr);
+    for (int column = 0; column < tx_width; ++column) {
+      buffer[start_y + row][start_x + column] =
+          palette[block.bp->prediction_parameters
+                      ->color_index_map[plane_type][y4 + row][x4 + column]];
+    }
+  }
+}
+
+template void Tile::PalettePrediction<uint8_t>(
+    const Block& block, const Plane plane, const int start_x, const int start_y,
+    const int x, const int y, const TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::PalettePrediction<uint16_t>(
+    const Block& block, const Plane plane, const int start_x, const int start_y,
+    const int x, const int y, const TransformSize tx_size);
+#endif
+
+template <typename Pixel>
+void Tile::ChromaFromLumaPrediction(const Block& block, const Plane plane,
+                                    const int start_x, const int start_y,
+                                    const TransformSize tx_size) {
+  const int subsampling_x = subsampling_x_[plane];
+  const int subsampling_y = subsampling_y_[plane];
+  const PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  Array2DView<Pixel> y_buffer(
+      buffer_[kPlaneY].rows(), buffer_[kPlaneY].columns() / sizeof(Pixel),
+      reinterpret_cast<Pixel*>(&buffer_[kPlaneY][0][0]));
+  if (!block.scratch_buffer->cfl_luma_buffer_valid) {
+    const int luma_x = start_x << subsampling_x;
+    const int luma_y = start_y << subsampling_y;
+    dsp_.cfl_subsamplers[tx_size][subsampling_x + subsampling_y](
+        block.scratch_buffer->cfl_luma_buffer,
+        prediction_parameters.max_luma_width - luma_x,
+        prediction_parameters.max_luma_height - luma_y,
+        reinterpret_cast<uint8_t*>(&y_buffer[luma_y][luma_x]),
+        buffer_[kPlaneY].columns());
+    block.scratch_buffer->cfl_luma_buffer_valid = true;
+  }
+  Array2DView<Pixel> buffer(buffer_[plane].rows(),
+                            buffer_[plane].columns() / sizeof(Pixel),
+                            reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+  dsp_.cfl_intra_predictors[tx_size](
+      reinterpret_cast<uint8_t*>(&buffer[start_y][start_x]),
+      buffer_[plane].columns(), block.scratch_buffer->cfl_luma_buffer,
+      (plane == kPlaneU) ? prediction_parameters.cfl_alpha_u
+                         : prediction_parameters.cfl_alpha_v);
+}
+
+template void Tile::ChromaFromLumaPrediction<uint8_t>(
+    const Block& block, const Plane plane, const int start_x, const int start_y,
+    const TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::ChromaFromLumaPrediction<uint16_t>(
+    const Block& block, const Plane plane, const int start_x, const int start_y,
+    const TransformSize tx_size);
+#endif
+
+void Tile::InterIntraPrediction(
+    uint16_t* const prediction_0, const uint8_t* const prediction_mask,
+    const ptrdiff_t prediction_mask_stride,
+    const PredictionParameters& prediction_parameters,
+    const int prediction_width, const int prediction_height,
+    const int subsampling_x, const int subsampling_y, uint8_t* const dest,
+    const ptrdiff_t dest_stride) {
+  assert(prediction_mask != nullptr);
+  assert(prediction_parameters.compound_prediction_type ==
+             kCompoundPredictionTypeIntra ||
+         prediction_parameters.compound_prediction_type ==
+             kCompoundPredictionTypeWedge);
+  // The first buffer of InterIntra is from inter prediction.
+  // The second buffer is from intra prediction.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (sequence_header_.color_config.bitdepth > 8) {
+    GetMaskBlendFunc(dsp_, /*is_inter_intra=*/true,
+                     prediction_parameters.is_wedge_inter_intra, subsampling_x,
+                     subsampling_y)(
+        prediction_0, reinterpret_cast<uint16_t*>(dest),
+        dest_stride / sizeof(uint16_t), prediction_mask, prediction_mask_stride,
+        prediction_width, prediction_height, dest, dest_stride);
+    return;
+  }
+#endif
+  const int function_index = prediction_parameters.is_wedge_inter_intra
+                                 ? subsampling_x + subsampling_y
+                                 : 0;
+  // |is_inter_intra| prediction values are stored in a Pixel buffer but it is
+  // currently declared as a uint16_t buffer.
+  // TODO(johannkoenig): convert the prediction buffer to a uint8_t buffer and
+  // remove the reinterpret_cast.
+  dsp_.inter_intra_mask_blend_8bpp[function_index](
+      reinterpret_cast<uint8_t*>(prediction_0), dest, dest_stride,
+      prediction_mask, prediction_mask_stride, prediction_width,
+      prediction_height);
+}
+
+void Tile::CompoundInterPrediction(
+    const Block& block, const uint8_t* const prediction_mask,
+    const ptrdiff_t prediction_mask_stride, const int prediction_width,
+    const int prediction_height, const int subsampling_x,
+    const int subsampling_y, const int candidate_row,
+    const int candidate_column, uint8_t* dest, const ptrdiff_t dest_stride) {
+  const PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+
+  void* prediction[2];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  const int bitdepth = sequence_header_.color_config.bitdepth;
+  if (bitdepth > 8) {
+    prediction[0] = block.scratch_buffer->prediction_buffer[0];
+    prediction[1] = block.scratch_buffer->prediction_buffer[1];
+  } else {
+#endif
+    prediction[0] = block.scratch_buffer->compound_prediction_buffer_8bpp[0];
+    prediction[1] = block.scratch_buffer->compound_prediction_buffer_8bpp[1];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  }
+#endif
+
+  switch (prediction_parameters.compound_prediction_type) {
+    case kCompoundPredictionTypeWedge:
+    case kCompoundPredictionTypeDiffWeighted:
+      GetMaskBlendFunc(dsp_, /*is_inter_intra=*/false,
+                       prediction_parameters.is_wedge_inter_intra,
+                       subsampling_x, subsampling_y)(
+          prediction[0], prediction[1],
+          /*prediction_stride=*/prediction_width, prediction_mask,
+          prediction_mask_stride, prediction_width, prediction_height, dest,
+          dest_stride);
+      break;
+    case kCompoundPredictionTypeDistance:
+      DistanceWeightedPrediction(prediction[0], prediction[1], prediction_width,
+                                 prediction_height, candidate_row,
+                                 candidate_column, dest, dest_stride);
+      break;
+    default:
+      assert(prediction_parameters.compound_prediction_type ==
+             kCompoundPredictionTypeAverage);
+      dsp_.average_blend(prediction[0], prediction[1], prediction_width,
+                         prediction_height, dest, dest_stride);
+      break;
+  }
+}
+
+GlobalMotion* Tile::GetWarpParams(
+    const Block& block, const Plane plane, const int prediction_width,
+    const int prediction_height,
+    const PredictionParameters& prediction_parameters,
+    const ReferenceFrameType reference_type, bool* const is_local_valid,
+    GlobalMotion* const global_motion_params,
+    GlobalMotion* const local_warp_params) const {
+  if (prediction_width < 8 || prediction_height < 8 ||
+      frame_header_.force_integer_mv == 1) {
+    return nullptr;
+  }
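+  // The local warp parameters are estimated once, from the Y plane, and the
+  // result is reused for the chroma planes via |is_local_valid|.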
+  if (plane == kPlaneY) {
+    *is_local_valid =
+        prediction_parameters.motion_mode == kMotionModeLocalWarp &&
+        WarpEstimation(
+            prediction_parameters.num_warp_samples, DivideBy4(prediction_width),
+            DivideBy4(prediction_height), block.row4x4, block.column4x4,
+            block.bp->mv.mv[0], prediction_parameters.warp_estimate_candidates,
+            local_warp_params) &&
+        SetupShear(local_warp_params);
+  }
+  if (prediction_parameters.motion_mode == kMotionModeLocalWarp &&
+      *is_local_valid) {
+    return local_warp_params;
+  }
+  if (!IsScaled(reference_type)) {
+    GlobalMotionTransformationType global_motion_type =
+        (reference_type != kReferenceFrameIntra)
+            ? global_motion_params->type
+            : kNumGlobalMotionTransformationTypes;
+    const bool is_global_valid =
+        IsGlobalMvBlock(*block.bp, global_motion_type) &&
+        SetupShear(global_motion_params);
+    // Valid global motion type implies reference type can't be intra.
+    assert(!is_global_valid || reference_type != kReferenceFrameIntra);
+    if (is_global_valid) return global_motion_params;
+  }
+  return nullptr;
+}
+
+bool Tile::InterPrediction(const Block& block, const Plane plane, const int x,
+                           const int y, const int prediction_width,
+                           const int prediction_height, int candidate_row,
+                           int candidate_column, bool* const is_local_valid,
+                           GlobalMotion* const local_warp_params) {
+  const int bitdepth = sequence_header_.color_config.bitdepth;
+  const BlockParameters& bp = *block.bp;
+  const BlockParameters& bp_reference =
+      *block_parameters_holder_.Find(candidate_row, candidate_column);
+  const bool is_compound =
+      bp_reference.reference_frame[1] > kReferenceFrameIntra;
+  assert(bp.is_inter);
+  const bool is_inter_intra = bp.reference_frame[1] == kReferenceFrameIntra;
+
+  const PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  uint8_t* const dest = GetStartPoint(buffer_, plane, x, y, bitdepth);
+  const ptrdiff_t dest_stride = buffer_[plane].columns();  // In bytes.
+  for (int index = 0; index < 1 + static_cast<int>(is_compound); ++index) {
+    const ReferenceFrameType reference_type =
+        bp_reference.reference_frame[index];
+    GlobalMotion global_motion_params =
+        frame_header_.global_motion[reference_type];
+    GlobalMotion* warp_params =
+        GetWarpParams(block, plane, prediction_width, prediction_height,
+                      prediction_parameters, reference_type, is_local_valid,
+                      &global_motion_params, local_warp_params);
+    if (warp_params != nullptr) {
+      if (!BlockWarpProcess(block, plane, index, x, y, prediction_width,
+                            prediction_height, warp_params, is_compound,
+                            is_inter_intra, dest, dest_stride)) {
+        return false;
+      }
+    } else {
+      const int reference_index =
+          prediction_parameters.use_intra_block_copy
+              ? -1
+              : frame_header_.reference_frame_index[reference_type -
+                                                    kReferenceFrameLast];
+      if (!BlockInterPrediction(
+              block, plane, reference_index, bp_reference.mv.mv[index], x, y,
+              prediction_width, prediction_height, candidate_row,
+              candidate_column, block.scratch_buffer->prediction_buffer[index],
+              is_compound, is_inter_intra, dest, dest_stride)) {
+        return false;
+      }
+    }
+  }
+
+  const int subsampling_x = subsampling_x_[plane];
+  const int subsampling_y = subsampling_y_[plane];
+  ptrdiff_t prediction_mask_stride = 0;
+  const uint8_t* prediction_mask = nullptr;
+  if (prediction_parameters.compound_prediction_type ==
+      kCompoundPredictionTypeWedge) {
+    const Array2D<uint8_t>& wedge_mask =
+        wedge_masks_[GetWedgeBlockSizeIndex(block.size)]
+                    [prediction_parameters.wedge_sign]
+                    [prediction_parameters.wedge_index];
+    prediction_mask = wedge_mask[0];
+    prediction_mask_stride = wedge_mask.columns();
+  } else if (prediction_parameters.compound_prediction_type ==
+             kCompoundPredictionTypeIntra) {
+    // 7.11.3.13. The inter-intra masks are precomputed and stored as a set of
+    // lookup tables.
+    assert(prediction_parameters.inter_intra_mode < kNumInterIntraModes);
+    prediction_mask =
+        kInterIntraMasks[prediction_parameters.inter_intra_mode]
+                        [GetInterIntraMaskLookupIndex(prediction_width)]
+                        [GetInterIntraMaskLookupIndex(prediction_height)];
+    prediction_mask_stride = prediction_width;
+  } else if (prediction_parameters.compound_prediction_type ==
+             kCompoundPredictionTypeDiffWeighted) {
+    if (plane == kPlaneY) {
+      assert(prediction_width >= 8);
+      assert(prediction_height >= 8);
+      dsp_.weight_mask[FloorLog2(prediction_width) - 3]
+                      [FloorLog2(prediction_height) - 3]
+                      [static_cast<int>(prediction_parameters.mask_is_inverse)](
+                          block.scratch_buffer->prediction_buffer[0],
+                          block.scratch_buffer->prediction_buffer[1],
+                          block.scratch_buffer->weight_mask, block.width);
+    }
+    prediction_mask = block.scratch_buffer->weight_mask;
+    prediction_mask_stride = block.width;
+  }
+
+  if (is_compound) {
+    CompoundInterPrediction(block, prediction_mask, prediction_mask_stride,
+                            prediction_width, prediction_height, subsampling_x,
+                            subsampling_y, candidate_row, candidate_column,
+                            dest, dest_stride);
+  } else if (prediction_parameters.motion_mode == kMotionModeObmc) {
+    // Obmc mode is allowed only for single reference (!is_compound).
+    return ObmcPrediction(block, plane, prediction_width, prediction_height);
+  } else if (is_inter_intra) {
+    // InterIntra and obmc must be mutually exclusive.
+    InterIntraPrediction(
+        block.scratch_buffer->prediction_buffer[0], prediction_mask,
+        prediction_mask_stride, prediction_parameters, prediction_width,
+        prediction_height, subsampling_x, subsampling_y, dest, dest_stride);
+  }
+  return true;
+}
+
+bool Tile::ObmcBlockPrediction(const Block& block, const MotionVector& mv,
+                               const Plane plane,
+                               const int reference_frame_index, const int width,
+                               const int height, const int x, const int y,
+                               const int candidate_row,
+                               const int candidate_column,
+                               const ObmcDirection blending_direction) {
+  const int bitdepth = sequence_header_.color_config.bitdepth;
+  // The Obmc prediction needs to be clipped before blending with the
+  // above/left prediction blocks.
+  // Obmc prediction is used only when is_compound is false, so it is safe to
+  // use prediction_buffer[1] as a temporary buffer for the Obmc prediction.
+  static_assert(sizeof(block.scratch_buffer->prediction_buffer[1]) >=
+                    64 * 64 * sizeof(uint16_t),
+                "");
+  auto* const obmc_buffer =
+      reinterpret_cast<uint8_t*>(block.scratch_buffer->prediction_buffer[1]);
+  const ptrdiff_t obmc_buffer_stride =
+      (bitdepth == 8) ? width : width * sizeof(uint16_t);
+  if (!BlockInterPrediction(block, plane, reference_frame_index, mv, x, y,
+                            width, height, candidate_row, candidate_column,
+                            nullptr, false, false, obmc_buffer,
+                            obmc_buffer_stride)) {
+    return false;
+  }
+
+  uint8_t* const prediction = GetStartPoint(buffer_, plane, x, y, bitdepth);
+  const ptrdiff_t prediction_stride = buffer_[plane].columns();
+  dsp_.obmc_blend[blending_direction](prediction, prediction_stride, width,
+                                      height, obmc_buffer, obmc_buffer_stride);
+  return true;
+}
+
+bool Tile::ObmcPrediction(const Block& block, const Plane plane,
+                          const int width, const int height) {
+  const int subsampling_x = subsampling_x_[plane];
+  const int subsampling_y = subsampling_y_[plane];
+  if (block.top_available[kPlaneY] &&
+      !IsBlockSmallerThan8x8(block.residual_size[plane])) {
+    const int num_limit = std::min(uint8_t{4}, k4x4WidthLog2[block.size]);
+    const int column4x4_max =
+        std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
+    const int candidate_row = block.row4x4 - 1;
+    const int block_start_y = MultiplyBy4(block.row4x4) >> subsampling_y;
+    int column4x4 = block.column4x4;
+    const int prediction_height = std::min(height >> 1, 32 >> subsampling_y);
+    for (int i = 0, step; i < num_limit && column4x4 < column4x4_max;
+         column4x4 += step) {
+      const int candidate_column = column4x4 | 1;
+      const BlockParameters& bp_top =
+          *block_parameters_holder_.Find(candidate_row, candidate_column);
+      const int candidate_block_size = bp_top.size;
+      step = Clip3(kNum4x4BlocksWide[candidate_block_size], 2, 16);
+      if (bp_top.reference_frame[0] > kReferenceFrameIntra) {
+        i++;
+        const int candidate_reference_frame_index =
+            frame_header_.reference_frame_index[bp_top.reference_frame[0] -
+                                                kReferenceFrameLast];
+        const int prediction_width =
+            std::min(width, MultiplyBy4(step) >> subsampling_x);
+        if (!ObmcBlockPrediction(
+                block, bp_top.mv.mv[0], plane, candidate_reference_frame_index,
+                prediction_width, prediction_height,
+                MultiplyBy4(column4x4) >> subsampling_x, block_start_y,
+                candidate_row, candidate_column, kObmcDirectionVertical)) {
+          return false;
+        }
+      }
+    }
+  }
+
+  if (block.left_available[kPlaneY]) {
+    const int num_limit = std::min(uint8_t{4}, k4x4HeightLog2[block.size]);
+    const int row4x4_max =
+        std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
+    const int candidate_column = block.column4x4 - 1;
+    int row4x4 = block.row4x4;
+    const int block_start_x = MultiplyBy4(block.column4x4) >> subsampling_x;
+    const int prediction_width = std::min(width >> 1, 32 >> subsampling_x);
+    for (int i = 0, step; i < num_limit && row4x4 < row4x4_max;
+         row4x4 += step) {
+      const int candidate_row = row4x4 | 1;
+      const BlockParameters& bp_left =
+          *block_parameters_holder_.Find(candidate_row, candidate_column);
+      const int candidate_block_size = bp_left.size;
+      step = Clip3(kNum4x4BlocksHigh[candidate_block_size], 2, 16);
+      if (bp_left.reference_frame[0] > kReferenceFrameIntra) {
+        i++;
+        const int candidate_reference_frame_index =
+            frame_header_.reference_frame_index[bp_left.reference_frame[0] -
+                                                kReferenceFrameLast];
+        const int prediction_height =
+            std::min(height, MultiplyBy4(step) >> subsampling_y);
+        if (!ObmcBlockPrediction(
+                block, bp_left.mv.mv[0], plane, candidate_reference_frame_index,
+                prediction_width, prediction_height, block_start_x,
+                MultiplyBy4(row4x4) >> subsampling_y, candidate_row,
+                candidate_column, kObmcDirectionHorizontal)) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+void Tile::DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
+                                      const int width, const int height,
+                                      const int candidate_row,
+                                      const int candidate_column, uint8_t* dest,
+                                      ptrdiff_t dest_stride) {
+  int distance[2];
+  int weight[2];
+  for (int reference = 0; reference < 2; ++reference) {
+    const BlockParameters& bp =
+        *block_parameters_holder_.Find(candidate_row, candidate_column);
+    // Note: distance[0] and distance[1] correspond to the relative distances
+    // between the current frame and reference frames [1] and [0],
+    // respectively.
+    distance[1 - reference] = std::min(
+        std::abs(static_cast<int>(
+            current_frame_.reference_info()
+                ->relative_distance_from[bp.reference_frame[reference]])),
+        static_cast<int>(kMaxFrameDistance));
+  }
+  GetDistanceWeights(distance, weight);
+
+  dsp_.distance_weighted_blend(prediction_0, prediction_1, weight[0], weight[1],
+                               width, height, dest, dest_stride);
+}
+
+void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane,
+                             const int reference_frame_index, const int x,
+                             const int y, int* const start_x,
+                             int* const start_y, int* const step_x,
+                             int* const step_y) {
+  const int reference_upscaled_width =
+      (reference_frame_index == -1)
+          ? frame_header_.upscaled_width
+          : reference_frames_[reference_frame_index]->upscaled_width();
+  const int reference_height =
+      (reference_frame_index == -1)
+          ? frame_header_.height
+          : reference_frames_[reference_frame_index]->frame_height();
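+  // Bitstream conformance requires each reference frame to be at most 2x
+  // larger and at least 1/16th the size of the current frame in each
+  // dimension; the assert below relies on this guarantee.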
+  assert(2 * frame_header_.width >= reference_upscaled_width &&
+         2 * frame_header_.height >= reference_height &&
+         frame_header_.width <= 16 * reference_upscaled_width &&
+         frame_header_.height <= 16 * reference_height);
+  const bool is_scaled_x = reference_upscaled_width != frame_header_.width;
+  const bool is_scaled_y = reference_height != frame_header_.height;
+  const int half_sample = 1 << (kSubPixelBits - 1);
+  int orig_x = (x << kSubPixelBits) + ((2 * mv.mv[1]) >> subsampling_x_[plane]);
+  int orig_y = (y << kSubPixelBits) + ((2 * mv.mv[0]) >> subsampling_y_[plane]);
+  const int rounding_offset =
+      DivideBy2(1 << (kScaleSubPixelBits - kSubPixelBits));
+  if (is_scaled_x) {
+    const int scale_x = ((reference_upscaled_width << kReferenceScaleShift) +
+                         DivideBy2(frame_header_.width)) /
+                        frame_header_.width;
+    *step_x = RightShiftWithRoundingSigned(
+        scale_x, kReferenceScaleShift - kScaleSubPixelBits);
+    orig_x += half_sample;
+    // When the frame size is 4K or larger, orig_x can exceed 16 bits and
+    // scale_x can be up to 15 bits, so int64_t is used to hold base_x.
+    const int64_t base_x = static_cast<int64_t>(orig_x) * scale_x -
+                           (half_sample << kReferenceScaleShift);
+    *start_x =
+        RightShiftWithRoundingSigned(
+            base_x, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+        rounding_offset;
+  } else {
+    *step_x = 1 << kScaleSubPixelBits;
+    *start_x = LeftShift(orig_x, 6) + rounding_offset;
+  }
+  if (is_scaled_y) {
+    const int scale_y = ((reference_height << kReferenceScaleShift) +
+                         DivideBy2(frame_header_.height)) /
+                        frame_header_.height;
+    *step_y = RightShiftWithRoundingSigned(
+        scale_y, kReferenceScaleShift - kScaleSubPixelBits);
+    orig_y += half_sample;
+    const int64_t base_y = static_cast<int64_t>(orig_y) * scale_y -
+                           (half_sample << kReferenceScaleShift);
+    *start_y =
+        RightShiftWithRoundingSigned(
+            base_y, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+        rounding_offset;
+  } else {
+    *step_y = 1 << kScaleSubPixelBits;
+    *start_y = LeftShift(orig_y, 6) + rounding_offset;
+  }
+}
+
+// static.
+bool Tile::GetReferenceBlockPosition(
+    const int reference_frame_index, const bool is_scaled, const int width,
+    const int height, const int ref_start_x, const int ref_last_x,
+    const int ref_start_y, const int ref_last_y, const int start_x,
+    const int start_y, const int step_x, const int step_y,
+    const int left_border, const int right_border, const int top_border,
+    const int bottom_border, int* ref_block_start_x, int* ref_block_start_y,
+    int* ref_block_end_x, int* ref_block_end_y) {
+  *ref_block_start_x = GetPixelPositionFromHighScale(start_x, 0, 0);
+  *ref_block_start_y = GetPixelPositionFromHighScale(start_y, 0, 0);
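+  // With intra block copy (reference_frame_index == -1), the prediction area
+  // is constrained to the already decoded part of the current frame, so no
+  // border extension is needed.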
+  if (reference_frame_index == -1) {
+    return false;
+  }
+  *ref_block_start_x -= kConvolveBorderLeftTop;
+  *ref_block_start_y -= kConvolveBorderLeftTop;
+  *ref_block_end_x = GetPixelPositionFromHighScale(start_x, step_x, width - 1) +
+                     kConvolveBorderRight;
+  *ref_block_end_y =
+      GetPixelPositionFromHighScale(start_y, step_y, height - 1) +
+      kConvolveBorderBottom;
+  if (is_scaled) {
+    const int block_height =
+        (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+         kScaleSubPixelBits) +
+        kSubPixelTaps;
+    *ref_block_end_x += kConvolveScaleBorderRight - kConvolveBorderRight;
+    *ref_block_end_y = *ref_block_start_y + block_height - 1;
+  }
+  // Determines if we need to extend beyond the left/right/top/bottom border.
+  return *ref_block_start_x < (ref_start_x - left_border) ||
+         *ref_block_end_x > (ref_last_x + right_border) ||
+         *ref_block_start_y < (ref_start_y - top_border) ||
+         *ref_block_end_y > (ref_last_y + bottom_border);
+}
+
+// Builds a block as the input for convolve by copying the contents of the
+// reference frame (either a decoded reference frame or the current frame).
+// |block_extended_width| is the combined width of the block and its borders.
+template <typename Pixel>
+void Tile::BuildConvolveBlock(
+    const Plane plane, const int reference_frame_index, const bool is_scaled,
+    const int height, const int ref_start_x, const int ref_last_x,
+    const int ref_start_y, const int ref_last_y, const int step_y,
+    const int ref_block_start_x, const int ref_block_end_x,
+    const int ref_block_start_y, uint8_t* block_buffer,
+    ptrdiff_t convolve_buffer_stride, ptrdiff_t block_extended_width) {
+  const YuvBuffer* const reference_buffer =
+      (reference_frame_index == -1)
+          ? current_frame_.buffer()
+          : reference_frames_[reference_frame_index]->buffer();
+  Array2DView<const Pixel> reference_block(
+      reference_buffer->height(plane),
+      reference_buffer->stride(plane) / sizeof(Pixel),
+      reinterpret_cast<const Pixel*>(reference_buffer->data(plane)));
+  auto* const block_head = reinterpret_cast<Pixel*>(block_buffer);
+  convolve_buffer_stride /= sizeof(Pixel);
+  int block_height = height + kConvolveBorderLeftTop + kConvolveBorderBottom;
+  if (is_scaled) {
+    block_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+                    kScaleSubPixelBits) +
+                   kSubPixelTaps;
+  }
+  const int copy_start_x = Clip3(ref_block_start_x, ref_start_x, ref_last_x);
+  const int copy_start_y = Clip3(ref_block_start_y, ref_start_y, ref_last_y);
+  const int copy_end_x = Clip3(ref_block_end_x, copy_start_x, ref_last_x);
+  const int block_width = copy_end_x - copy_start_x + 1;
+  const bool extend_left = ref_block_start_x < ref_start_x;
+  const bool extend_right = ref_block_end_x > ref_last_x;
+  const bool out_of_left = copy_start_x > ref_block_end_x;
+  const bool out_of_right = copy_end_x < ref_block_start_x;
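+  // out_of_left/out_of_right indicate that the referenced area lies entirely
+  // outside the visible part of the reference frame horizontally; in that
+  // case each row is filled with the nearest edge pixel.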
+  if (out_of_left || out_of_right) {
+    const int ref_x = out_of_left ? copy_start_x : copy_end_x;
+    Pixel* buf_ptr = block_head;
+    for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) {
+      Memset(buf_ptr, reference_block[ref_y][ref_x], block_extended_width);
+      if (ref_block_start_y + y >= ref_start_y &&
+          ref_block_start_y + y < ref_last_y) {
+        ++ref_y;
+      }
+      buf_ptr += convolve_buffer_stride;
+    }
+  } else {
+    Pixel* buf_ptr = block_head;
+    const int left_width = copy_start_x - ref_block_start_x;
+    for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) {
+      if (extend_left) {
+        Memset(buf_ptr, reference_block[ref_y][copy_start_x], left_width);
+      }
+      memcpy(buf_ptr + left_width, &reference_block[ref_y][copy_start_x],
+             block_width * sizeof(Pixel));
+      if (extend_right) {
+        Memset(buf_ptr + left_width + block_width,
+               reference_block[ref_y][copy_end_x],
+               block_extended_width - left_width - block_width);
+      }
+      if (ref_block_start_y + y >= ref_start_y &&
+          ref_block_start_y + y < ref_last_y) {
+        ++ref_y;
+      }
+      buf_ptr += convolve_buffer_stride;
+    }
+  }
+}
+
+bool Tile::BlockInterPrediction(
+    const Block& block, const Plane plane, const int reference_frame_index,
+    const MotionVector& mv, const int x, const int y, const int width,
+    const int height, const int candidate_row, const int candidate_column,
+    uint16_t* const prediction, const bool is_compound,
+    const bool is_inter_intra, uint8_t* const dest,
+    const ptrdiff_t dest_stride) {
+  const BlockParameters& bp =
+      *block_parameters_holder_.Find(candidate_row, candidate_column);
+  int start_x;
+  int start_y;
+  int step_x;
+  int step_y;
+  ScaleMotionVector(mv, plane, reference_frame_index, x, y, &start_x, &start_y,
+                    &step_x, &step_y);
+  const int horizontal_filter_index = bp.interpolation_filter[1];
+  const int vertical_filter_index = bp.interpolation_filter[0];
+  const int subsampling_x = subsampling_x_[plane];
+  const int subsampling_y = subsampling_y_[plane];
+  // A reference_frame_index of -1 indicates that the current frame is used
+  // as the reference.
+  const YuvBuffer* const reference_buffer =
+      (reference_frame_index == -1)
+          ? current_frame_.buffer()
+          : reference_frames_[reference_frame_index]->buffer();
+  const int reference_upscaled_width =
+      (reference_frame_index == -1)
+          ? MultiplyBy4(frame_header_.columns4x4)
+          : reference_frames_[reference_frame_index]->upscaled_width();
+  const int reference_height =
+      (reference_frame_index == -1)
+          ? MultiplyBy4(frame_header_.rows4x4)
+          : reference_frames_[reference_frame_index]->frame_height();
+  const int ref_start_x = 0;
+  const int ref_last_x =
+      SubsampledValue(reference_upscaled_width, subsampling_x) - 1;
+  const int ref_start_y = 0;
+  const int ref_last_y = SubsampledValue(reference_height, subsampling_y) - 1;
+
+  const bool is_scaled = (reference_frame_index != -1) &&
+                         (frame_header_.width != reference_upscaled_width ||
+                          frame_header_.height != reference_height);
+  const int bitdepth = sequence_header_.color_config.bitdepth;
+  const int pixel_size = (bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+  int ref_block_start_x;
+  int ref_block_start_y;
+  int ref_block_end_x;
+  int ref_block_end_y;
+  const bool extend_block = GetReferenceBlockPosition(
+      reference_frame_index, is_scaled, width, height, ref_start_x, ref_last_x,
+      ref_start_y, ref_last_y, start_x, start_y, step_x, step_y,
+      reference_buffer->left_border(plane),
+      reference_buffer->right_border(plane),
+      reference_buffer->top_border(plane),
+      reference_buffer->bottom_border(plane), &ref_block_start_x,
+      &ref_block_start_y, &ref_block_end_x, &ref_block_end_y);
+
+  // In frame parallel mode, ensure that the reference block has been decoded
+  // and available for referencing.
+  if (reference_frame_index != -1 && frame_parallel_) {
+    // For subsampled U and V planes, ref_block_end_y must be scaled back to
+    // luma coordinates (multiplied by 2) since progress is tracked only for
+    // the Y plane.
+    const int reference_y_max = LeftShift(
+        std::min(ref_block_end_y + kSubPixelTaps, ref_last_y), subsampling_y);
+    if (reference_frame_progress_cache_[reference_frame_index] <
+            reference_y_max &&
+        !reference_frames_[reference_frame_index]->WaitUntil(
+            reference_y_max,
+            &reference_frame_progress_cache_[reference_frame_index])) {
+      return false;
+    }
+  }
+
+  const uint8_t* block_start = nullptr;
+  ptrdiff_t convolve_buffer_stride;
+  if (!extend_block) {
+    const YuvBuffer* const reference_buffer =
+        (reference_frame_index == -1)
+            ? current_frame_.buffer()
+            : reference_frames_[reference_frame_index]->buffer();
+    convolve_buffer_stride = reference_buffer->stride(plane);
+    if (reference_frame_index == -1 || is_scaled) {
+      block_start = reference_buffer->data(plane) +
+                    ref_block_start_y * reference_buffer->stride(plane) +
+                    ref_block_start_x * pixel_size;
+    } else {
+      block_start = reference_buffer->data(plane) +
+                    (ref_block_start_y + kConvolveBorderLeftTop) *
+                        reference_buffer->stride(plane) +
+                    (ref_block_start_x + kConvolveBorderLeftTop) * pixel_size;
+    }
+  } else {
+    const int border_right =
+        is_scaled ? kConvolveScaleBorderRight : kConvolveBorderRight;
+    // Because of scaling, the block width can be at most twice the current
+    // block's width.
+    auto block_extended_width = Align<ptrdiff_t>(
+        (2 * width + kConvolveBorderLeftTop + border_right) * pixel_size,
+        kMaxAlignment);
+    convolve_buffer_stride = block.scratch_buffer->convolve_block_buffer_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) {
+      BuildConvolveBlock<uint16_t>(
+          plane, reference_frame_index, is_scaled, height, ref_start_x,
+          ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+          ref_block_end_x, ref_block_start_y,
+          block.scratch_buffer->convolve_block_buffer.get(),
+          convolve_buffer_stride, block_extended_width);
+    } else {
+#endif
+      BuildConvolveBlock<uint8_t>(
+          plane, reference_frame_index, is_scaled, height, ref_start_x,
+          ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+          ref_block_end_x, ref_block_start_y,
+          block.scratch_buffer->convolve_block_buffer.get(),
+          convolve_buffer_stride, block_extended_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    }
+#endif
+    block_start = block.scratch_buffer->convolve_block_buffer.get() +
+                  (is_scaled ? 0
+                             : kConvolveBorderLeftTop * convolve_buffer_stride +
+                                   kConvolveBorderLeftTop * pixel_size);
+  }
+
+  void* const output =
+      (is_compound || is_inter_intra) ? prediction : static_cast<void*>(dest);
+  ptrdiff_t output_stride = (is_compound || is_inter_intra)
+                                ? /*prediction_stride=*/width
+                                : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  // |is_inter_intra| calculations are written to the |prediction| buffer.
+  // Unlike the |is_compound| calculations the output is Pixel and not uint16_t.
+  // convolve_func() expects |output_stride| to be in bytes and not Pixels.
+  // |prediction_stride| is in units of uint16_t. Adjust |output_stride| to
+  // account for this.
+  if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+    output_stride *= 2;
+  }
+#endif
+  assert(output != nullptr);
+  if (is_scaled) {
+    dsp::ConvolveScaleFunc convolve_func = dsp_.convolve_scale[is_compound];
+    assert(convolve_func != nullptr);
+
+    convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+                  vertical_filter_index, start_x, start_y, step_x, step_y,
+                  width, height, output, output_stride);
+  } else {
+    const int horizontal_filter_id = (start_x >> 6) & kSubPixelMask;
+    const int vertical_filter_id = (start_y >> 6) & kSubPixelMask;
+
+    dsp::ConvolveFunc convolve_func =
+        dsp_.convolve[reference_frame_index == -1][is_compound]
+                     [vertical_filter_id != 0][horizontal_filter_id != 0];
+    assert(convolve_func != nullptr);
+
+    convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+                  vertical_filter_index, horizontal_filter_id,
+                  vertical_filter_id, width, height, output, output_stride);
+  }
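+  // Hedged note on the dsp_.convolve lookup above: the four indices select
+  // the intra block copy variant (reference_frame_index == -1, i.e. the
+  // current frame is the reference), the compound variant, and whether a
+  // vertical and/or horizontal sub-pixel filter pass is needed. A filter id
+  // of 0 means the motion vector is integer-aligned in that direction.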
+  return true;
+}
+
+bool Tile::BlockWarpProcess(const Block& block, const Plane plane,
+                            const int index, const int block_start_x,
+                            const int block_start_y, const int width,
+                            const int height, GlobalMotion* const warp_params,
+                            const bool is_compound, const bool is_inter_intra,
+                            uint8_t* const dest, const ptrdiff_t dest_stride) {
+  assert(width >= 8 && height >= 8);
+  const BlockParameters& bp = *block.bp;
+  const int reference_frame_index =
+      frame_header_.reference_frame_index[bp.reference_frame[index] -
+                                          kReferenceFrameLast];
+  const uint8_t* const source =
+      reference_frames_[reference_frame_index]->buffer()->data(plane);
+  ptrdiff_t source_stride =
+      reference_frames_[reference_frame_index]->buffer()->stride(plane);
+  const int source_width =
+      reference_frames_[reference_frame_index]->buffer()->width(plane);
+  const int source_height =
+      reference_frames_[reference_frame_index]->buffer()->height(plane);
+  uint16_t* const prediction = block.scratch_buffer->prediction_buffer[index];
+
+  // In frame parallel mode, ensure that the reference block has been decoded
+  // and is available for referencing.
+  if (frame_parallel_) {
+    int reference_y_max = -1;
+    // Find out the maximum y-coordinate for warping.
+    for (int start_y = block_start_y; start_y < block_start_y + height;
+         start_y += 8) {
+      for (int start_x = block_start_x; start_x < block_start_x + width;
+           start_x += 8) {
+        const int src_x = (start_x + 4) << subsampling_x_[plane];
+        const int src_y = (start_y + 4) << subsampling_y_[plane];
+        const int64_t dst_y =
+            src_x * warp_params->params[4] +
+            static_cast<int64_t>(src_y) * warp_params->params[5] +
+            warp_params->params[1];
+        const int64_t y4 = dst_y >> subsampling_y_[plane];
+        const int iy4 = static_cast<int>(y4 >> kWarpedModelPrecisionBits);
+        reference_y_max = std::max(iy4 + 8, reference_y_max);
+      }
+    }
+    // For U and V planes with subsampling, we need to multiply reference_y_max
+    // by 2 since we only track the progress of the Y planes.
+    reference_y_max = LeftShift(reference_y_max, subsampling_y_[plane]);
+    if (reference_frame_progress_cache_[reference_frame_index] <
+            reference_y_max &&
+        !reference_frames_[reference_frame_index]->WaitUntil(
+            reference_y_max,
+            &reference_frame_progress_cache_[reference_frame_index])) {
+      return false;
+    }
+  }
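+  // Hedged reading of the loop above: each 8x8 warp patch's center
+  // (start_x + 4, start_y + 4) is mapped through warp_params->params[4],
+  // params[5] and params[1] to the source row; iy4 is the integer part of
+  // that row and iy4 + 8 conservatively bounds the source rows the warp
+  // filter may read for the patch.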
+  if (is_compound) {
+    dsp_.warp_compound(source, source_stride, source_width, source_height,
+                       warp_params->params, subsampling_x_[plane],
+                       subsampling_y_[plane], block_start_x, block_start_y,
+                       width, height, warp_params->alpha, warp_params->beta,
+                       warp_params->gamma, warp_params->delta, prediction,
+                       /*prediction_stride=*/width);
+  } else {
+    void* const output = is_inter_intra ? static_cast<void*>(prediction) : dest;
+    ptrdiff_t output_stride =
+        is_inter_intra ? /*prediction_stride=*/width : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    // |is_inter_intra| calculations are written to the |prediction| buffer.
+    // Unlike the |is_compound| calculations the output is Pixel and not
+    // uint16_t. warp_clip() expects |output_stride| to be in bytes and not
+    // Pixels. |prediction_stride| is in units of uint16_t. Adjust
+    // |output_stride| to account for this.
+    if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+      output_stride *= 2;
+    }
+#endif
+    dsp_.warp(source, source_stride, source_width, source_height,
+              warp_params->params, subsampling_x_[plane], subsampling_y_[plane],
+              block_start_x, block_start_y, width, height, warp_params->alpha,
+              warp_params->beta, warp_params->gamma, warp_params->delta, output,
+              output_stride);
+  }
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/tile/tile.cc b/src/tile/tile.cc
new file mode 100644 (file)
index 0000000..10ebbf2
--- /dev/null
@@ -0,0 +1,2670 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/tile.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <type_traits>
+#include <utility>
+
+#include "src/frame_scratch_buffer.h"
+#include "src/motion_vector.h"
+#include "src/reconstruction.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/scan_tables.inc"
+
+// Range above kNumQuantizerBaseLevels for which the exponential Golomb coding
+// process is activated.
+constexpr int kQuantizerCoefficientBaseRange = 12;
+constexpr int kNumQuantizerBaseLevels = 2;
+constexpr int kCoeffBaseRangeMaxIterations =
+    kQuantizerCoefficientBaseRange / (kCoeffBaseRangeSymbolCount - 1);
+constexpr int kEntropyContextLeft = 0;
+constexpr int kEntropyContextTop = 1;
+
+constexpr uint8_t kAllZeroContextsByTopLeft[5][5] = {{1, 2, 2, 2, 3},
+                                                     {2, 4, 4, 4, 5},
+                                                     {2, 4, 4, 4, 5},
+                                                     {2, 4, 4, 4, 5},
+                                                     {3, 5, 5, 5, 6}};
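+// E.g. the corner entries of the table: clamped top == left == 0 yields
+// all-zero context 1, while top == left == 4 yields context 6.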
+
+// The space complexity of DFS is O(branching_factor * max_depth). For the
+// parameter tree, branching_factor = 4 (there could be up to 4 children for
+// every node) and max_depth (excluding the root) = 5 (to go from a 128x128
+// block all the way to a 4x4 block). The worst-case stack size is 16, by
+// counting the number of 'o' nodes in the diagram:
+//
+//   |                    128x128  The highest level (corresponding to the
+//   |                             root of the tree) has no node in the stack.
+//   |-----------------+
+//   |     |     |     |
+//   |     o     o     o  64x64
+//   |
+//   |-----------------+
+//   |     |     |     |
+//   |     o     o     o  32x32    Higher levels have three nodes in the stack,
+//   |                             because we pop one node off the stack before
+//   |-----------------+           pushing its four children onto the stack.
+//   |     |     |     |
+//   |     o     o     o  16x16
+//   |
+//   |-----------------+
+//   |     |     |     |
+//   |     o     o     o  8x8
+//   |
+//   |-----------------+
+//   |     |     |     |
+//   o     o     o     o  4x4      Only the lowest level has four nodes in the
+//                                 stack.
+constexpr int kDfsStackSize = 16;
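+// (Counting the diagram: 3 'o' nodes at each of the four intermediate levels
+// plus 4 'o' nodes at the 4x4 level gives 12 + 4 == 16.)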
+
+// Mask indicating whether the transform sets contain a particular transform
+// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
+    BitMaskSet(0x1),    BitMaskSet(0xE0F), BitMaskSet(0x20F),
+    BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};
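+// For example, the first mask, BitMaskSet(0x1), has only bit 0 set; assuming
+// kTransformTypeDctDct is enumerator 0, the corresponding set
+// (kTransformSetDctOnly) contains exactly the DCT_DCT transform type, while
+// BitMaskSet(0xFFFF) contains all 16 transform types.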
+
+constexpr PredictionMode
+    kFilterIntraModeToIntraPredictor[kNumFilterIntraPredictors] = {
+        kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
+        kPredictionModeD157, kPredictionModeDc};
+
+// Mask used to determine the index for mode_deltas lookup.
+constexpr BitMaskSet kPredictionModeDeltasMask(
+    kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv,
+    kPredictionModeNearestNearestMv, kPredictionModeNearNearMv,
+    kPredictionModeNearestNewMv, kPredictionModeNewNearestMv,
+    kPredictionModeNearNewMv, kPredictionModeNewNearMv,
+    kPredictionModeNewNewMv);
+
+// This is computed as:
+// min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4.
+constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = {
+    0, 1, 2, 1, 2, 3, 4, 2, 3, 4, 5, 5, 4, 5, 6, 6, 5, 6, 6};
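+// Worked examples of the formula above: kTransformSize4x4 gives
+// min(2, 5) + min(2, 5) - 4 == 0 (the first entry) and kTransformSize64x64
+// gives min(6, 5) + min(6, 5) - 4 == 6 (the last entry).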
+
+/* clang-format off */
+constexpr uint8_t kCoeffBaseContextOffset[kNumTransformSizes][5][5] = {
+    {{0, 1, 6, 6, 0}, {1, 6, 6, 21, 0}, {6, 6, 21, 21, 0}, {6, 21, 21, 21, 0},
+     {0, 0, 0, 0, 0}},
+    {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
+     {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
+    {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
+     {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
+    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}};
+/* clang-format on */
+
+// The table size is extended from 3 to 16 by repeating the last element so
+// that row or column indices never need to be clipped.
+constexpr uint8_t kCoeffBasePositionContextOffset[16] = {
+    26, 31, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36};
+
+constexpr PredictionMode kInterIntraToIntraMode[kNumInterIntraModes] = {
+    kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
+    kPredictionModeSmooth};
+
+// Number of horizontal luma samples before intra block copy can be used.
+constexpr int kIntraBlockCopyDelayPixels = 256;
+// Number of 64 by 64 blocks before intra block copy can be used.
+constexpr int kIntraBlockCopyDelay64x64Blocks = kIntraBlockCopyDelayPixels / 64;
+
+// Index [i][j] corresponds to the transform size of width 1 << (i + 2) and
+// height 1 << (j + 2).
+constexpr TransformSize k4x4SizeToTransformSize[5][5] = {
+    {kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+     kNumTransformSizes, kNumTransformSizes},
+    {kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+     kTransformSize8x32, kNumTransformSizes},
+    {kTransformSize16x4, kTransformSize16x8, kTransformSize16x16,
+     kTransformSize16x32, kTransformSize16x64},
+    {kNumTransformSizes, kTransformSize32x8, kTransformSize32x16,
+     kTransformSize32x32, kTransformSize32x64},
+    {kNumTransformSizes, kNumTransformSizes, kTransformSize64x16,
+     kTransformSize64x32, kTransformSize64x64}};
+
+// Defined in section 9.3 of the spec.
+constexpr TransformType kModeToTransformType[kIntraPredictionModesUV] = {
+    kTransformTypeDctDct,   kTransformTypeDctAdst,  kTransformTypeAdstDct,
+    kTransformTypeDctDct,   kTransformTypeAdstAdst, kTransformTypeDctAdst,
+    kTransformTypeAdstDct,  kTransformTypeAdstDct,  kTransformTypeDctAdst,
+    kTransformTypeAdstAdst, kTransformTypeDctAdst,  kTransformTypeAdstDct,
+    kTransformTypeAdstAdst, kTransformTypeDctDct};
+
+// Defined in section 5.11.47 of the spec. This array does not contain an entry
+// for kTransformSetDctOnly, so the first dimension needs to be
+// |kNumTransformSets| - 1.
+constexpr TransformType kInverseTransformTypeBySet[kNumTransformSets - 1][16] =
+    {{kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+      kTransformTypeIdentityDct, kTransformTypeDctIdentity,
+      kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+     {kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+      kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+     {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+      kTransformTypeDctIdentity, kTransformTypeIdentityAdst,
+      kTransformTypeAdstIdentity, kTransformTypeIdentityFlipadst,
+      kTransformTypeFlipadstIdentity, kTransformTypeDctDct,
+      kTransformTypeDctAdst, kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+      kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+      kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+      kTransformTypeAdstFlipadst},
+     {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+      kTransformTypeDctIdentity, kTransformTypeDctDct, kTransformTypeDctAdst,
+      kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+      kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+      kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+      kTransformTypeAdstFlipadst},
+     {kTransformTypeIdentityIdentity, kTransformTypeDctDct}};
+
+// Replaces all occurrences of 64x* and *x64 with 32x* and *x32 respectively.
+constexpr TransformSize kAdjustedTransformSize[kNumTransformSizes] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x32};
+
+// This is the same as the Max_Tx_Size_Rect array in the spec but with *x64
+// and 64x* transforms replaced with *x32 and 32x* respectively.
+constexpr TransformSize kUVTransformSize[kMaxBlockSizes] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
+    kTransformSize32x32};
+
+// The ith entry of this array is computed as:
+// DivideBy2(TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[i]) +
+//           TransformSizeToSquareTransformIndex(kTransformSizeSquareMax[i]) +
+//           1)
+constexpr uint8_t kTransformSizeContext[kNumTransformSizes] = {
+    0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, 4};
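+// E.g. for kTransformSize4x8 (the second entry) the square min/max sizes are
+// 4x4 and 8x8, whose square transform indices are 0 and 1, giving
+// DivideBy2(0 + 1 + 1) == 1.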
+
+constexpr int8_t kSgrProjDefaultMultiplier[2] = {-32, 31};
+
+constexpr int8_t kWienerDefaultFilter[kNumWienerCoefficients] = {3, -7, 15};
+
+// Maps compound prediction modes into single modes. For example,
+// kPredictionModeNearestNewMv maps to kPredictionModeNearestMv for index 0
+// and kPredictionModeNewMv for index 1. It is used to simplify the logic in
+// AssignMv (and avoid duplicate code). This is section 5.11.30 in the spec.
+constexpr PredictionMode
+    kCompoundToSinglePredictionMode[kNumCompoundInterPredictionModes][2] = {
+        {kPredictionModeNearestMv, kPredictionModeNearestMv},
+        {kPredictionModeNearMv, kPredictionModeNearMv},
+        {kPredictionModeNearestMv, kPredictionModeNewMv},
+        {kPredictionModeNewMv, kPredictionModeNearestMv},
+        {kPredictionModeNearMv, kPredictionModeNewMv},
+        {kPredictionModeNewMv, kPredictionModeNearMv},
+        {kPredictionModeGlobalMv, kPredictionModeGlobalMv},
+        {kPredictionModeNewMv, kPredictionModeNewMv},
+};
+PredictionMode GetSinglePredictionMode(int index, PredictionMode y_mode) {
+  if (y_mode < kPredictionModeNearestNearestMv) {
+    return y_mode;
+  }
+  const int lookup_index = y_mode - kPredictionModeNearestNearestMv;
+  assert(lookup_index >= 0);
+  return kCompoundToSinglePredictionMode[lookup_index][index];
+}
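+// Hedged illustration (the values follow from the table above):
+//   GetSinglePredictionMode(0, kPredictionModeNewNearestMv)
+//       == kPredictionModeNewMv
+//   GetSinglePredictionMode(1, kPredictionModeNewNearestMv)
+//       == kPredictionModeNearestMv
+// A single (non-compound) mode such as kPredictionModeNewMv is returned
+// unchanged.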
+
+// log2(dqDenom) in section 7.12.3 of the spec. We use the log2 value because
+// dqDenom is always a power of two and hence right shift can be used instead of
+// division.
+constexpr uint8_t kQuantizationShift[kNumTransformSizes] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 2};
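+// E.g. the last entry above is 2, so for a 64x64 transform dqDenom ==
+// 1 << 2 == 4 and the division in section 7.12.3 becomes a right shift by 2.
+// (Illustration, assuming the transform-size ordering implied by the other
+// tables in this file.)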
+
+// Returns the minimum of |length| and |max| - |start|. This is used to clamp
+// array indices when accessing arrays whose bound is equal to |max|.
+int GetNumElements(int length, int start, int max) {
+  return std::min(length, max - start);
+}
+
+template <typename T>
+void SetBlockValues(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+  // Specialize all column counts (the values in kTransformWidth4x4[]) for
+  // better performance.
+  switch (columns) {
+    case 1:
+      MemSetBlock<T>(rows, 1, value, dst, stride);
+      break;
+    case 2:
+      MemSetBlock<T>(rows, 2, value, dst, stride);
+      break;
+    case 4:
+      MemSetBlock<T>(rows, 4, value, dst, stride);
+      break;
+    case 8:
+      MemSetBlock<T>(rows, 8, value, dst, stride);
+      break;
+    default:
+      assert(columns == 16);
+      MemSetBlock<T>(rows, 16, value, dst, stride);
+      break;
+  }
+}
+
+void SetTransformType(const Tile::Block& block, int x4, int y4, int w4, int h4,
+                      TransformType tx_type,
+                      TransformType transform_types[32][32]) {
+  const int y_offset = y4 - block.row4x4;
+  const int x_offset = x4 - block.column4x4;
+  TransformType* const dst = &transform_types[y_offset][x_offset];
+  SetBlockValues<TransformType>(h4, w4, tx_type, dst, 32);
+}
+
+void StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,
+                         const MotionVector& mv_to_store, ptrdiff_t stride,
+                         int rows, int columns,
+                         ReferenceFrameType* reference_frame_row_start,
+                         MotionVector* mv) {
+  static_assert(sizeof(*reference_frame_row_start) == sizeof(int8_t), "");
+  do {
+    // Don't switch the following two memory setting functions.
+    // Some ARM CPUs are quite sensitive to the order.
+    memset(reference_frame_row_start, reference_frame_to_store, columns);
+    std::fill(mv, mv + columns, mv_to_store);
+    reference_frame_row_start += stride;
+    mv += stride;
+  } while (--rows != 0);
+}
+
+// The inverse transform process assumes that the quantized coefficients are
+// stored as a virtual 2D array of size |tx_width| x |tx_height|. If the
+// transform width is 64, this assumption is broken because the scan order
+// used for populating the coefficients of such transforms is the same as the
+// one used for the corresponding transform with width 32 (e.g. the scan order
+// used for 64x16 is the same as the one used for 32x16). So we must restore
+// the coefficients to their correct positions and clear the positions they
+// occupied.
+template <typename ResidualType>
+void MoveCoefficientsForTxWidth64(int clamped_tx_height, int tx_width,
+                                  ResidualType* residual) {
+  if (tx_width != 64) return;
+  const int rows = clamped_tx_height - 2;
+  auto* src = residual + 32 * rows;
+  residual += 64 * rows;
+  // Process 2 rows in each loop in reverse order to avoid overwrite.
+  int x = rows >> 1;
+  do {
+    // The 2 rows can be processed in order.
+    memcpy(residual, src, 32 * sizeof(src[0]));
+    memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+    memset(src + 32, 0, 32 * sizeof(src[0]));
+    src -= 64;
+    residual -= 128;
+  } while (--x);
+  // Process the second row. The first row is already correct.
+  memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+  memset(src + 32, 0, 32 * sizeof(src[0]));
+}
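+// Hedged layout sketch for the move above: with tx_width == 64 and
+// clamped_tx_height == 16 (hypothetical values), row r was populated at
+// offset 32 * r (the width-32 scan order) but belongs at offset 64 * r in
+// the virtual |tx_width| x |tx_height| array. Rows are moved starting from
+// the bottom so that no source row is overwritten before it is copied.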
+
+void GetClampParameters(const Tile::Block& block, int min[2], int max[2]) {
+  // 7.10.2.14 (part 1). (also contains implementations of 5.11.53
+  // and 5.11.54).
+  constexpr int kMvBorder4x4 = 4;
+  const int row_border = kMvBorder4x4 + block.height4x4;
+  const int column_border = kMvBorder4x4 + block.width4x4;
+  const int macroblocks_to_top_edge = -block.row4x4;
+  const int macroblocks_to_bottom_edge =
+      block.tile.frame_header().rows4x4 - block.height4x4 - block.row4x4;
+  const int macroblocks_to_left_edge = -block.column4x4;
+  const int macroblocks_to_right_edge =
+      block.tile.frame_header().columns4x4 - block.width4x4 - block.column4x4;
+  min[0] = MultiplyBy32(macroblocks_to_top_edge - row_border);
+  min[1] = MultiplyBy32(macroblocks_to_left_edge - column_border);
+  max[0] = MultiplyBy32(macroblocks_to_bottom_edge + row_border);
+  max[1] = MultiplyBy32(macroblocks_to_right_edge + column_border);
+}
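+// Hedged numeric example: for a block at row4x4 == 0 with height4x4 == 4,
+// macroblocks_to_top_edge == 0 and row_border == 4 + 4 == 8, so
+// min[0] == MultiplyBy32(0 - 8) == -256, the lowest value the row component
+// of a motion vector clamped with these bounds may take.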
+
+// Section 8.3.2 in the spec, under coeff_base_eob.
+int GetCoeffBaseContextEob(TransformSize tx_size, int index) {
+  if (index == 0) return 0;
+  const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+  const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+  const int tx_height = kTransformHeight[adjusted_tx_size];
+  if (index <= DivideBy8(tx_height << tx_width_log2)) return 1;
+  if (index <= DivideBy4(tx_height << tx_width_log2)) return 2;
+  return 3;
+}
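+// Worked example: for kTransformSize4x4 (adjusted size 4x4),
+// tx_height << tx_width_log2 == 16, so index 0 maps to context 0, indices
+// 1..2 to context 1, indices 3..4 to context 2 and larger indices to
+// context 3.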
+
+// Section 8.3.2 in the spec, under coeff_br. Optimized for end of block based
+// on the fact that {0, 1}, {1, 0}, {1, 1}, {0, 2} and {2, 0} will all be 0 in
+// the end of block case.
+int GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2, int pos,
+                                TransformClass tx_class) {
+  if (pos == 0) return 0;
+  const int tx_width = 1 << adjusted_tx_width_log2;
+  const int row = pos >> adjusted_tx_width_log2;
+  const int column = pos & (tx_width - 1);
+  // This return statement is equivalent to:
+  // return ((tx_class == kTransformClass2D && (row | column) < 2) ||
+  //         (tx_class == kTransformClassHorizontal && column == 0) ||
+  //         (tx_class == kTransformClassVertical && row == 0))
+  //            ? 7
+  //            : 14;
+  return 14 >> ((static_cast<int>(tx_class == kTransformClass2D) &
+                 static_cast<int>((row | column) < 2)) |
+                (tx_class & static_cast<int>(column == 0)) |
+                ((tx_class >> 1) & static_cast<int>(row == 0)));
+}
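+// Hedged example of the bit trick above (assuming kTransformClass2D == 0,
+// kTransformClassHorizontal == 1 and kTransformClassVertical == 2, as the
+// masking of tx_class and tx_class >> 1 implies): for the 2D class with
+// pos == 1, row == 0 and column == 1, so (row | column) < 2 holds and the
+// function returns 14 >> 1 == 7; positions deeper inside the block yield 14.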
+
+}  // namespace
+
+Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
+           const ObuSequenceHeader& sequence_header,
+           const ObuFrameHeader& frame_header,
+           RefCountedBuffer* const current_frame, const DecoderState& state,
+           FrameScratchBuffer* const frame_scratch_buffer,
+           const WedgeMaskArray& wedge_masks,
+           const QuantizerMatrix& quantizer_matrix,
+           SymbolDecoderContext* const saved_symbol_decoder_context,
+           const SegmentationMap* prev_segment_ids,
+           PostFilter* const post_filter, const dsp::Dsp* const dsp,
+           ThreadPool* const thread_pool,
+           BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+           bool use_intra_prediction_buffer)
+    : number_(tile_number),
+      row_(number_ / frame_header.tile_info.tile_columns),
+      column_(number_ % frame_header.tile_info.tile_columns),
+      data_(data),
+      size_(size),
+      read_deltas_(false),
+      subsampling_x_{0, sequence_header.color_config.subsampling_x,
+                     sequence_header.color_config.subsampling_x},
+      subsampling_y_{0, sequence_header.color_config.subsampling_y,
+                     sequence_header.color_config.subsampling_y},
+      current_quantizer_index_(frame_header.quantizer.base_index),
+      sequence_header_(sequence_header),
+      frame_header_(frame_header),
+      reference_frame_sign_bias_(state.reference_frame_sign_bias),
+      reference_frames_(state.reference_frame),
+      motion_field_(frame_scratch_buffer->motion_field),
+      reference_order_hint_(state.reference_order_hint),
+      wedge_masks_(wedge_masks),
+      quantizer_matrix_(quantizer_matrix),
+      reader_(data_, size_, frame_header_.enable_cdf_update),
+      symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
+      saved_symbol_decoder_context_(saved_symbol_decoder_context),
+      prev_segment_ids_(prev_segment_ids),
+      dsp_(*dsp),
+      post_filter_(*post_filter),
+      block_parameters_holder_(frame_scratch_buffer->block_parameters_holder),
+      quantizer_(sequence_header_.color_config.bitdepth,
+                 &frame_header_.quantizer),
+      residual_size_((sequence_header_.color_config.bitdepth == 8)
+                         ? sizeof(int16_t)
+                         : sizeof(int32_t)),
+      intra_block_copy_lag_(
+          frame_header_.allow_intrabc
+              ? (sequence_header_.use_128x128_superblock ? 3 : 5)
+              : 1),
+      current_frame_(*current_frame),
+      cdef_index_(frame_scratch_buffer->cdef_index),
+      cdef_skip_(frame_scratch_buffer->cdef_skip),
+      inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+      thread_pool_(thread_pool),
+      residual_buffer_pool_(frame_scratch_buffer->residual_buffer_pool.get()),
+      tile_scratch_buffer_pool_(
+          &frame_scratch_buffer->tile_scratch_buffer_pool),
+      pending_tiles_(pending_tiles),
+      frame_parallel_(frame_parallel),
+      use_intra_prediction_buffer_(use_intra_prediction_buffer),
+      intra_prediction_buffer_(
+          use_intra_prediction_buffer_
+              ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_]
+              : nullptr) {
+  row4x4_start_ = frame_header.tile_info.tile_row_start[row_];
+  row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1];
+  column4x4_start_ = frame_header.tile_info.tile_column_start[column_];
+  column4x4_end_ = frame_header.tile_info.tile_column_start[column_ + 1];
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  const int block_width4x4_log2 = k4x4HeightLog2[SuperBlockSize()];
+  superblock_rows_ =
+      (row4x4_end_ - row4x4_start_ + block_width4x4 - 1) >> block_width4x4_log2;
+  superblock_columns_ =
+      (column4x4_end_ - column4x4_start_ + block_width4x4 - 1) >>
+      block_width4x4_log2;
+  // If |split_parse_and_decode_| is true, we do the necessary setup for
+  // splitting the parsing and the decoding steps. This is done in the following
+  // two cases:
+  //  1) If there is multi-threading within a tile (this is done if
+  //     |thread_pool_| is not nullptr and if there are at least as many
+  //     superblock columns as |intra_block_copy_lag_|).
+  //  2) If |frame_parallel| is true.
+  split_parse_and_decode_ = (thread_pool_ != nullptr &&
+                             superblock_columns_ > intra_block_copy_lag_) ||
+                            frame_parallel;
+  if (frame_parallel_) {
+    reference_frame_progress_cache_.fill(INT_MIN);
+  }
+  memset(delta_lf_, 0, sizeof(delta_lf_));
+  delta_lf_all_zero_ = true;
+  const YuvBuffer& buffer = post_filter_.frame_buffer();
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+    // Verify that the borders are big enough for Reconstruct(). max_tx_length
+    // is the maximum value of tx_width and tx_height for the plane.
+    const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
+    // Reconstruct() may overwrite on the right. Since the right border of a
+    // row is followed in memory by the left border of the next row, the
+    // number of extra pixels to the right of a row is at least the sum of the
+    // left and right borders.
+    //
+    // Note: This assertion actually checks the sum of the left and right
+    // borders of post_filter_.GetUnfilteredBuffer(), which is a horizontally
+    // and vertically shifted version of |buffer|. Since the sum of the left and
+    // right borders is not changed by the shift, we can just check the sum of
+    // the left and right borders of |buffer|.
+    assert(buffer.left_border(plane) + buffer.right_border(plane) >=
+           max_tx_length - 1);
+    // Reconstruct() may overwrite on the bottom. We need an extra border row
+    // on the bottom because we need the left border of that row.
+    //
+    // Note: This assertion checks the bottom border of
+    // post_filter_.GetUnfilteredBuffer(). So we need to calculate the vertical
+    // shift that the PostFilter constructor applied to |buffer| and reduce the
+    // bottom border by that amount.
+#ifndef NDEBUG
+    const int vertical_shift = static_cast<int>(
+        (post_filter_.GetUnfilteredBuffer(plane) - buffer.data(plane)) /
+        buffer.stride(plane));
+    const int bottom_border = buffer.bottom_border(plane) - vertical_shift;
+    assert(bottom_border >= max_tx_length);
+#endif
+    // In AV1, a transform block of height H starts at a y coordinate that is
+    // a multiple of H. If a transform block at the bottom of the frame has
+    // height H, then Reconstruct() will write up to the row with index
+    // Align(buffer.height(plane), H) - 1. Therefore the maximum number of
+    // rows Reconstruct() may write to is
+    // Align(buffer.height(plane), max_tx_length).
+    buffer_[plane].Reset(Align(buffer.height(plane), max_tx_length),
+                         buffer.stride(plane),
+                         post_filter_.GetUnfilteredBuffer(plane));
+  }
+}
+
+bool Tile::Init() {
+  assert(coefficient_levels_.size() == dc_categories_.size());
+  for (size_t i = 0; i < coefficient_levels_.size(); ++i) {
+    const int contexts_per_plane = (i == kEntropyContextLeft)
+                                       ? frame_header_.rows4x4
+                                       : frame_header_.columns4x4;
+    if (!coefficient_levels_[i].Reset(PlaneCount(), contexts_per_plane)) {
+      LIBGAV1_DLOG(ERROR, "coefficient_levels_[%zu].Reset() failed.", i);
+      return false;
+    }
+    if (!dc_categories_[i].Reset(PlaneCount(), contexts_per_plane)) {
+      LIBGAV1_DLOG(ERROR, "dc_categories_[%zu].Reset() failed.", i);
+      return false;
+    }
+  }
+  if (split_parse_and_decode_) {
+    assert(residual_buffer_pool_ != nullptr);
+    if (!residual_buffer_threaded_.Reset(superblock_rows_, superblock_columns_,
+                                         /*zero_initialize=*/false)) {
+      LIBGAV1_DLOG(ERROR, "residual_buffer_threaded_.Reset() failed.");
+      return false;
+    }
+  } else {
+    // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary
+    // checks when parsing quantized coefficients.
+    residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(
+        32, (4096 + 32 * kResidualPaddingVertical) * residual_size_);
+    if (residual_buffer_ == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Allocation of residual_buffer_ failed.");
+      return false;
+    }
+    prediction_parameters_.reset(new (std::nothrow) PredictionParameters());
+    if (prediction_parameters_ == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Allocation of prediction_parameters_ failed.");
+      return false;
+    }
+  }
+  if (frame_header_.use_ref_frame_mvs) {
+    assert(sequence_header_.enable_order_hint);
+    SetupMotionField(frame_header_, current_frame_, reference_frames_,
+                     row4x4_start_, row4x4_end_, column4x4_start_,
+                     column4x4_end_, &motion_field_);
+  }
+  ResetLoopRestorationParams();
+  if (!top_context_.Resize(superblock_columns_)) {
+    LIBGAV1_DLOG(ERROR, "Allocation of top_context_ failed.");
+    return false;
+  }
+  return true;
+}
+
+template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+bool Tile::ProcessSuperBlockRow(int row4x4,
+                                TileScratchBuffer* const scratch_buffer) {
+  if (row4x4 < row4x4_start_ || row4x4 >= row4x4_end_) return true;
+  assert(scratch_buffer != nullptr);
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
+       column4x4 += block_width4x4) {
+    if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer,
+                           processing_mode)) {
+      LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
+                   row4x4, column4x4);
+      return false;
+    }
+  }
+  if (save_symbol_decoder_context && row4x4 + block_width4x4 >= row4x4_end_) {
+    SaveSymbolDecoderContext();
+  }
+  if (processing_mode == kProcessingModeDecodeOnly ||
+      processing_mode == kProcessingModeParseAndDecode) {
+    PopulateIntraPredictionBuffer(row4x4);
+  }
+  return true;
+}
+
+// Used in frame parallel mode. The symbol decoder context need not be saved
+// in this case since that was already done when parsing completed.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+// Used in non frame parallel mode.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+
+void Tile::SaveSymbolDecoderContext() {
+  if (frame_header_.enable_frame_end_update_cdf &&
+      number_ == frame_header_.tile_info.context_update_id) {
+    *saved_symbol_decoder_context_ = symbol_decoder_context_;
+  }
+}
+
+bool Tile::ParseAndDecode() {
+  if (split_parse_and_decode_) {
+    if (!ThreadedParseAndDecode()) return false;
+    SaveSymbolDecoderContext();
+    return true;
+  }
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  if (scratch_buffer == nullptr) {
+    pending_tiles_->Decrement(false);
+    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+    return false;
+  }
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+       row4x4 += block_width4x4) {
+    if (!ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+            row4x4, scratch_buffer.get())) {
+      pending_tiles_->Decrement(false);
+      return false;
+    }
+  }
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+  pending_tiles_->Decrement(true);
+  return true;
+}
+
+bool Tile::Parse() {
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  if (scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+    return false;
+  }
+  for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+       row4x4 += block_width4x4) {
+    if (!ProcessSuperBlockRow<kProcessingModeParseOnly, false>(
+            row4x4, scratch_buffer.get())) {
+      return false;
+    }
+  }
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+  SaveSymbolDecoderContext();
+  return true;
+}
+
+bool Tile::Decode(
+    std::mutex* const mutex, int* const superblock_row_progress,
+    std::condition_variable* const superblock_row_progress_condvar) {
+  const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+  const int block_width4x4_log2 =
+      sequence_header_.use_128x128_superblock ? 5 : 4;
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  if (scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+    return false;
+  }
+  for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2;
+       row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) {
+    if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+            row4x4, scratch_buffer.get())) {
+      return false;
+    }
+    if (post_filter_.DoDeblock()) {
+      // Apply vertical deblock filtering for all the columns in this tile
+      // except for the first 64 columns.
+      post_filter_.ApplyDeblockFilter(
+          kLoopFilterTypeVertical, row4x4,
+          column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_,
+          block_width4x4);
+      // If this is the first superblock row of the tile, then we cannot apply
+      // horizontal deblocking here since we don't know if the top row is
+      // available. So it will be done by the calling thread in that case.
+      if (row4x4 != row4x4_start_) {
+        // Apply horizontal deblock filtering for all the columns in this tile
+        // except for the first and the last 64 columns.
+        // Note about the last tile of each row: For the last tile,
+        // column4x4_end may not be a multiple of 16. In that case it is still
+        // okay to simply subtract 16 since ApplyDeblockFilter() will only do
+        // the filters in increments of 64 columns (or 32 columns for chroma
+        // with subsampling).
+        post_filter_.ApplyDeblockFilter(
+            kLoopFilterTypeHorizontal, row4x4,
+            column4x4_start_ + kNum4x4InLoopFilterUnit,
+            column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4);
+      }
+    }
+    bool notify;
+    {
+      std::unique_lock<std::mutex> lock(*mutex);
+      notify = ++superblock_row_progress[index] ==
+               frame_header_.tile_info.tile_columns;
+    }
+    if (notify) {
+      // We are done decoding this superblock row. Notify the post filtering
+      // thread.
+      superblock_row_progress_condvar[index].notify_one();
+    }
+  }
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+  return true;
+}
+
+bool Tile::ThreadedParseAndDecode() {
+  {
+    std::lock_guard<std::mutex> lock(threading_.mutex);
+    if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) {
+      pending_tiles_->Decrement(false);
+      LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed.");
+      return false;
+    }
+    // Account for the parsing job.
+    ++threading_.pending_jobs;
+  }
+
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+
+  // Begin parsing.
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  if (scratch_buffer == nullptr) {
+    pending_tiles_->Decrement(false);
+    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+    return false;
+  }
+  for (int row4x4 = row4x4_start_, row_index = 0; row4x4 < row4x4_end_;
+       row4x4 += block_width4x4, ++row_index) {
+    for (int column4x4 = column4x4_start_, column_index = 0;
+         column4x4 < column4x4_end_;
+         column4x4 += block_width4x4, ++column_index) {
+      if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+                             kProcessingModeParseOnly)) {
+        std::lock_guard<std::mutex> lock(threading_.mutex);
+        threading_.abort = true;
+        break;
+      }
+      std::unique_lock<std::mutex> lock(threading_.mutex);
+      if (threading_.abort) break;
+      threading_.sb_state[row_index][column_index] = kSuperBlockStateParsed;
+      // Schedule the decoding of this superblock if it is allowed.
+      if (CanDecode(row_index, column_index)) {
+        ++threading_.pending_jobs;
+        threading_.sb_state[row_index][column_index] =
+            kSuperBlockStateScheduled;
+        lock.unlock();
+        thread_pool_->Schedule(
+            [this, row_index, column_index, block_width4x4]() {
+              DecodeSuperBlock(row_index, column_index, block_width4x4);
+            });
+      }
+    }
+    std::lock_guard<std::mutex> lock(threading_.mutex);
+    if (threading_.abort) break;
+  }
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+
+  // We are done parsing. We can return here since the calling thread will make
+  // sure that it waits for all the superblocks to be decoded.
+  //
+  // Finish using |threading_| before |pending_tiles_->Decrement()| because the
+  // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
+  // is called.
+  threading_.mutex.lock();
+  const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+  const bool job_succeeded = !threading_.abort;
+  threading_.mutex.unlock();
+  if (no_pending_jobs) {
+    // We are done parsing and decoding this tile.
+    pending_tiles_->Decrement(job_succeeded);
+  }
+  return job_succeeded;
+}
+
+bool Tile::CanDecode(int row_index, int column_index) const {
+  assert(row_index >= 0);
+  assert(column_index >= 0);
+  // If |threading_.sb_state[row_index][column_index]| is not equal to
+  // kSuperBlockStateParsed, then return false. This is ok because if
+  // |threading_.sb_state[row_index][column_index]| is equal to:
+  //   kSuperBlockStateNone - then the superblock is not yet parsed.
+  //   kSuperBlockStateScheduled - then the superblock is already scheduled for
+  //                               decode.
+  //   kSuperBlockStateDecoded - then the superblock has already been decoded.
+  if (row_index >= superblock_rows_ || column_index >= superblock_columns_ ||
+      threading_.sb_state[row_index][column_index] != kSuperBlockStateParsed) {
+    return false;
+  }
+  // First superblock has no dependencies.
+  if (row_index == 0 && column_index == 0) {
+    return true;
+  }
+  // Superblocks in the first row depend only on the superblock to their
+  // left.
+  if (row_index == 0) {
+    return threading_.sb_state[0][column_index - 1] == kSuperBlockStateDecoded;
+  }
+  // All other superblocks depend on the superblock to their left (if one
+  // exists) and the superblock to the top-right with a lag of
+  // |intra_block_copy_lag_| (if one exists).
+  const int top_right_column_index =
+      std::min(column_index + intra_block_copy_lag_, superblock_columns_ - 1);
+  return threading_.sb_state[row_index - 1][top_right_column_index] ==
+             kSuperBlockStateDecoded &&
+         (column_index == 0 ||
+          threading_.sb_state[row_index][column_index - 1] ==
+              kSuperBlockStateDecoded);
+}
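+// Hedged example of the rule above: with intra_block_copy_lag_ == 5 and at
+// least six superblock columns, superblock (1, 0) becomes decodable once it
+// has been parsed and superblock (0, 5) has reached kSuperBlockStateDecoded;
+// the left-neighbor condition is vacuous because column_index == 0.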
+
+void Tile::DecodeSuperBlock(int row_index, int column_index,
+                            int block_width4x4) {
+  const int row4x4 = row4x4_start_ + (row_index * block_width4x4);
+  const int column4x4 = column4x4_start_ + (column_index * block_width4x4);
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  bool ok = scratch_buffer != nullptr;
+  if (ok) {
+    ok = ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+                           kProcessingModeDecodeOnly);
+    tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+  }
+  std::unique_lock<std::mutex> lock(threading_.mutex);
+  if (ok) {
+    threading_.sb_state[row_index][column_index] = kSuperBlockStateDecoded;
+    // Candidate rows and columns at which decoding could potentially begin
+    // (if it is allowed to do so). The candidates are:
+    //   1) The superblock to the bottom-left of the current superblock with a
+    //   lag of |intra_block_copy_lag_| (or the beginning of the next
+    //   superblock row if there are fewer than |intra_block_copy_lag_|
+    //   superblock columns in the Tile).
+    //   2) The superblock to the right of the current superblock.
+    const int candidate_row_indices[] = {row_index + 1, row_index};
+    const int candidate_column_indices[] = {
+        std::max(0, column_index - intra_block_copy_lag_), column_index + 1};
+    for (size_t i = 0; i < std::extent<decltype(candidate_row_indices)>::value;
+         ++i) {
+      const int candidate_row_index = candidate_row_indices[i];
+      const int candidate_column_index = candidate_column_indices[i];
+      if (!CanDecode(candidate_row_index, candidate_column_index)) {
+        continue;
+      }
+      ++threading_.pending_jobs;
+      threading_.sb_state[candidate_row_index][candidate_column_index] =
+          kSuperBlockStateScheduled;
+      lock.unlock();
+      thread_pool_->Schedule([this, candidate_row_index, candidate_column_index,
+                              block_width4x4]() {
+        DecodeSuperBlock(candidate_row_index, candidate_column_index,
+                         block_width4x4);
+      });
+      lock.lock();
+    }
+  } else {
+    threading_.abort = true;
+  }
+  // Finish using |threading_| before |pending_tiles_->Decrement()| because the
+  // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
+  // is called.
+  const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+  const bool job_succeeded = !threading_.abort;
+  lock.unlock();
+  if (no_pending_jobs) {
+    // We are done parsing and decoding this tile.
+    pending_tiles_->Decrement(job_succeeded);
+  }
+}
+
+void Tile::PopulateIntraPredictionBuffer(int row4x4) {
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) {
+    return;
+  }
+  const size_t pixel_size =
+      (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
+                                                   : sizeof(uint16_t));
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+    const int row_to_copy =
+        (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
+    const size_t pixels_to_copy =
+        (MultiplyBy4(column4x4_end_ - column4x4_start_) >>
+         subsampling_x_[plane]) *
+        pixel_size;
+    const size_t column_start =
+        MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
+    void* start;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (sequence_header_.color_config.bitdepth > 8) {
+      Array2DView<uint16_t> buffer(
+          buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+          reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+      start = &buffer[row_to_copy][column_start];
+    } else  // NOLINT
+#endif
+    {
+      start = &buffer_[plane][row_to_copy][column_start];
+    }
+    memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size,
+           start, pixels_to_copy);
+  }
+}
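+// Hedged arithmetic example for the copy above: with 64x64 superblocks
+// (block_width4x4 == 16), row4x4 == 0 and an unsubsampled plane, row_to_copy
+// == MultiplyBy4(0 + 16) - 1 == 63, i.e. the last pixel row of the
+// just-finished superblock row, saved for intra prediction in the row below.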
+
+int Tile::GetTransformAllZeroContext(const Block& block, Plane plane,
+                                     TransformSize tx_size, int x4, int y4,
+                                     int w4, int h4) {
+  const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+  const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+
+  const int tx_width = kTransformWidth[tx_size];
+  const int tx_height = kTransformHeight[tx_size];
+  const BlockSize plane_size = block.residual_size[plane];
+  const int block_width = kBlockWidthPixels[plane_size];
+  const int block_height = kBlockHeightPixels[plane_size];
+
+  int top = 0;
+  int left = 0;
+  const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+  const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+  if (plane == kPlaneY) {
+    if (block_width == tx_width && block_height == tx_height) return 0;
+    const uint8_t* coefficient_levels =
+        &coefficient_levels_[kEntropyContextTop][plane][x4];
+    for (int i = 0; i < num_top_elements; ++i) {
+      top = std::max(top, static_cast<int>(coefficient_levels[i]));
+    }
+    coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+    for (int i = 0; i < num_left_elements; ++i) {
+      left = std::max(left, static_cast<int>(coefficient_levels[i]));
+    }
+    assert(top <= 4);
+    assert(left <= 4);
+    // kAllZeroContextsByTopLeft is pre-computed based on the logic in the spec
+    // for top and left.
+    return kAllZeroContextsByTopLeft[top][left];
+  }
+  const uint8_t* coefficient_levels =
+      &coefficient_levels_[kEntropyContextTop][plane][x4];
+  const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+  for (int i = 0; i < num_top_elements; ++i) {
+    top |= coefficient_levels[i];
+    top |= dc_categories[i];
+  }
+  coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+  dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+  for (int i = 0; i < num_left_elements; ++i) {
+    left |= coefficient_levels[i];
+    left |= dc_categories[i];
+  }
+  return static_cast<int>(top != 0) + static_cast<int>(left != 0) + 7 +
+         3 * static_cast<int>(block_width * block_height >
+                              tx_width * tx_height);
+}
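+// Hedged example of the chroma path above: if no top or left neighbor
+// carries a non-zero coefficient level or DC category and the transform
+// covers the whole block, the returned context is 0 + 0 + 7 + 3 * 0 == 7.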
+
+TransformSet Tile::GetTransformSet(TransformSize tx_size, bool is_inter) const {
+  const TransformSize tx_size_square_min = kTransformSizeSquareMin[tx_size];
+  const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+  if (tx_size_square_max == kTransformSize64x64) return kTransformSetDctOnly;
+  if (is_inter) {
+    if (frame_header_.reduced_tx_set ||
+        tx_size_square_max == kTransformSize32x32) {
+      return kTransformSetInter3;
+    }
+    if (tx_size_square_min == kTransformSize16x16) return kTransformSetInter2;
+    return kTransformSetInter1;
+  }
+  if (tx_size_square_max == kTransformSize32x32) return kTransformSetDctOnly;
+  if (frame_header_.reduced_tx_set ||
+      tx_size_square_min == kTransformSize16x16) {
+    return kTransformSetIntra2;
+  }
+  return kTransformSetIntra1;
+}
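+// Hedged example: an inter 4x4 transform with frame_header_.reduced_tx_set
+// false has square min and max sizes of 4x4, so the function falls through
+// to kTransformSetInter1, whose 16 transform types are read with
+// ReadSymbol<16> in ReadTransformType() below.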
+
+TransformType Tile::ComputeTransformType(const Block& block, Plane plane,
+                                         TransformSize tx_size, int block_x,
+                                         int block_y) {
+  const BlockParameters& bp = *block.bp;
+  const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+  if (frame_header_.segmentation
+          .lossless[bp.prediction_parameters->segment_id] ||
+      tx_size_square_max == kTransformSize64x64) {
+    return kTransformTypeDctDct;
+  }
+  if (plane == kPlaneY) {
+    return transform_types_[block_y - block.row4x4][block_x - block.column4x4];
+  }
+  const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+  TransformType tx_type;
+  if (bp.is_inter) {
+    const int x4 =
+        std::max(block.column4x4, block_x << subsampling_x_[kPlaneU]);
+    const int y4 = std::max(block.row4x4, block_y << subsampling_y_[kPlaneU]);
+    tx_type = transform_types_[y4 - block.row4x4][x4 - block.column4x4];
+  } else {
+    tx_type = kModeToTransformType[bp.prediction_parameters->uv_mode];
+  }
+  return kTransformTypeInSetMask[tx_set].Contains(tx_type)
+             ? tx_type
+             : kTransformTypeDctDct;
+}
+
+void Tile::ReadTransformType(const Block& block, int x4, int y4,
+                             TransformSize tx_size) {
+  BlockParameters& bp = *block.bp;
+  const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+
+  TransformType tx_type = kTransformTypeDctDct;
+  if (tx_set != kTransformSetDctOnly &&
+      frame_header_.segmentation.qindex[bp.prediction_parameters->segment_id] >
+          0) {
+    const int cdf_index = SymbolDecoderContext::TxTypeIndex(tx_set);
+    const int cdf_tx_size_index =
+        TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[tx_size]);
+    uint16_t* cdf;
+    if (bp.is_inter) {
+      cdf = symbol_decoder_context_
+                .inter_tx_type_cdf[cdf_index][cdf_tx_size_index];
+      switch (tx_set) {
+        case kTransformSetInter1:
+          tx_type = static_cast<TransformType>(reader_.ReadSymbol<16>(cdf));
+          break;
+        case kTransformSetInter2:
+          tx_type = static_cast<TransformType>(reader_.ReadSymbol<12>(cdf));
+          break;
+        default:
+          assert(tx_set == kTransformSetInter3);
+          tx_type = static_cast<TransformType>(reader_.ReadSymbol(cdf));
+          break;
+      }
+    } else {
+      const PredictionMode intra_direction =
+          block.bp->prediction_parameters->use_filter_intra
+              ? kFilterIntraModeToIntraPredictor[block.bp->prediction_parameters
+                                                     ->filter_intra_mode]
+              : bp.y_mode;
+      cdf =
+          symbol_decoder_context_
+              .intra_tx_type_cdf[cdf_index][cdf_tx_size_index][intra_direction];
+      assert(tx_set == kTransformSetIntra1 || tx_set == kTransformSetIntra2);
+      tx_type = static_cast<TransformType>((tx_set == kTransformSetIntra1)
+                                               ? reader_.ReadSymbol<7>(cdf)
+                                               : reader_.ReadSymbol<5>(cdf));
+    }
+
+    // This array does not contain an entry for kTransformSetDctOnly, so the
+    // first dimension needs to be offset by 1.
+    tx_type = kInverseTransformTypeBySet[tx_set - 1][tx_type];
+  }
+  SetTransformType(block, x4, y4, kTransformWidth4x4[tx_size],
+                   kTransformHeight4x4[tx_size], tx_type, transform_types_);
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the two right neighbors and the
+// one bottom-right neighbor may be out of boundary. We don't check the right
+// boundary for them, because the out of boundary neighbors project to positions
+// above the diagonal line which goes through the current coefficient and these
+// positions are still all 0s according to the diagonal scan order.
+template <typename ResidualType>
+void Tile::ReadCoeffBase2D(
+    const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+    int eob,
+    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                 [kCoeffBaseRangeSymbolCount + 1],
+    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+  const int tx_width = 1 << adjusted_tx_width_log2;
+  for (int i = eob - 2; i >= 1; --i) {
+    const uint16_t pos = scan[i];
+    const int row = pos >> adjusted_tx_width_log2;
+    const int column = pos & (tx_width - 1);
+    auto* const quantized = &quantized_buffer[pos];
+    auto* const levels = &level_buffer[pos];
+    const int neighbor_sum = 1 + levels[1] +                    // {0, 1}
+                             levels[tx_width] +                  // {1, 0}
+                             levels[tx_width + 1] +              // {1, 1}
+                             levels[2] +                         // {0, 2}
+                             levels[MultiplyBy2(tx_width)];      // {2, 0}
+    const int context =
+        ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+        kCoeffBaseContextOffset[tx_size][std::min(row, 4)][std::min(column, 4)];
+    int level =
+        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    levels[0] = level;
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output greater than 6.
+      int context = std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
+                                          quantized[tx_width] +       // {1, 0}
+                                          quantized[tx_width + 1]));  // {1, 1}
+      context += 14 >> static_cast<int>((row | column) < 2);
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+    }
+    quantized[0] = level;
+  }
+  // Read position 0.
+  {
+    auto* const quantized = &quantized_buffer[0];
+    int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[0]);
+    level_buffer[0] = level;
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output greater than 6.
+      const int context =
+          std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
+                                quantized[tx_width] +       // {1, 0}
+                                quantized[tx_width + 1]));  // {1, 1}
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+    }
+    quantized[0] = level;
+  }
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the four right neighbors may be
+// out of boundary. We don't do the boundary check for the first three right
+// neighbors, because even for the transform blocks with smallest width 4, the
+// first three out of boundary neighbors project to positions left of the
+// current coefficient and these positions are still all 0s according to the
+// column scan order. However, when the transform block width is 4 and the
+// current coefficient is on the right boundary, its fourth right neighbor
+// projects to the position directly below it in the same column, which could
+// be nonzero. Therefore, we must skip the fourth right neighbor. To keep it
+// simple, we always do the boundary check for the fourth right neighbor of
+// any coefficient.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseHorizontal(
+    const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+    int eob,
+    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                 [kCoeffBaseRangeSymbolCount + 1],
+    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+  const int tx_width = 1 << adjusted_tx_width_log2;
+  int i = eob - 2;
+  do {
+    const uint16_t pos = scan[i];
+    const int column = pos & (tx_width - 1);
+    auto* const quantized = &quantized_buffer[pos];
+    auto* const levels = &level_buffer[pos];
+    const int neighbor_sum =
+        1 + (levels[1] +                                  // {0, 1}
+             levels[tx_width] +                           // {1, 0}
+             levels[2] +                                  // {0, 2}
+             levels[3] +                                  // {0, 3}
+             ((column + 4 < tx_width) ? levels[4] : 0));  // {0, 4}
+    const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+                        kCoeffBasePositionContextOffset[column];
+    int level =
+        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    levels[0] = level;
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output greater than 6.
+      int context = std::min(6, DivideBy2(1 + quantized[1] +     // {0, 1}
+                                          quantized[tx_width] +  // {1, 0}
+                                          quantized[2]));        // {0, 2}
+      if (pos != 0) {
+        context += 14 >> static_cast<int>(column == 0);
+      }
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+    }
+    quantized[0] = level;
+  } while (--i >= 0);
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// Right boundary check is performed explicitly.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseVertical(
+    const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+    int eob,
+    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                 [kCoeffBaseRangeSymbolCount + 1],
+    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+  const int tx_width = 1 << adjusted_tx_width_log2;
+  int i = eob - 2;
+  do {
+    const uint16_t pos = scan[i];
+    const int row = pos >> adjusted_tx_width_log2;
+    const int column = pos & (tx_width - 1);
+    auto* const quantized = &quantized_buffer[pos];
+    auto* const levels = &level_buffer[pos];
+    const int neighbor_sum =
+        1 + (((column + 1 < tx_width) ? levels[1] : 0) +  // {0, 1}
+             levels[tx_width] +                           // {1, 0}
+             levels[MultiplyBy2(tx_width)] +              // {2, 0}
+             levels[tx_width * 3] +                       // {3, 0}
+             levels[MultiplyBy4(tx_width)]);              // {4, 0}
+    const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+                        kCoeffBasePositionContextOffset[row];
+    int level =
+        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    levels[0] = level;
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output greater than 6.
+      const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
+      int context =
+          std::min(6, DivideBy2(1 + quantized_column1 +              // {0, 1}
+                                quantized[tx_width] +                // {1, 0}
+                                quantized[MultiplyBy2(tx_width)]));  // {2, 0}
+      if (pos != 0) {
+        context += 14 >> static_cast<int>(row == 0);
+      }
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+    }
+    quantized[0] = level;
+  } while (--i >= 0);
+}
+
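+// Computes the context for decoding the DC sign by summing the DC sign
+// categories of the top and left neighboring blocks: 0 when they cancel out,
+// 1 when negative signs dominate, and 2 when positive signs dominate.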
+int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) {
+  const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+  const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+  // Accumulate into an 8-bit dc_sign so that std::accumulate() avoids sign
+  // extension.
+  int8_t dc_sign = std::accumulate(
+      dc_categories, dc_categories + GetNumElements(w4, x4, max_x4x4), 0);
+  const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+  dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+  dc_sign = std::accumulate(
+      dc_categories, dc_categories + GetNumElements(h4, y4, max_y4x4), dc_sign);
+  // This return statement is equivalent to:
+  //   if (dc_sign < 0) return 1;
+  //   if (dc_sign > 0) return 2;
+  //   return 0;
+  // And it is better than:
+  //   return static_cast<int>(dc_sign != 0) + static_cast<int>(dc_sign > 0);
+  return static_cast<int>(dc_sign < 0) +
+         MultiplyBy2(static_cast<int>(dc_sign > 0));
+}
+
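+// Stores |coefficient_level| and |dc_category| into the top and left entropy
+// context arrays covering this transform block (clamped to the frame
+// boundary) for use as contexts by subsequent blocks.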
+void Tile::SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+                              uint8_t coefficient_level, int8_t dc_category) {
+  const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+  const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+  memset(&coefficient_levels_[kEntropyContextTop][plane][x4], coefficient_level,
+         num_top_elements);
+  memset(&dc_categories_[kEntropyContextTop][plane][x4], dc_category,
+         num_top_elements);
+  const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+  const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+  memset(&coefficient_levels_[kEntropyContextLeft][plane][y4],
+         coefficient_level, num_left_elements);
+  memset(&dc_categories_[kEntropyContextLeft][plane][y4], dc_category,
+         num_left_elements);
+}
+
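+// Reads the sign and, for large levels, the golomb-coded remainder of one
+// coefficient, then dequantizes it (section 7.12.3 in the spec), applies the
+// sign with clipping, and stores the result in |residual_buffer|. Zero-level
+// coefficients are left untouched. Returns false if the golomb length is
+// invalid.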
+template <typename ResidualType, bool is_dc_coefficient>
+bool Tile::ReadSignAndApplyDequantization(
+    const uint16_t* const scan, int i, int q_value,
+    const uint8_t* const quantizer_matrix, int shift, int max_value,
+    uint16_t* const dc_sign_cdf, int8_t* const dc_category,
+    int* const coefficient_level, ResidualType* residual_buffer) {
+  const int pos = is_dc_coefficient ? 0 : scan[i];
+  // If residual_buffer[pos] is zero, then the rest of the function has no
+  // effect.
+  int level = residual_buffer[pos];
+  if (level == 0) return true;
+  const int sign = is_dc_coefficient
+                       ? static_cast<int>(reader_.ReadSymbol(dc_sign_cdf))
+                       : reader_.ReadBit();
+  if (level > kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) {
+    int length = 0;
+    bool golomb_length_bit = false;
+    do {
+      golomb_length_bit = reader_.ReadBit() != 0;
+      ++length;
+      if (length > 20) {
+        LIBGAV1_DLOG(ERROR, "Invalid golomb_length %d", length);
+        return false;
+      }
+    } while (!golomb_length_bit);
+    int x = 1;
+    for (int i = length - 2; i >= 0; --i) {
+      x = (x << 1) | reader_.ReadBit();
+    }
+    level += x - 1;
+  }
+  if (is_dc_coefficient) {
+    *dc_category = (sign != 0) ? -1 : 1;
+  }
+  level &= 0xfffff;
+  *coefficient_level += level;
+  // Apply dequantization. Step 1 of section 7.12.3 in the spec.
+  int q = q_value;
+  if (quantizer_matrix != nullptr) {
+    q = RightShiftWithRounding(q * quantizer_matrix[pos], 5);
+  }
+  // The intermediate multiplication can exceed 32 bits, so it has to be
+  // performed by promoting one of the values to int64_t.
+  int32_t dequantized_value = (static_cast<int64_t>(q) * level) & 0xffffff;
+  dequantized_value >>= shift;
+  // At this point:
+  //   * |dequantized_value| is always non-negative.
+  //   * |sign| can be either 0 or 1.
+  //   * min_value = -(max_value + 1).
+  // We need to apply the following:
+  // dequantized_value = sign ? -dequantized_value : dequantized_value;
+  // dequantized_value = Clip3(dequantized_value, min_value, max_value);
+  //
+  // Note that -x == ~(x - 1).
+  //
+  // Now, the above two lines can be done with a std::min and xor as follows:
+  dequantized_value = std::min(dequantized_value - sign, max_value) ^ -sign;
+  residual_buffer[pos] = dequantized_value;
+  return true;
+}
+
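+// Reads the coeff_br symbols of one coefficient and returns their sum. Each
+// symbol contributes at most kCoeffBaseRangeSymbolCount - 1; decoding stops
+// early when a symbol is below that maximum, and at most
+// kCoeffBaseRangeMaxIterations symbols are read.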
+int Tile::ReadCoeffBaseRange(uint16_t* cdf) {
+  int level = 0;
+  for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
+    const int coeff_base_range =
+        reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
+    level += coeff_base_range;
+    if (coeff_base_range < (kCoeffBaseRangeSymbolCount - 1)) break;
+  }
+  return level;
+}
+
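+// Reads all the quantized coefficients of one transform block (corresponds to
+// the coeffs() syntax in the spec): the all_zero flag, the transform type,
+// the end-of-block position, the base levels and ranges, and finally the
+// signs with dequantization. Returns the number of coefficients read (eob),
+// 0 if the block is all zero, or -1 on failure.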
+template <typename ResidualType>
+int Tile::ReadTransformCoefficients(const Block& block, Plane plane,
+                                    int start_x, int start_y,
+                                    TransformSize tx_size,
+                                    TransformType* const tx_type) {
+  const int x4 = DivideBy4(start_x);
+  const int y4 = DivideBy4(start_y);
+  const int w4 = kTransformWidth4x4[tx_size];
+  const int h4 = kTransformHeight4x4[tx_size];
+  const int tx_size_context = kTransformSizeContext[tx_size];
+  int context =
+      GetTransformAllZeroContext(block, plane, tx_size, x4, y4, w4, h4);
+  const bool all_zero = reader_.ReadSymbol(
+      symbol_decoder_context_.all_zero_cdf[tx_size_context][context]);
+  if (all_zero) {
+    if (plane == kPlaneY) {
+      SetTransformType(block, x4, y4, w4, h4, kTransformTypeDctDct,
+                       transform_types_);
+    }
+    SetEntropyContexts(x4, y4, w4, h4, plane, 0, 0);
+    // This is not used in this case, so it can be set to any value.
+    *tx_type = kNumTransformTypes;
+    return 0;
+  }
+  const int tx_width = kTransformWidth[tx_size];
+  const int tx_height = kTransformHeight[tx_size];
+  const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+  const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+  const int tx_padding =
+      (1 << adjusted_tx_width_log2) * kResidualPaddingVertical;
+  auto* residual = reinterpret_cast<ResidualType*>(*block.residual);
+  // Clear padding to avoid bottom boundary checks when parsing quantized
+  // coefficients.
+  memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
+  uint8_t level_buffer[(32 + kResidualPaddingVertical) * 32];
+  memset(
+      level_buffer, 0,
+      kTransformWidth[adjusted_tx_size] * kTransformHeight[adjusted_tx_size] +
+          tx_padding);
+  const int clamped_tx_height = std::min(tx_height, 32);
+  if (plane == kPlaneY) {
+    ReadTransformType(block, x4, y4, tx_size);
+  }
+  BlockParameters& bp = *block.bp;
+  *tx_type = ComputeTransformType(block, plane, tx_size, x4, y4);
+  const int eob_multi_size = kEobMultiSizeLookup[tx_size];
+  const PlaneType plane_type = GetPlaneType(plane);
+  const TransformClass tx_class = GetTransformClass(*tx_type);
+  context = static_cast<int>(tx_class != kTransformClass2D);
+  int eob_pt = 1;
+  switch (eob_multi_size) {
+    case 0:
+      eob_pt += reader_.ReadSymbol<kEobPt16SymbolCount>(
+          symbol_decoder_context_.eob_pt_16_cdf[plane_type][context]);
+      break;
+    case 1:
+      eob_pt += reader_.ReadSymbol<kEobPt32SymbolCount>(
+          symbol_decoder_context_.eob_pt_32_cdf[plane_type][context]);
+      break;
+    case 2:
+      eob_pt += reader_.ReadSymbol<kEobPt64SymbolCount>(
+          symbol_decoder_context_.eob_pt_64_cdf[plane_type][context]);
+      break;
+    case 3:
+      eob_pt += reader_.ReadSymbol<kEobPt128SymbolCount>(
+          symbol_decoder_context_.eob_pt_128_cdf[plane_type][context]);
+      break;
+    case 4:
+      eob_pt += reader_.ReadSymbol<kEobPt256SymbolCount>(
+          symbol_decoder_context_.eob_pt_256_cdf[plane_type][context]);
+      break;
+    case 5:
+      eob_pt += reader_.ReadSymbol<kEobPt512SymbolCount>(
+          symbol_decoder_context_.eob_pt_512_cdf[plane_type]);
+      break;
+    case 6:
+    default:
+      eob_pt += reader_.ReadSymbol<kEobPt1024SymbolCount>(
+          symbol_decoder_context_.eob_pt_1024_cdf[plane_type]);
+      break;
+  }
+  int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
+  if (eob_pt >= 3) {
+    context = eob_pt - 3;
+    const bool eob_extra = reader_.ReadSymbol(
+        symbol_decoder_context_
+            .eob_extra_cdf[tx_size_context][plane_type][context]);
+    if (eob_extra) eob += 1 << (eob_pt - 3);
+    for (int i = 1; i < eob_pt - 2; ++i) {
+      assert(eob_pt - i >= 3);
+      assert(eob_pt <= kEobPt1024SymbolCount);
+      if (reader_.ReadBit() != 0) {
+        eob += 1 << (eob_pt - i - 3);
+      }
+    }
+  }
+  const uint16_t* scan = kScan[tx_class][tx_size];
+  const int clamped_tx_size_context = std::min(tx_size_context, 3);
+  auto coeff_base_range_cdf =
+      symbol_decoder_context_
+          .coeff_base_range_cdf[clamped_tx_size_context][plane_type];
+  // Read the last coefficient.
+  {
+    context = GetCoeffBaseContextEob(tx_size, eob - 1);
+    const uint16_t pos = scan[eob - 1];
+    int level =
+        1 + reader_.ReadSymbol<kCoeffBaseEobSymbolCount>(
+                symbol_decoder_context_
+                    .coeff_base_eob_cdf[tx_size_context][plane_type][context]);
+    level_buffer[pos] = level;
+    if (level > kNumQuantizerBaseLevels) {
+      level +=
+          ReadCoeffBaseRange(coeff_base_range_cdf[GetCoeffBaseRangeContextEob(
+              adjusted_tx_width_log2, pos, tx_class)]);
+    }
+    residual[pos] = level;
+  }
+  if (eob > 1) {
+    // Read all the other coefficients.
+    // Lookup used to call the right variant of ReadCoeffBase*() based on the
+    // transform class.
+    static constexpr void (Tile::*kGetCoeffBaseFunc[])(
+        const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+        int eob,
+        uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+        uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                     [kCoeffBaseRangeSymbolCount + 1],
+        ResidualType* quantized_buffer,
+        uint8_t* level_buffer) = {&Tile::ReadCoeffBase2D<ResidualType>,
+                                  &Tile::ReadCoeffBaseHorizontal<ResidualType>,
+                                  &Tile::ReadCoeffBaseVertical<ResidualType>};
+    (this->*kGetCoeffBaseFunc[tx_class])(
+        scan, tx_size, adjusted_tx_width_log2, eob,
+        symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
+        coeff_base_range_cdf, residual, level_buffer);
+  }
+  const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
+  const int current_quantizer_index =
+      GetQIndex(frame_header_.segmentation,
+                bp.prediction_parameters->segment_id, current_quantizer_index_);
+  const int dc_q_value = quantizer_.GetDcValue(plane, current_quantizer_index);
+  const int ac_q_value = quantizer_.GetAcValue(plane, current_quantizer_index);
+  const int shift = kQuantizationShift[tx_size];
+  const uint8_t* const quantizer_matrix =
+      (frame_header_.quantizer.use_matrix &&
+       *tx_type < kTransformTypeIdentityIdentity &&
+       !frame_header_.segmentation
+            .lossless[bp.prediction_parameters->segment_id] &&
+       frame_header_.quantizer.matrix_level[plane] < 15)
+          ? quantizer_matrix_[frame_header_.quantizer.matrix_level[plane]]
+                             [plane_type][adjusted_tx_size]
+                                 .get()
+          : nullptr;
+  int coefficient_level = 0;
+  int8_t dc_category = 0;
+  uint16_t* const dc_sign_cdf =
+      (residual[0] != 0)
+          ? symbol_decoder_context_.dc_sign_cdf[plane_type][GetDcSignContext(
+                x4, y4, w4, h4, plane)]
+          : nullptr;
+  assert(scan[0] == 0);
+  if (!ReadSignAndApplyDequantization<ResidualType, /*is_dc_coefficient=*/true>(
+          scan, 0, dc_q_value, quantizer_matrix, shift, max_value, dc_sign_cdf,
+          &dc_category, &coefficient_level, residual)) {
+    return -1;
+  }
+  if (eob > 1) {
+    int i = 1;
+    do {
+      if (!ReadSignAndApplyDequantization<ResidualType,
+                                          /*is_dc_coefficient=*/false>(
+              scan, i, ac_q_value, quantizer_matrix, shift, max_value, nullptr,
+              nullptr, &coefficient_level, residual)) {
+        return -1;
+      }
+    } while (++i < eob);
+    MoveCoefficientsForTxWidth64(clamped_tx_height, tx_width, residual);
+  }
+  SetEntropyContexts(x4, y4, w4, h4, plane, std::min(4, coefficient_level),
+                     dc_category);
+  if (split_parse_and_decode_) {
+    *block.residual += tx_width * tx_height * residual_size_;
+  }
+  return eob;
+}
+
+// CALL_BITDEPTH_FUNCTION is a macro that calls the appropriate template
+// |function| depending on the value of |sequence_header_.color_config.bitdepth|
+// with the variadic arguments.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+#define CALL_BITDEPTH_FUNCTION(function, ...)         \
+  do {                                                \
+    if (sequence_header_.color_config.bitdepth > 8) { \
+      function<uint16_t>(__VA_ARGS__);                \
+    } else {                                          \
+      function<uint8_t>(__VA_ARGS__);                 \
+    }                                                 \
+  } while (false)
+#else
+#define CALL_BITDEPTH_FUNCTION(function, ...) \
+  do {                                        \
+    function<uint8_t>(__VA_ARGS__);           \
+  } while (false)
+#endif
+
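+// Processes one transform block: performs the palette or intra prediction
+// when decoding, parses and/or reconstructs the residual depending on |mode|,
+// and marks the covered area as decoded in the scratch buffer.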
+bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
+                          int base_y, TransformSize tx_size, int x, int y,
+                          ProcessingMode mode) {
+  BlockParameters& bp = *block.bp;
+  const int subsampling_x = subsampling_x_[plane];
+  const int subsampling_y = subsampling_y_[plane];
+  const int start_x = base_x + MultiplyBy4(x);
+  const int start_y = base_y + MultiplyBy4(y);
+  const int max_x = MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
+  const int max_y = MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
+  if (start_x >= max_x || start_y >= max_y) return true;
+  const int row = DivideBy4(start_y << subsampling_y);
+  const int column = DivideBy4(start_x << subsampling_x);
+  const int mask = sequence_header_.use_128x128_superblock ? 31 : 15;
+  const int sub_block_row4x4 = row & mask;
+  const int sub_block_column4x4 = column & mask;
+  const int step_x = kTransformWidth4x4[tx_size];
+  const int step_y = kTransformHeight4x4[tx_size];
+  const bool do_decode = mode == kProcessingModeDecodeOnly ||
+                         mode == kProcessingModeParseAndDecode;
+  if (do_decode && !bp.is_inter) {
+    if (bp.prediction_parameters->palette_mode_info.size[GetPlaneType(plane)] >
+        0) {
+      CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane, start_x, start_y,
+                             x, y, tx_size);
+    } else {
+      const PredictionMode mode =
+          (plane == kPlaneY) ? bp.y_mode
+                             : (bp.prediction_parameters->uv_mode ==
+                                        kPredictionModeChromaFromLuma
+                                    ? kPredictionModeDc
+                                    : bp.prediction_parameters->uv_mode);
+      const int tr_row4x4 = (sub_block_row4x4 >> subsampling_y);
+      const int tr_column4x4 =
+          (sub_block_column4x4 >> subsampling_x) + step_x + 1;
+      const int bl_row4x4 = (sub_block_row4x4 >> subsampling_y) + step_y + 1;
+      const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x);
+      const bool has_left = x > 0 || block.left_available[plane];
+      const bool has_top = y > 0 || block.top_available[plane];
+
+      CALL_BITDEPTH_FUNCTION(
+          IntraPrediction, block, plane, start_x, start_y, has_left, has_top,
+          block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+          block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+          mode, tx_size);
+      if (plane != kPlaneY &&
+          bp.prediction_parameters->uv_mode == kPredictionModeChromaFromLuma) {
+        CALL_BITDEPTH_FUNCTION(ChromaFromLumaPrediction, block, plane, start_x,
+                               start_y, tx_size);
+      }
+    }
+    if (plane == kPlaneY) {
+      block.bp->prediction_parameters->max_luma_width =
+          start_x + MultiplyBy4(step_x);
+      block.bp->prediction_parameters->max_luma_height =
+          start_y + MultiplyBy4(step_y);
+      block.scratch_buffer->cfl_luma_buffer_valid = false;
+    }
+  }
+  if (!bp.skip) {
+    const int sb_row_index = SuperBlockRowIndex(block.row4x4);
+    const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
+    if (mode == kProcessingModeDecodeOnly) {
+      Queue<TransformParameters>& tx_params =
+          *residual_buffer_threaded_[sb_row_index][sb_column_index]
+               ->transform_parameters();
+      ReconstructBlock(block, plane, start_x, start_y, tx_size,
+                       tx_params.Front().type,
+                       tx_params.Front().non_zero_coeff_count);
+      tx_params.Pop();
+    } else {
+      TransformType tx_type;
+      int non_zero_coeff_count;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      if (sequence_header_.color_config.bitdepth > 8) {
+        non_zero_coeff_count = ReadTransformCoefficients<int32_t>(
+            block, plane, start_x, start_y, tx_size, &tx_type);
+      } else  // NOLINT
+#endif
+      {
+        non_zero_coeff_count = ReadTransformCoefficients<int16_t>(
+            block, plane, start_x, start_y, tx_size, &tx_type);
+      }
+      if (non_zero_coeff_count < 0) return false;
+      if (mode == kProcessingModeParseAndDecode) {
+        ReconstructBlock(block, plane, start_x, start_y, tx_size, tx_type,
+                         non_zero_coeff_count);
+      } else {
+        assert(mode == kProcessingModeParseOnly);
+        residual_buffer_threaded_[sb_row_index][sb_column_index]
+            ->transform_parameters()
+            ->Push(TransformParameters(tx_type, non_zero_coeff_count));
+      }
+    }
+  }
+  if (do_decode) {
+    bool* block_decoded =
+        &block.scratch_buffer
+             ->block_decoded[plane][(sub_block_row4x4 >> subsampling_y) + 1]
+                            [(sub_block_column4x4 >> subsampling_x) + 1];
+    SetBlockValues<bool>(step_y, step_x, true, block_decoded,
+                         TileScratchBuffer::kBlockDecodedStride);
+  }
+  return true;
+}
+
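+// Iteratively walks the transform tree of an inter Y block (a DFS using an
+// explicit stack), splitting each node until it fits within the transform
+// size recorded in |inter_transform_sizes_|, and calls TransformBlock() on
+// the leaves.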
+bool Tile::TransformTree(const Block& block, int start_x, int start_y,
+                         BlockSize plane_size, ProcessingMode mode) {
+  assert(plane_size <= kBlock64x64);
+  // Branching factor is 4; Maximum Depth is 4; So the maximum stack size
+  // required is (4 - 1) * 4 + 1 = 13.
+  Stack<TransformTreeNode, 13> stack;
+  // It is okay to cast BlockSize to TransformSize here since the enum are
+  // equivalent for all BlockSize values <= kBlock64x64.
+  stack.Push(TransformTreeNode(start_x, start_y,
+                               static_cast<TransformSize>(plane_size)));
+
+  do {
+    TransformTreeNode node = stack.Pop();
+    const int row = DivideBy4(node.y);
+    const int column = DivideBy4(node.x);
+    if (row >= frame_header_.rows4x4 || column >= frame_header_.columns4x4) {
+      continue;
+    }
+    const TransformSize inter_tx_size = inter_transform_sizes_[row][column];
+    const int width = kTransformWidth[node.tx_size];
+    const int height = kTransformHeight[node.tx_size];
+    if (width <= kTransformWidth[inter_tx_size] &&
+        height <= kTransformHeight[inter_tx_size]) {
+      if (!TransformBlock(block, kPlaneY, node.x, node.y, node.tx_size, 0, 0,
+                          mode)) {
+        return false;
+      }
+      continue;
+    }
+    // The split transform size look up gives the right transform size that we
+    // should push in the stack.
+    //   if (width > height) => transform size whose width is half.
+    //   if (width < height) => transform size whose height is half.
+    //   if (width == height) => transform size whose width and height are half.
+    const TransformSize split_tx_size = kSplitTransformSize[node.tx_size];
+    const int half_width = DivideBy2(width);
+    if (width > height) {
+      stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+      stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+      continue;
+    }
+    const int half_height = DivideBy2(height);
+    if (width < height) {
+      stack.Push(
+          TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+      stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+      continue;
+    }
+    stack.Push(TransformTreeNode(node.x + half_width, node.y + half_height,
+                                 split_tx_size));
+    stack.Push(TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+    stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+    stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+  } while (!stack.Empty());
+  return true;
+}
+
+void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
+                            int start_y, TransformSize tx_size,
+                            TransformType tx_type, int non_zero_coeff_count) {
+  // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec.
+  assert(non_zero_coeff_count >= 0);
+  if (non_zero_coeff_count == 0) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (sequence_header_.color_config.bitdepth > 8) {
+    Array2DView<uint16_t> buffer(
+        buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+        reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+    Reconstruct(dsp_, tx_type, tx_size,
+                frame_header_.segmentation
+                    .lossless[block.bp->prediction_parameters->segment_id],
+                reinterpret_cast<int32_t*>(*block.residual), start_x, start_y,
+                &buffer, non_zero_coeff_count);
+  } else  // NOLINT
+#endif
+  {
+    Reconstruct(dsp_, tx_type, tx_size,
+                frame_header_.segmentation
+                    .lossless[block.bp->prediction_parameters->segment_id],
+                reinterpret_cast<int16_t*>(*block.residual), start_x, start_y,
+                &buffer_[plane], non_zero_coeff_count);
+  }
+  if (split_parse_and_decode_) {
+    *block.residual +=
+        kTransformWidth[tx_size] * kTransformHeight[tx_size] * residual_size_;
+  }
+}
+
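+// Parses and/or decodes the residual of one block (corresponds to the
+// residual() syntax in the spec), iterating over at most 64x64 luma chunks
+// and, within each chunk, over the transform blocks of every plane.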
+bool Tile::Residual(const Block& block, ProcessingMode mode) {
+  const int width_chunks = std::max(1, block.width >> 6);
+  const int height_chunks = std::max(1, block.height >> 6);
+  const BlockSize size_chunk4x4 =
+      (width_chunks > 1 || height_chunks > 1) ? kBlock64x64 : block.size;
+  const BlockParameters& bp = *block.bp;
+  for (int chunk_y = 0; chunk_y < height_chunks; ++chunk_y) {
+    for (int chunk_x = 0; chunk_x < width_chunks; ++chunk_x) {
+      const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+      int plane = kPlaneY;
+      do {
+        const int subsampling_x = subsampling_x_[plane];
+        const int subsampling_y = subsampling_y_[plane];
+        // For the Y plane, the transform size recorded for this block in
+        // |inter_transform_sizes_| is used directly; when lossless is true it
+        // is always kTransformSize4x4 (part of Section 5.11.37 in the spec).
+        const TransformSize tx_size =
+            (plane == kPlaneY)
+                ? inter_transform_sizes_[block.row4x4][block.column4x4]
+                : bp.uv_transform_size;
+        const BlockSize plane_size =
+            kPlaneResidualSize[size_chunk4x4][subsampling_x][subsampling_y];
+        assert(plane_size != kBlockInvalid);
+        if (bp.is_inter &&
+            !frame_header_.segmentation
+                 .lossless[bp.prediction_parameters->segment_id] &&
+            plane == kPlaneY) {
+          const int row_chunk4x4 = block.row4x4 + MultiplyBy16(chunk_y);
+          const int column_chunk4x4 = block.column4x4 + MultiplyBy16(chunk_x);
+          const int base_x = MultiplyBy4(column_chunk4x4 >> subsampling_x);
+          const int base_y = MultiplyBy4(row_chunk4x4 >> subsampling_y);
+          if (!TransformTree(block, base_x, base_y, plane_size, mode)) {
+            return false;
+          }
+        } else {
+          const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+          const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+          const int step_x = kTransformWidth4x4[tx_size];
+          const int step_y = kTransformHeight4x4[tx_size];
+          const int num4x4_wide = kNum4x4BlocksWide[plane_size];
+          const int num4x4_high = kNum4x4BlocksHigh[plane_size];
+          for (int y = 0; y < num4x4_high; y += step_y) {
+            for (int x = 0; x < num4x4_wide; x += step_x) {
+              if (!TransformBlock(
+                      block, static_cast<Plane>(plane), base_x, base_y, tx_size,
+                      x + (MultiplyBy16(chunk_x) >> subsampling_x),
+                      y + (MultiplyBy16(chunk_y) >> subsampling_y), mode)) {
+                return false;
+              }
+            }
+          }
+        }
+      } while (++plane < num_planes);
+    }
+  }
+  return true;
+}
+
+// The purpose of this function is to limit the maximum size of motion vectors
+// and also, if use_intra_block_copy is true, to additionally constrain the
+// motion vector so that the data is fetched from parts of the tile that have
+// already been decoded and are not too close to the current block (in order to
+// make a pipelined decoder implementation feasible).
+bool Tile::IsMvValid(const Block& block, bool is_compound) const {
+  const BlockParameters& bp = *block.bp;
+  for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) {
+    for (int mv_component : bp.mv.mv[i].mv) {
+      if (std::abs(mv_component) >= (1 << 14)) {
+        return false;
+      }
+    }
+  }
+  if (!block.bp->prediction_parameters->use_intra_block_copy) {
+    return true;
+  }
+  if ((bp.mv.mv[0].mv32 & 0x00070007) != 0) {
+    return false;
+  }
+  const int delta_row = bp.mv.mv[0].mv[0] >> 3;
+  const int delta_column = bp.mv.mv[0].mv[1] >> 3;
+  int src_top_edge = MultiplyBy4(block.row4x4) + delta_row;
+  int src_left_edge = MultiplyBy4(block.column4x4) + delta_column;
+  const int src_bottom_edge = src_top_edge + block.height;
+  const int src_right_edge = src_left_edge + block.width;
+  if (block.HasChroma()) {
+    if (block.width < 8 && subsampling_x_[kPlaneU] != 0) {
+      src_left_edge -= 4;
+    }
+    if (block.height < 8 && subsampling_y_[kPlaneU] != 0) {
+      src_top_edge -= 4;
+    }
+  }
+  if (src_top_edge < MultiplyBy4(row4x4_start_) ||
+      src_left_edge < MultiplyBy4(column4x4_start_) ||
+      src_bottom_edge > MultiplyBy4(row4x4_end_) ||
+      src_right_edge > MultiplyBy4(column4x4_end_)) {
+    return false;
+  }
+  // sb_height_log2 = use_128x128_superblock ? log2(128) : log2(64)
+  const int sb_height_log2 =
+      6 + static_cast<int>(sequence_header_.use_128x128_superblock);
+  const int active_sb_row = MultiplyBy4(block.row4x4) >> sb_height_log2;
+  const int active_64x64_block_column = MultiplyBy4(block.column4x4) >> 6;
+  const int src_sb_row = (src_bottom_edge - 1) >> sb_height_log2;
+  const int src_64x64_block_column = (src_right_edge - 1) >> 6;
+  const int total_64x64_blocks_per_row =
+      ((column4x4_end_ - column4x4_start_ - 1) >> 4) + 1;
+  const int active_64x64_block =
+      active_sb_row * total_64x64_blocks_per_row + active_64x64_block_column;
+  const int src_64x64_block =
+      src_sb_row * total_64x64_blocks_per_row + src_64x64_block_column;
+  if (src_64x64_block >= active_64x64_block - kIntraBlockCopyDelay64x64Blocks) {
+    return false;
+  }
+
+  // Wavefront constraint: use only top left area of frame for reference.
+  if (src_sb_row > active_sb_row) return false;
+  const int gradient =
+      1 + kIntraBlockCopyDelay64x64Blocks +
+      static_cast<int>(sequence_header_.use_128x128_superblock);
+  const int wavefront_offset = gradient * (active_sb_row - src_sb_row);
+  return src_64x64_block_column < active_64x64_block_column -
+                                      kIntraBlockCopyDelay64x64Blocks +
+                                      wavefront_offset;
+}
+
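+// Assigns the motion vector(s) of an inter block (corresponds to assign_mv()
+// in the spec). Each MV starts from the global, nearest, or indexed reference
+// MV, clamped to the allowed range, and a decoded delta is added when the
+// mode is NewMv. Returns false if the resulting MV is invalid.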
+bool Tile::AssignInterMv(const Block& block, bool is_compound) {
+  int min[2];
+  int max[2];
+  GetClampParameters(block, min, max);
+  BlockParameters& bp = *block.bp;
+  const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  bp.mv.mv64 = 0;
+  if (is_compound) {
+    for (int i = 0; i < 2; ++i) {
+      const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
+      MotionVector predicted_mv;
+      if (mode == kPredictionModeGlobalMv) {
+        predicted_mv = prediction_parameters.global_mv[i];
+      } else {
+        const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+                                  (mode == kPredictionModeNewMv &&
+                                   prediction_parameters.ref_mv_count <= 1))
+                                     ? 0
+                                     : prediction_parameters.ref_mv_index;
+        predicted_mv = prediction_parameters.reference_mv(ref_mv_index, i);
+        if (ref_mv_index < prediction_parameters.ref_mv_count) {
+          predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+          predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+        }
+      }
+      if (mode == kPredictionModeNewMv) {
+        ReadMotionVector(block, i);
+        bp.mv.mv[i].mv[0] += predicted_mv.mv[0];
+        bp.mv.mv[i].mv[1] += predicted_mv.mv[1];
+      } else {
+        bp.mv.mv[i] = predicted_mv;
+      }
+    }
+  } else {
+    const PredictionMode mode = GetSinglePredictionMode(0, bp.y_mode);
+    MotionVector predicted_mv;
+    if (mode == kPredictionModeGlobalMv) {
+      predicted_mv = prediction_parameters.global_mv[0];
+    } else {
+      const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+                                (mode == kPredictionModeNewMv &&
+                                 prediction_parameters.ref_mv_count <= 1))
+                                   ? 0
+                                   : prediction_parameters.ref_mv_index;
+      predicted_mv = prediction_parameters.reference_mv(ref_mv_index);
+      if (ref_mv_index < prediction_parameters.ref_mv_count) {
+        predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+        predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+      }
+    }
+    if (mode == kPredictionModeNewMv) {
+      ReadMotionVector(block, 0);
+      bp.mv.mv[0].mv[0] += predicted_mv.mv[0];
+      bp.mv.mv[0].mv[1] += predicted_mv.mv[1];
+    } else {
+      bp.mv.mv[0] = predicted_mv;
+    }
+  }
+  return IsMvValid(block, is_compound);
+}
+
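+// Assigns the intra block copy MV: the decoded delta is added to the first
+// nonzero reference MV, or to a default offset one superblock up (or to the
+// left near the top of the tile) when both reference MVs are zero. Returns
+// false if the resulting MV is invalid.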
+bool Tile::AssignIntraMv(const Block& block) {
+  // TODO(linfengz): Check if the clamping process is necessary.
+  int min[2];
+  int max[2];
+  GetClampParameters(block, min, max);
+  BlockParameters& bp = *block.bp;
+  const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
+  bp.mv.mv64 = 0;
+  ReadMotionVector(block, 0);
+  if (ref_mv_0.mv32 == 0) {
+    const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
+    if (ref_mv_1.mv32 == 0) {
+      const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()];
+      if (block.row4x4 - super_block_size4x4 < row4x4_start_) {
+        bp.mv.mv[0].mv[1] -= MultiplyBy32(super_block_size4x4);
+        bp.mv.mv[0].mv[1] -= MultiplyBy8(kIntraBlockCopyDelayPixels);
+      } else {
+        bp.mv.mv[0].mv[0] -= MultiplyBy32(super_block_size4x4);
+      }
+    } else {
+      bp.mv.mv[0].mv[0] += Clip3(ref_mv_1.mv[0], min[0], max[0]);
+      bp.mv.mv[0].mv[1] += Clip3(ref_mv_1.mv[1], min[1], max[1]);
+    }
+  } else {
+    bp.mv.mv[0].mv[0] += Clip3(ref_mv_0.mv[0], min[0], max[0]);
+    bp.mv.mv[0].mv[1] += Clip3(ref_mv_0.mv[1], min[1], max[1]);
+  }
+  return IsMvValid(block, /*is_compound=*/false);
+}
+
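+// Clears the top and left entropy contexts covered by a skip block, since no
+// coefficients are coded for it.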
+void Tile::ResetEntropyContext(const Block& block) {
+  const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+  int plane = kPlaneY;
+  do {
+    const int subsampling_x = subsampling_x_[plane];
+    const int start_x = block.column4x4 >> subsampling_x;
+    const int end_x =
+        std::min((block.column4x4 + block.width4x4) >> subsampling_x,
+                 frame_header_.columns4x4);
+    memset(&coefficient_levels_[kEntropyContextTop][plane][start_x], 0,
+           end_x - start_x);
+    memset(&dc_categories_[kEntropyContextTop][plane][start_x], 0,
+           end_x - start_x);
+    const int subsampling_y = subsampling_y_[plane];
+    const int start_y = block.row4x4 >> subsampling_y;
+    const int end_y =
+        std::min((block.row4x4 + block.height4x4) >> subsampling_y,
+                 frame_header_.rows4x4);
+    memset(&coefficient_levels_[kEntropyContextLeft][plane][start_y], 0,
+           end_y - start_y);
+    memset(&dc_categories_[kEntropyContextLeft][plane][start_y], 0,
+           end_y - start_y);
+  } while (++plane < num_planes);
+}
+
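+// Computes the prediction for all planes of an inter block, including the
+// intra part of inter-intra blocks. Chroma of sub-8x8 blocks may borrow
+// motion from neighboring 4x4 units, in which case the prediction is done per
+// sub-block. Intra blocks are predicted later, in TransformBlock().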
+bool Tile::ComputePrediction(const Block& block) {
+  const BlockParameters& bp = *block.bp;
+  if (!bp.is_inter) return true;
+  const int mask =
+      (1 << (4 + static_cast<int>(sequence_header_.use_128x128_superblock))) -
+      1;
+  const int sub_block_row4x4 = block.row4x4 & mask;
+  const int sub_block_column4x4 = block.column4x4 & mask;
+  const int plane_count = block.HasChroma() ? PlaneCount() : 1;
+  // Returns true if this block applies local warping. The state is determined
+  // in the Y plane and carried for use in the U/V planes.
+  // But the U/V planes will not apply warping when the block size is smaller
+  // than 8x8, even if this variable is true.
+  bool is_local_valid = false;
+  // Local warping parameters, similar usage as is_local_valid.
+  GlobalMotion local_warp_params;
+  int plane = kPlaneY;
+  do {
+    const int8_t subsampling_x = subsampling_x_[plane];
+    const int8_t subsampling_y = subsampling_y_[plane];
+    const BlockSize plane_size = block.residual_size[plane];
+    const int block_width4x4 = kNum4x4BlocksWide[plane_size];
+    const int block_height4x4 = kNum4x4BlocksHigh[plane_size];
+    const int block_width = MultiplyBy4(block_width4x4);
+    const int block_height = MultiplyBy4(block_height4x4);
+    const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+    const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+    if (bp.reference_frame[1] == kReferenceFrameIntra) {
+      const int tr_row4x4 = sub_block_row4x4 >> subsampling_y;
+      const int tr_column4x4 =
+          (sub_block_column4x4 >> subsampling_x) + block_width4x4 + 1;
+      const int bl_row4x4 =
+          (sub_block_row4x4 >> subsampling_y) + block_height4x4;
+      const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x) + 1;
+      const TransformSize tx_size =
+          k4x4SizeToTransformSize[k4x4WidthLog2[plane_size]]
+                                 [k4x4HeightLog2[plane_size]];
+      const bool has_left = block.left_available[plane];
+      const bool has_top = block.top_available[plane];
+      CALL_BITDEPTH_FUNCTION(
+          IntraPrediction, block, static_cast<Plane>(plane), base_x, base_y,
+          has_left, has_top,
+          block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+          block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+          kInterIntraToIntraMode[block.bp->prediction_parameters
+                                     ->inter_intra_mode],
+          tx_size);
+    }
+    int candidate_row = block.row4x4;
+    int candidate_column = block.column4x4;
+    bool some_use_intra = bp.reference_frame[0] == kReferenceFrameIntra;
+    if (!some_use_intra && plane != 0) {
+      candidate_row = (candidate_row >> subsampling_y) << subsampling_y;
+      candidate_column = (candidate_column >> subsampling_x) << subsampling_x;
+      if (candidate_row != block.row4x4) {
+        // Top block.
+        const BlockParameters& bp_top =
+            *block_parameters_holder_.Find(candidate_row, block.column4x4);
+        some_use_intra = bp_top.reference_frame[0] == kReferenceFrameIntra;
+        if (!some_use_intra && candidate_column != block.column4x4) {
+          // Top-left block.
+          const BlockParameters& bp_top_left =
+              *block_parameters_holder_.Find(candidate_row, candidate_column);
+          some_use_intra =
+              bp_top_left.reference_frame[0] == kReferenceFrameIntra;
+        }
+      }
+      if (!some_use_intra && candidate_column != block.column4x4) {
+        // Left block.
+        const BlockParameters& bp_left =
+            *block_parameters_holder_.Find(block.row4x4, candidate_column);
+        some_use_intra = bp_left.reference_frame[0] == kReferenceFrameIntra;
+      }
+    }
+    int prediction_width;
+    int prediction_height;
+    if (some_use_intra) {
+      candidate_row = block.row4x4;
+      candidate_column = block.column4x4;
+      prediction_width = block_width;
+      prediction_height = block_height;
+    } else {
+      prediction_width = block.width >> subsampling_x;
+      prediction_height = block.height >> subsampling_y;
+    }
+    int r = 0;
+    int y = 0;
+    do {
+      int c = 0;
+      int x = 0;
+      do {
+        if (!InterPrediction(block, static_cast<Plane>(plane), base_x + x,
+                             base_y + y, prediction_width, prediction_height,
+                             candidate_row + r, candidate_column + c,
+                             &is_local_valid, &local_warp_params)) {
+          return false;
+        }
+        ++c;
+        x += prediction_width;
+      } while (x < block_width);
+      ++r;
+      y += prediction_height;
+    } while (y < block_height);
+  } while (++plane < plane_count);
+  return true;
+}
+
+#undef CALL_BITDEPTH_FUNCTION
+
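+// Caches the deblock filter level for each filter type in the block's
+// parameters, using the precomputed zero-delta levels when no LF deltas are
+// coded and the delta-adjusted per-segment levels otherwise.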
+void Tile::PopulateDeblockFilterLevel(const Block& block) {
+  if (!post_filter_.DoDeblock()) return;
+  BlockParameters& bp = *block.bp;
+  const int mode_id =
+      static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode));
+  for (int i = 0; i < kFrameLfCount; ++i) {
+    if (delta_lf_all_zero_) {
+      bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel(
+          bp.prediction_parameters->segment_id, i, bp.reference_frame[0],
+          mode_id);
+    } else {
+      bp.deblock_filter_level[i] =
+          deblock_filter_levels_[bp.prediction_parameters->segment_id][i]
+                                [bp.reference_frame[0]][mode_id];
+    }
+  }
+}
+
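+// Marks in |cdef_skip_| (one bit per 8x8 luma area) the area covered by this
+// block when it may need CDEF filtering. Returns early for skip blocks and
+// when CDEF is disabled for this superblock.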
+void Tile::PopulateCdefSkip(const Block& block) {
+  if (!post_filter_.DoCdef() || block.bp->skip ||
+      (frame_header_.cdef.bits > 0 &&
+       cdef_index_[DivideBy16(block.row4x4)][DivideBy16(block.column4x4)] ==
+           -1)) {
+    return;
+  }
+  // The rest of this function is an efficient version of the following code:
+  // for (int y = block.row4x4; y < block.row4x4 + block.height4x4; y++) {
+  //   for (int x = block.column4x4; x < block.column4x4 + block.width4x4;
+  //        x++) {
+  //     const uint8_t mask = uint8_t{1} << ((x >> 1) & 0x7);
+  //     cdef_skip_[y >> 1][x >> 4] |= mask;
+  //   }
+  // }
+
+  // For all block widths other than 32, the mask fits in a uint8_t. For block
+  // width == 32, the conceptual mask is 0xFFFF; it spans two bytes, so the
+  // loop below also writes 0xFF to the following entry.
+  const int bw4 =
+      std::max(DivideBy2(block.width4x4) + (block.column4x4 & 1), 1);
+  const uint8_t mask = (block.width4x4 == 32)
+                           ? 0xFF
+                           : (uint8_t{0xFF} >> (8 - bw4))
+                                 << (DivideBy2(block.column4x4) & 0x7);
+  uint8_t* cdef_skip = &cdef_skip_[block.row4x4 >> 1][block.column4x4 >> 4];
+  const int stride = cdef_skip_.columns();
+  int row = 0;
+  do {
+    *cdef_skip |= mask;
+    if (block.width4x4 == 32) {
+      *(cdef_skip + 1) = 0xFF;
+    }
+    cdef_skip += stride;
+    row += 2;
+  } while (row < block.height4x4);
+}
+
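+// Parses one coded block: mode info, palette tokens, transform sizes and the
+// residual. In non-split mode the block is also decoded; in split mode the
+// block's ordering info is queued so that DecodeBlock() can decode it later.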
+bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+                        TileScratchBuffer* const scratch_buffer,
+                        ResidualPtr* residual) {
+  // Do not process the block if the starting point is beyond the visible frame.
+  // This is equivalent to the has_row/has_column check in the
+  // decode_partition() section of the spec when partition equals
+  // kPartitionHorizontal or kPartitionVertical.
+  if (row4x4 >= frame_header_.rows4x4 ||
+      column4x4 >= frame_header_.columns4x4) {
+    return true;
+  }
+
+  if (split_parse_and_decode_) {
+    // Push block ordering info to the queue. DecodeBlock() will use this queue
+    // to decode the blocks in the correct order.
+    const int sb_row_index = SuperBlockRowIndex(row4x4);
+    const int sb_column_index = SuperBlockColumnIndex(column4x4);
+    residual_buffer_threaded_[sb_row_index][sb_column_index]
+        ->partition_tree_order()
+        ->Push(PartitionTreeNode(row4x4, column4x4, block_size));
+  }
+
+  BlockParameters* bp_ptr =
+      block_parameters_holder_.Get(row4x4, column4x4, block_size);
+  if (bp_ptr == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to get BlockParameters.");
+    return false;
+  }
+  BlockParameters& bp = *bp_ptr;
+  Block block(this, block_size, row4x4, column4x4, scratch_buffer, residual);
+  bp.size = block_size;
+  bp.prediction_parameters =
+      split_parse_and_decode_ ? std::unique_ptr<PredictionParameters>(
+                                    new (std::nothrow) PredictionParameters())
+                              : std::move(prediction_parameters_);
+  if (bp.prediction_parameters == nullptr) return false;
+  if (!DecodeModeInfo(block)) return false;
+  PopulateDeblockFilterLevel(block);
+  if (!ReadPaletteTokens(block)) return false;
+  DecodeTransformSize(block);
+  // Part of Section 5.11.37 in the spec (implemented as a simple lookup).
+  bp.uv_transform_size =
+      frame_header_.segmentation.lossless[bp.prediction_parameters->segment_id]
+          ? kTransformSize4x4
+          : kUVTransformSize[block.residual_size[kPlaneU]];
+  if (bp.skip) ResetEntropyContext(block);
+  PopulateCdefSkip(block);
+  if (split_parse_and_decode_) {
+    if (!Residual(block, kProcessingModeParseOnly)) return false;
+  } else {
+    if (!ComputePrediction(block) ||
+        !Residual(block, kProcessingModeParseAndDecode)) {
+      return false;
+    }
+  }
+  // If frame_header_.segmentation.enabled is false,
+  // bp.prediction_parameters->segment_id is 0 for all blocks. We don't need to
+  // save bp.prediction_parameters->segment_id in the current frame because the
+  // current frame's segmentation map will be cleared to all 0s.
+  //
+  // If frame_header_.segmentation.enabled is true and
+  // frame_header_.segmentation.update_map is false, we will copy the previous
+  // frame's segmentation map to the current frame, so we don't need to save
+  // bp.prediction_parameters->segment_id here either.
+  if (frame_header_.segmentation.enabled &&
+      frame_header_.segmentation.update_map) {
+    const int x_limit = std::min(frame_header_.columns4x4 - column4x4,
+                                 static_cast<int>(block.width4x4));
+    const int y_limit = std::min(frame_header_.rows4x4 - row4x4,
+                                 static_cast<int>(block.height4x4));
+    current_frame_.segmentation_map()->FillBlock(
+        row4x4, column4x4, x_limit, y_limit,
+        bp.prediction_parameters->segment_id);
+  }
+  StoreMotionFieldMvsIntoCurrentFrame(block);
+  if (!split_parse_and_decode_) {
+    prediction_parameters_ = std::move(bp.prediction_parameters);
+  }
+  return true;
+}
+
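+// Decode-only counterpart of ProcessBlock(), used when parsing and decoding
+// are split: computes the prediction and reconstructs the residual that was
+// parsed earlier, then releases the block's prediction parameters.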
+bool Tile::DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
+                       TileScratchBuffer* const scratch_buffer,
+                       ResidualPtr* residual) {
+  if (row4x4 >= frame_header_.rows4x4 ||
+      column4x4 >= frame_header_.columns4x4) {
+    return true;
+  }
+  Block block(this, block_size, row4x4, column4x4, scratch_buffer, residual);
+  if (!ComputePrediction(block) ||
+      !Residual(block, kProcessingModeDecodeOnly)) {
+    return false;
+  }
+  block.bp->prediction_parameters.reset(nullptr);
+  return true;
+}
+
+bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
+                            TileScratchBuffer* const scratch_buffer,
+                            ResidualPtr* residual) {
+  Stack<PartitionTreeNode, kDfsStackSize> stack;
+
+  // Set up the first iteration.
+  stack.Push(
+      PartitionTreeNode(row4x4_start, column4x4_start, SuperBlockSize()));
+
+  // DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
+  // Otherwise, the children are pushed into the stack for future processing.
+  do {
+    PartitionTreeNode node = stack.Pop();
+    int row4x4 = node.row4x4;
+    int column4x4 = node.column4x4;
+    BlockSize block_size = node.block_size;
+
+    if (row4x4 >= frame_header_.rows4x4 ||
+        column4x4 >= frame_header_.columns4x4) {
+      continue;
+    }
+    const int block_width4x4 = kNum4x4BlocksWide[block_size];
+    assert(block_width4x4 == kNum4x4BlocksHigh[block_size]);
+    const int half_block4x4 = block_width4x4 >> 1;
+    const bool has_rows = (row4x4 + half_block4x4) < frame_header_.rows4x4;
+    const bool has_columns =
+        (column4x4 + half_block4x4) < frame_header_.columns4x4;
+    Partition partition;
+    if (!ReadPartition(row4x4, column4x4, block_size, has_rows, has_columns,
+                       &partition)) {
+      LIBGAV1_DLOG(ERROR, "Failed to read partition for row: %d column: %d",
+                   row4x4, column4x4);
+      return false;
+    }
+    const BlockSize sub_size = kSubSize[partition][block_size];
+    // Section 6.10.4: It is a requirement of bitstream conformance that
+    // get_plane_residual_size( subSize, 1 ) is not equal to BLOCK_INVALID
+    // every time subSize is computed.
+    if (sub_size == kBlockInvalid ||
+        kPlaneResidualSize[sub_size]
+                          [sequence_header_.color_config.subsampling_x]
+                          [sequence_header_.color_config.subsampling_y] ==
+            kBlockInvalid) {
+      LIBGAV1_DLOG(
+          ERROR,
+          "Invalid sub-block/plane size for row: %d column: %d partition: "
+          "%d block_size: %d sub_size: %d subsampling_x/y: %d, %d",
+          row4x4, column4x4, partition, block_size, sub_size,
+          sequence_header_.color_config.subsampling_x,
+          sequence_header_.color_config.subsampling_y);
+      return false;
+    }
+
+    const int quarter_block4x4 = half_block4x4 >> 1;
+    const BlockSize split_size = kSubSize[kPartitionSplit][block_size];
+    assert(partition == kPartitionNone || sub_size != kBlockInvalid);
+    switch (partition) {
+      case kPartitionNone:
+        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+                          residual)) {
+          return false;
+        }
+        break;
+      case kPartitionSplit:
+        // The children must be added in reverse order since a stack is being
+        // used.
+        stack.Push(PartitionTreeNode(row4x4 + half_block4x4,
+                                     column4x4 + half_block4x4, sub_size));
+        stack.Push(
+            PartitionTreeNode(row4x4 + half_block4x4, column4x4, sub_size));
+        stack.Push(
+            PartitionTreeNode(row4x4, column4x4 + half_block4x4, sub_size));
+        stack.Push(PartitionTreeNode(row4x4, column4x4, sub_size));
+        break;
+      case kPartitionHorizontal:
+        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
+                          scratch_buffer, residual)) {
+          return false;
+        }
+        break;
+      case kPartitionVertical:
+        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
+                          scratch_buffer, residual)) {
+          return false;
+        }
+        break;
+      case kPartitionHorizontalWithTopSplit:
+        if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
+                          scratch_buffer, residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
+                          scratch_buffer, residual)) {
+          return false;
+        }
+        break;
+      case kPartitionHorizontalWithBottomSplit:
+        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
+                          scratch_buffer, residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
+                          split_size, scratch_buffer, residual)) {
+          return false;
+        }
+        break;
+      case kPartitionVerticalWithLeftSplit:
+        if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
+                          scratch_buffer, residual) ||
+            !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
+                          scratch_buffer, residual)) {
+          return false;
+        }
+        break;
+      case kPartitionVerticalWithRightSplit:
+        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
+                          scratch_buffer, residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
+                          split_size, scratch_buffer, residual)) {
+          return false;
+        }
+        break;
+      case kPartitionHorizontal4:
+        for (int i = 0; i < 4; ++i) {
+          if (!ProcessBlock(row4x4 + i * quarter_block4x4, column4x4, sub_size,
+                            scratch_buffer, residual)) {
+            return false;
+          }
+        }
+        break;
+      case kPartitionVertical4:
+        for (int i = 0; i < 4; ++i) {
+          if (!ProcessBlock(row4x4, column4x4 + i * quarter_block4x4, sub_size,
+                            scratch_buffer, residual)) {
+            return false;
+          }
+        }
+        break;
+    }
+  } while (!stack.Empty());
+  return true;
+}
+
+void Tile::ResetLoopRestorationParams() {
+  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+    for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+      reference_unit_info_[plane].sgr_proj_info.multiplier[i] =
+          kSgrProjDefaultMultiplier[i];
+      for (int j = 0; j < kNumWienerCoefficients; ++j) {
+        reference_unit_info_[plane].wiener_info.filter[i][j] =
+            kWienerDefaultFilter[j];
+      }
+    }
+  }
+}
+
+void Tile::ResetCdef(const int row4x4, const int column4x4) {
+  if (frame_header_.cdef.bits == 0) return;
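+  // cdef parameters are signaled at 64x64 granularity; a 64x64 block spans
+  // 16 rows and columns of 4x4 units, hence the DivideBy16().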
+  const int row = DivideBy16(row4x4);
+  const int column = DivideBy16(column4x4);
+  cdef_index_[row][column] = -1;
+  if (sequence_header_.use_128x128_superblock) {
+    const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64];
+    const int border_row = DivideBy16(row4x4 + cdef_size4x4);
+    const int border_column = DivideBy16(column4x4 + cdef_size4x4);
+    cdef_index_[row][border_column] = -1;
+    cdef_index_[border_row][column] = -1;
+    cdef_index_[border_row][border_column] = -1;
+  }
+}
+
+void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer,
+                             int row4x4, int column4x4) {
+  // Set everything to false.
+  memset(scratch_buffer->block_decoded, 0,
+         sizeof(scratch_buffer->block_decoded));
+  // Set specific edge cases to true.
+  const int sb_size4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+    const int subsampling_x = subsampling_x_[plane];
+    const int subsampling_y = subsampling_y_[plane];
+    const int sb_width4 = (column4x4_end_ - column4x4) >> subsampling_x;
+    const int sb_height4 = (row4x4_end_ - row4x4) >> subsampling_y;
+    // The memset is equivalent to the following lines in the spec:
+    // for ( x = -1; x <= ( sbSize4 >> subX ); x++ ) {
+    //   if ( y < 0 && x < sbWidth4 ) {
+    //     BlockDecoded[plane][y][x] = 1
+    //   }
+    // }
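+    // The inner + 1 covers x == sbSize4 >> subX in the spec loop above; the
+    // outer + 1 accounts for the x == -1 border entry.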
+    const int num_elements =
+        std::min((sb_size4 >> subsampling_x_[plane]) + 1, sb_width4) + 1;
+    memset(&scratch_buffer->block_decoded[plane][0][0], 1, num_elements);
+    // The for loop is equivalent to the following lines in the spec:
+    // for ( y = -1; y <= ( sbSize4 >> subY ); y++ ) {
+    //   if ( x < 0 && y < sbHeight4 ) {
+    //     BlockDecoded[plane][y][x] = 1
+    //   }
+    // }
+    // BlockDecoded[plane][sbSize4 >> subY][-1] = 0
+    for (int y = -1; y < std::min((sb_size4 >> subsampling_y), sb_height4);
+         ++y) {
+      scratch_buffer->block_decoded[plane][y + 1][0] = true;
+    }
+  }
+}
+
+bool Tile::ProcessSuperBlock(int row4x4, int column4x4,
+                             TileScratchBuffer* const scratch_buffer,
+                             ProcessingMode mode) {
+  const bool parsing =
+      mode == kProcessingModeParseOnly || mode == kProcessingModeParseAndDecode;
+  const bool decoding = mode == kProcessingModeDecodeOnly ||
+                        mode == kProcessingModeParseAndDecode;
+  if (parsing) {
+    read_deltas_ = frame_header_.delta_q.present;
+    ResetCdef(row4x4, column4x4);
+  }
+  if (decoding) {
+    ClearBlockDecoded(scratch_buffer, row4x4, column4x4);
+  }
+  const BlockSize block_size = SuperBlockSize();
+  if (parsing) {
+    ReadLoopRestorationCoefficients(row4x4, column4x4, block_size);
+  }
+  if (parsing && decoding) {
+    uint8_t* residual_buffer = residual_buffer_.get();
+    if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
+                          &residual_buffer)) {
+      LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4,
+                   column4x4);
+      return false;
+    }
+    return true;
+  }
+  const int sb_row_index = SuperBlockRowIndex(row4x4);
+  const int sb_column_index = SuperBlockColumnIndex(column4x4);
+  if (parsing) {
+    residual_buffer_threaded_[sb_row_index][sb_column_index] =
+        residual_buffer_pool_->Get();
+    if (residual_buffer_threaded_[sb_row_index][sb_column_index] == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to get residual buffer.");
+      return false;
+    }
+    uint8_t* residual_buffer =
+        residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+    if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
+                          &residual_buffer)) {
+      LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4,
+                   column4x4);
+      return false;
+    }
+  } else {
+    if (!DecodeSuperBlock(sb_row_index, sb_column_index, scratch_buffer)) {
+      LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d",
+                   row4x4, column4x4);
+      return false;
+    }
+    residual_buffer_pool_->Release(
+        std::move(residual_buffer_threaded_[sb_row_index][sb_column_index]));
+  }
+  return true;
+}
+
+bool Tile::DecodeSuperBlock(int sb_row_index, int sb_column_index,
+                            TileScratchBuffer* const scratch_buffer) {
+  uint8_t* residual_buffer =
+      residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+  Queue<PartitionTreeNode>& partition_tree_order =
+      *residual_buffer_threaded_[sb_row_index][sb_column_index]
+           ->partition_tree_order();
+  while (!partition_tree_order.Empty()) {
+    PartitionTreeNode block = partition_tree_order.Front();
+    if (!DecodeBlock(block.row4x4, block.column4x4, block.block_size,
+                     scratch_buffer, &residual_buffer)) {
+      LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
+                   block.row4x4, block.column4x4);
+      return false;
+    }
+    partition_tree_order.Pop();
+  }
+  return true;
+}
+
+void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+                                           BlockSize block_size) {
+  if (frame_header_.allow_intrabc) return;
+  LoopRestorationInfo* const restoration_info = post_filter_.restoration_info();
+  const bool is_superres_scaled =
+      frame_header_.width != frame_header_.upscaled_width;
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+    LoopRestorationUnitInfo unit_info;
+    if (restoration_info->PopulateUnitInfoForSuperBlock(
+            static_cast<Plane>(plane), block_size, is_superres_scaled,
+            frame_header_.superres_scale_denominator, row4x4, column4x4,
+            &unit_info)) {
+      for (int unit_row = unit_info.row_start; unit_row < unit_info.row_end;
+           ++unit_row) {
+        for (int unit_column = unit_info.column_start;
+             unit_column < unit_info.column_end; ++unit_column) {
+          const int unit_id = unit_row * restoration_info->num_horizontal_units(
+                                             static_cast<Plane>(plane)) +
+                              unit_column;
+          restoration_info->ReadUnitCoefficients(
+              &reader_, &symbol_decoder_context_, static_cast<Plane>(plane),
+              unit_id, &reference_unit_info_);
+        }
+      }
+    }
+  }
+}
+
+void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
+  if (frame_header_.refresh_frame_flags == 0 ||
+      IsIntraFrame(frame_header_.frame_type)) {
+    return;
+  }
+  // Iterate over odd rows/columns beginning at the first odd row/column for the
+  // block. It is done this way because motion field mvs are only needed at an
+  // 8x8 granularity.
+  const int row_start4x4 = block.row4x4 | 1;
+  const int row_limit4x4 =
+      std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
+  if (row_start4x4 >= row_limit4x4) return;
+  const int column_start4x4 = block.column4x4 | 1;
+  const int column_limit4x4 =
+      std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
+  if (column_start4x4 >= column_limit4x4) return;
+
+  // The largest reference MV component that can be saved.
+  constexpr int kRefMvsLimit = (1 << 12) - 1;
+  const BlockParameters& bp = *block.bp;
+  ReferenceInfo* reference_info = current_frame_.reference_info();
+  for (int i = 1; i >= 0; --i) {
+    const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i];
+    if (reference_frame_to_store <= kReferenceFrameIntra) continue;
+    // Must make a local copy so that StoreMotionFieldMvs() knows there is no
+    // overlap between load and store.
+    const MotionVector mv_to_store = bp.mv.mv[i];
+    const int mv_row = std::abs(mv_to_store.mv[0]);
+    const int mv_column = std::abs(mv_to_store.mv[1]);
+    // kRefMvsLimit equals 0x0FFF, so we can first bitwise OR the two absolute
+    // values and then compare with kRefMvsLimit to save a branch.
+    // The next line is equivalent to:
+    // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit
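+    // (Illustrative: kRefMvsLimit has all 12 low bits set, so the bitwise OR
+    // of two non-negative values exceeds it exactly when either value does.)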
+    if ((mv_row | mv_column) <= kRefMvsLimit &&
+        reference_info->relative_distance_from[reference_frame_to_store] < 0) {
+      const int row_start8x8 = DivideBy2(row_start4x4);
+      const int row_limit8x8 = DivideBy2(row_limit4x4);
+      const int column_start8x8 = DivideBy2(column_start4x4);
+      const int column_limit8x8 = DivideBy2(column_limit4x4);
+      const int rows = row_limit8x8 - row_start8x8;
+      const int columns = column_limit8x8 - column_start8x8;
+      const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4());
+      ReferenceFrameType* const reference_frame_row_start =
+          &reference_info
+               ->motion_field_reference_frame[row_start8x8][column_start8x8];
+      MotionVector* const mv =
+          &reference_info->motion_field_mv[row_start8x8][column_start8x8];
+
+      // Specialize the cases columns == 1, 2, 4, 8 and 16. This allows
+      // memset() to be inlined and simplifies std::fill() for these cases.
+      if (columns <= 1) {
+        // Don't change the above condition to (columns == 1).
+        // Condition (columns <= 1) may help the compiler simplify the inlining
+        // of the general case of StoreMotionFieldMvs() by eliminating the
+        // (columns == 0) case.
+        assert(columns == 1);
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            1, reference_frame_row_start, mv);
+      } else if (columns == 2) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            2, reference_frame_row_start, mv);
+      } else if (columns == 4) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            4, reference_frame_row_start, mv);
+      } else if (columns == 8) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            8, reference_frame_row_start, mv);
+      } else if (columns == 16) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            16, reference_frame_row_start, mv);
+      } else if (columns < 16) {
+        // This always-true condition (columns < 16) may help the compiler
+        // simplify the inlining of the following function.
+        // This general case is rare and usually only happens for blocks that
+        // contain the right boundary of the frame.
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            columns, reference_frame_row_start, mv);
+      } else {
+        assert(false);
+      }
+      return;
+    }
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/tile_scratch_buffer.cc b/src/tile_scratch_buffer.cc
new file mode 100644 (file)
index 0000000..0b5ac96
--- /dev/null
@@ -0,0 +1,26 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/tile_scratch_buffer.h"
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+#if !LIBGAV1_CXX17
+// static
+constexpr int TileScratchBuffer::kBlockDecodedStride;
+#endif
+
+}  // namespace libgav1
diff --git a/src/tile_scratch_buffer.h b/src/tile_scratch_buffer.h
new file mode 100644 (file)
index 0000000..828f550
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
+#define LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <new>
+#include <utility>
+
+#include "src/dsp/constants.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/stack.h"
+
+namespace libgav1 {
+
+// Buffer to facilitate decoding a superblock.
+struct TileScratchBuffer : public MaxAlignedAllocable {
+  static constexpr int kBlockDecodedStride = 34;
+
+  LIBGAV1_MUST_USE_RESULT bool Init(int bitdepth) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    const int pixel_size = (bitdepth == 8) ? 1 : 2;
+#else
+    assert(bitdepth == 8);
+    static_cast<void>(bitdepth);
+    const int pixel_size = 1;
+#endif
+
+    static_assert(kConvolveScaleBorderRight >= kConvolveBorderRight, "");
+    constexpr int unaligned_convolve_buffer_stride =
+        kMaxScaledSuperBlockSizeInPixels + kConvolveBorderLeftTop +
+        kConvolveScaleBorderRight;
+    convolve_block_buffer_stride = Align<ptrdiff_t>(
+        unaligned_convolve_buffer_stride * pixel_size, kMaxAlignment);
+    constexpr int convolve_buffer_height = kMaxScaledSuperBlockSizeInPixels +
+                                           kConvolveBorderLeftTop +
+                                           kConvolveBorderBottom;
+
+    convolve_block_buffer = MakeAlignedUniquePtr<uint8_t>(
+        kMaxAlignment, convolve_buffer_height * convolve_block_buffer_stride);
+#if LIBGAV1_MSAN
+    // Quiet msan warnings in ConvolveScale2D_NEON(). Set to a random non-zero
+    // value to aid in future debugging.
+    memset(convolve_block_buffer.get(), 0x66,
+           convolve_buffer_height * convolve_block_buffer_stride);
+#endif
+
+    return convolve_block_buffer != nullptr;
+  }
+
+  // kCompoundPredictionTypeDiffWeighted prediction mode needs a mask of the
+  // prediction block size. This buffer is used to store that mask. The masks
+  // will be created for the Y plane and will be re-used for the U & V planes.
+  alignas(kMaxAlignment) uint8_t weight_mask[kMaxSuperBlockSizeSquareInPixels];
+
+  // For each instance of the TileScratchBuffer, only one of the following
+  // buffers will be used at any given time, so it is ok to share them in a
+  // union.
+  union {
+    // Buffers used for prediction process.
+    // Compound prediction calculations always output 16-bit values. Depending
+    // on the bitdepth the values may be treated as int16_t or uint16_t. See
+    // src/dsp/convolve.cc and src/dsp/warp.cc for explanations.
+    // Inter/intra calculations output Pixel values.
+    // These buffers always use width as the stride. This enables packing the
+    // values in and simplifies loads/stores for small values.
+
+    // 10/12 bit compound prediction and 10/12 bit inter/intra prediction.
+    alignas(kMaxAlignment) uint16_t
+        prediction_buffer[2][kMaxSuperBlockSizeSquareInPixels];
+    // 8 bit compound prediction buffer.
+    alignas(kMaxAlignment) int16_t
+        compound_prediction_buffer_8bpp[2][kMaxSuperBlockSizeSquareInPixels];
+
+    // Union usage note: This is used only by functions in the "intra"
+    // prediction path.
+    //
+    // Buffer used for storing subsampled luma samples needed for CFL
+    // prediction. This buffer is used to avoid repetition of the subsampling
+    // for the V plane when it is already done for the U plane.
+    int16_t cfl_luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+  };
+
+  // Buffer used for convolve. The maximum size required for this buffer is:
+  //  maximum block height (with scaling and border) = 2 * 128 + 3 + 4 = 263.
+  //  maximum block stride (with scaling and border aligned to 16) =
+  //     (2 * 128 + 3 + 8 + 5) * pixel_size = 272 * pixel_size.
+  //  Where pixel_size is (bitdepth == 8) ? 1 : 2.
+  // Has an alignment of kMaxAlignment when allocated.
+  AlignedUniquePtr<uint8_t> convolve_block_buffer;
+  ptrdiff_t convolve_block_buffer_stride;
+
+  // Flag indicating whether the data in |cfl_luma_buffer| is valid.
+  bool cfl_luma_buffer_valid;
+
+  // Equivalent to BlockDecoded array in the spec. This stores the decoded
+  // state of every 4x4 block in a superblock. It has 1 row/column border on
+  // all 4 sides (hence the 34x34 dimension instead of 32x32). Note that the
+  // spec uses "-1" as an index to access the left and top borders. In the
+  // code, we treat the index (1, 1) as equivalent to the spec's (0, 0). So
+  // all accesses into this array will be offset by +1 when compared with the
+  // spec.
+  bool block_decoded[kMaxPlanes][kBlockDecodedStride][kBlockDecodedStride];
+};
+
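+// Pool of TileScratchBuffer objects, guarded by a mutex. A minimal usage
+// sketch (illustrative only):
+//   TileScratchBufferPool pool;
+//   pool.Reset(/*bitdepth=*/8);
+//   std::unique_ptr<TileScratchBuffer> buffer = pool.Get();
+//   // ... decode a superblock using |buffer| ...
+//   pool.Release(std::move(buffer));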
+class TileScratchBufferPool {
+ public:
+  void Reset(int bitdepth) {
+    if (bitdepth_ == bitdepth) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth_ == 8 && bitdepth != 8) {
+      // We are going from a pixel size of 1 to a pixel size of 2. So invalidate
+      // the stack.
+      std::lock_guard<std::mutex> lock(mutex_);
+      while (!buffers_.Empty()) {
+        buffers_.Pop();
+      }
+    }
+#endif
+    bitdepth_ = bitdepth;
+  }
+
+  std::unique_ptr<TileScratchBuffer> Get() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (buffers_.Empty()) {
+      std::unique_ptr<TileScratchBuffer> scratch_buffer(new (std::nothrow)
+                                                            TileScratchBuffer);
+      if (scratch_buffer == nullptr || !scratch_buffer->Init(bitdepth_)) {
+        return nullptr;
+      }
+      return scratch_buffer;
+    }
+    return buffers_.Pop();
+  }
+
+  void Release(std::unique_ptr<TileScratchBuffer> scratch_buffer) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    buffers_.Push(std::move(scratch_buffer));
+  }
+
+ private:
+  std::mutex mutex_;
+  // We will never need more than kMaxThreads scratch buffers since that is the
+  // maximum amount of work that will be done at any given time.
+  Stack<std::unique_ptr<TileScratchBuffer>, kMaxThreads> buffers_
+      LIBGAV1_GUARDED_BY(mutex_);
+  int bitdepth_ = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
diff --git a/src/utils/array_2d.h b/src/utils/array_2d.h
new file mode 100644 (file)
index 0000000..df2da9f
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+#define LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Exposes a 1D allocated memory buffer as a 2D array.
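+//
+// For example (illustrative):
+//   int buffer[2 * 3];
+//   Array2DView<int> view(2, 3, buffer);
+//   view[1][2] = 42;  // Writes buffer[1 * 3 + 2].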
+template <typename T>
+class Array2DView {
+ public:
+  Array2DView() = default;
+  Array2DView(int rows, int columns, T* const data) {
+    Reset(rows, columns, data);
+  }
+
+  // Copyable and Movable.
+  Array2DView(const Array2DView& rhs) = default;
+  Array2DView& operator=(const Array2DView& rhs) = default;
+
+  void Reset(int rows, int columns, T* const data) {
+    rows_ = rows;
+    columns_ = columns;
+    data_ = data;
+  }
+
+  int rows() const { return rows_; }
+  int columns() const { return columns_; }
+
+  T* operator[](int row) { return const_cast<T*>(GetRow(row)); }
+
+  const T* operator[](int row) const { return GetRow(row); }
+
+ private:
+  const T* GetRow(int row) const {
+    assert(row < rows_);
+    const ptrdiff_t offset = static_cast<ptrdiff_t>(row) * columns_;
+    return data_ + offset;
+  }
+
+  int rows_ = 0;
+  int columns_ = 0;
+  T* data_ = nullptr;
+};
+
+// Allocates and owns the contiguous memory and exposes an Array2DView of
+// dimension |rows| x |columns|.
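+//
+// For example (illustrative):
+//   Array2D<int> array;
+//   if (array.Reset(4, 8, /*zero_initialize=*/true)) {
+//     array[3][7] = 1;
+//   }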
+template <typename T>
+class Array2D {
+ public:
+  Array2D() = default;
+
+  // Copyable and Movable.
+  Array2D(const Array2D& rhs) = default;
+  Array2D& operator=(const Array2D& rhs) = default;
+
+  LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns,
+                                     bool zero_initialize = true) {
+    size_ = rows * columns;
+    // If T is not a trivial type, we should always reallocate the data_
+    // buffer, so that the destructors of any existing objects are invoked.
+    if (!std::is_trivial<T>::value || allocated_size_ < size_) {
+      // Note: This invokes the global operator new if T is a non-class type,
+      // such as integer or enum types, or a class type that is not derived
+      // from libgav1::Allocable, such as std::unique_ptr. If we enforce a
+      // maximum allocation size or keep track of our own heap memory
+      // consumption, we will need to handle the allocations here that use the
+      // global operator new.
+      if (zero_initialize) {
+        data_.reset(new (std::nothrow) T[size_]());
+      } else {
+        data_.reset(new (std::nothrow) T[size_]);
+      }
+      if (data_ == nullptr) {
+        allocated_size_ = 0;
+        return false;
+      }
+      allocated_size_ = size_;
+    } else if (zero_initialize) {
+      // Cast the data_ pointer to void* to avoid the GCC -Wclass-memaccess
+      // warning. The memset is safe because T is a trivial type.
+      void* dest = data_.get();
+      memset(dest, 0, sizeof(T) * size_);
+    }
+    data_view_.Reset(rows, columns, data_.get());
+    return true;
+  }
+
+  int rows() const { return data_view_.rows(); }
+  int columns() const { return data_view_.columns(); }
+  size_t size() const { return size_; }
+  T* data() { return data_.get(); }
+  const T* data() const { return data_.get(); }
+
+  T* operator[](int row) { return data_view_[row]; }
+
+  const T* operator[](int row) const { return data_view_[row]; }
+
+ private:
+  std::unique_ptr<T[]> data_;
+  size_t allocated_size_ = 0;
+  size_t size_ = 0;
+  Array2DView<T> data_view_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_ARRAY_2D_H_
diff --git a/src/utils/array_2d_test.cc b/src/utils/array_2d_test.cc
new file mode 100644 (file)
index 0000000..0535274
--- /dev/null
@@ -0,0 +1,248 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/array_2d.h"
+
+#include <cstdint>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include "gtest/gtest.h"
+#include "src/utils/compiler_attributes.h"
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr int kRows = 50;
+constexpr int kColumns = 200;
+
+TEST(Array2dViewTest, TestUint8) {
+  uint8_t data[kRows * kColumns] = {};
+  Array2DView<uint8_t> data2d(kRows, kColumns, data);
+
+  // Verify data.
+  data[kColumns] = 100;
+  data[kColumns + 1] = 101;
+  data[kColumns * 2 + 10] = 210;
+  data[kColumns * 2 + 40] = 240;
+  EXPECT_EQ(data2d[1][0], 100);
+  EXPECT_EQ(data2d[1][1], 101);
+  EXPECT_EQ(data2d[2][10], 210);
+  EXPECT_EQ(data2d[2][40], 240);
+
+  // Verify pointers.
+  EXPECT_EQ(data2d[10], data + 10 * kColumns);
+}
+
+TEST(Array2dViewTest, TestUint16) {
+  uint16_t data[kRows * kColumns] = {};
+  Array2DView<uint16_t> data2d(kRows, kColumns, data);
+
+  // Verify data.
+  data[kColumns] = 100;
+  data[kColumns + 1] = 101;
+  data[kColumns * 2 + 10] = 210;
+  data[kColumns * 2 + 40] = 240;
+  EXPECT_EQ(data2d[1][0], 100);
+  EXPECT_EQ(data2d[1][1], 101);
+  EXPECT_EQ(data2d[2][10], 210);
+  EXPECT_EQ(data2d[2][40], 240);
+
+  // Verify pointers.
+  EXPECT_EQ(data2d[10], data + 10 * kColumns);
+}
+
+TEST(Array2dViewTest, TestUint8Const) {
+  uint8_t data[kRows * kColumns] = {};
+  // Declared as const to provide a read-only view of |data|.
+  const Array2DView<uint8_t> data2d(kRows, kColumns, data);
+
+  // Verify data.
+  data[kColumns] = 100;
+  data[kColumns + 1] = 101;
+  data[kColumns * 2 + 10] = 210;
+  data[kColumns * 2 + 40] = 240;
+  EXPECT_EQ(data2d[1][0], 100);
+  EXPECT_EQ(data2d[1][1], 101);
+  EXPECT_EQ(data2d[2][10], 210);
+  EXPECT_EQ(data2d[2][40], 240);
+
+  // Verify pointers.
+  EXPECT_EQ(data2d[10], data + 10 * kColumns);
+}
+
+TEST(Array2dTest, TestUint8) {
+  Array2D<uint8_t> data2d;
+  ASSERT_TRUE(data2d.Reset(kRows, kColumns, true));
+
+  EXPECT_EQ(data2d.rows(), kRows);
+  EXPECT_EQ(data2d.columns(), kColumns);
+
+  // Verify pointers.
+  for (int i = 0; i < kRows; ++i) {
+    EXPECT_NE(data2d[i], nullptr);
+  }
+
+  // Verify data (must be zero initialized).
+  for (int i = 0; i < kRows; ++i) {
+    for (int j = 0; j < kColumns; ++j) {
+      EXPECT_EQ(data2d[i][j], 0) << "Mismatch in [" << i << "][" << j << "]";
+    }
+  }
+
+  // Reset to a 2d array of smaller size with zero_initialize == false.
+  data2d[0][0] = 10;
+  ASSERT_TRUE(data2d.Reset(kRows - 1, kColumns - 1, false));
+
+  EXPECT_EQ(data2d.rows(), kRows - 1);
+  EXPECT_EQ(data2d.columns(), kColumns - 1);
+
+  // Verify pointers.
+  for (int i = 0; i < kRows - 1; ++i) {
+    EXPECT_NE(data2d[i], nullptr);
+  }
+
+  // Verify data (all values must still be zero from the earlier
+  // zero-initialized Reset, except for [0][0] which was explicitly set to 10).
+  for (int i = 0; i < kRows - 1; ++i) {
+    for (int j = 0; j < kColumns - 1; ++j) {
+      if (i == 0 && j == 0) {
+        EXPECT_EQ(data2d[i][j], 10) << "Mismatch in [" << i << "][" << j << "]";
+      } else {
+        EXPECT_EQ(data2d[i][j], 0) << "Mismatch in [" << i << "][" << j << "]";
+      }
+    }
+  }
+
+  // Reset to a 2d array of smaller size with zero_initialize == true.
+  ASSERT_TRUE(data2d.Reset(kRows - 2, kColumns - 2, true));
+
+  EXPECT_EQ(data2d.rows(), kRows - 2);
+  EXPECT_EQ(data2d.columns(), kColumns - 2);
+
+  // Verify pointers.
+  for (int i = 0; i < kRows - 2; ++i) {
+    EXPECT_NE(data2d[i], nullptr);
+  }
+
+  // Verify data (must be zero initialized).
+  for (int i = 0; i < kRows - 2; ++i) {
+    for (int j = 0; j < kColumns - 2; ++j) {
+      EXPECT_EQ(data2d[i][j], 0) << "Mismatch in [" << i << "][" << j << "]";
+    }
+  }
+}
+
+TEST(Array2dTest, TestUniquePtr1) {
+  // A simple class that sets an int value to 0 in the destructor.
+  class Cleaner {
+   public:
+    explicit Cleaner(int* value) : value_(value) {}
+    ~Cleaner() { *value_ = 0; }
+
+   private:
+    int* value_;
+  };
+  int value = 100;
+  Array2D<std::unique_ptr<Cleaner>> data2d;
+  ASSERT_TRUE(data2d.Reset(4, 4, true));
+  data2d[0][0].reset(new (std::nothrow) Cleaner(&value));
+  EXPECT_EQ(value, 100);
+  // Reset to a smaller size. Depending on the implementation, the data_ buffer
+  // may or may not be reused.
+  ASSERT_TRUE(data2d.Reset(2, 2, true));
+  // Reset to a much larger size. The data_ buffer will be reallocated.
+  ASSERT_TRUE(data2d.Reset(32, 32, true));
+  // The destructors of all elements in the former data_ buffer should have
+  // been invoked.
+  EXPECT_EQ(value, 0);
+}
+
+TEST(Array2dTest, TestUniquePtr2) {
+  // A simple class that sets an int value to 0 in the destructor.
+  class Cleaner {
+   public:
+    explicit Cleaner(int* value) : value_(value) {}
+    ~Cleaner() { *value_ = 0; }
+
+   private:
+    int* value_;
+  };
+  int value1 = 100;
+  int value2 = 200;
+  Array2D<std::unique_ptr<Cleaner>> data2d;
+  ASSERT_TRUE(data2d.Reset(4, 4, false));
+  data2d[0][0].reset(new (std::nothrow) Cleaner(&value1));
+  data2d[3][3].reset(new (std::nothrow) Cleaner(&value2));
+  EXPECT_EQ(value1, 100);
+  EXPECT_EQ(value2, 200);
+  // Reset to a smaller size. Whether or not the data_ buffer is reused, the
+  // destructors of all existing elements should be invoked.
+  ASSERT_TRUE(data2d.Reset(2, 2, false));
+  EXPECT_EQ(value1, 0);
+  EXPECT_EQ(value2, 0);
+}
+
+// Shows that std::is_standard_layout is not relevant to the default
+// initialization vs. value initialization issue, but std::is_trivial is.
+TEST(Array2dTest, TestStructInit) {
+  // Make one data member private so that this struct does not have a standard
+  // layout. This also makes the struct not a POD type.
+  struct Point {
+    int x;
+    int Y() const { return y; }
+
+   private:
+    int y;
+  };
+
+  EXPECT_TRUE(std::is_trivial<Point>::value);
+  EXPECT_FALSE(std::is_standard_layout<Point>::value);
+
+  // The Point structs in this array are default initialized.
+  Array2D<Point> data2d_default_init;
+  ASSERT_TRUE(data2d_default_init.Reset(kRows, kColumns, false));
+  // The Point structs in this array are value initialized (i.e., zero
+  // initialized).
+  Array2D<Point> data2d;
+  ASSERT_TRUE(data2d.Reset(kRows, kColumns, true));
+
+#if LIBGAV1_MSAN
+  // Use MemorySanitizer to check Reset(rows, columns, false) does not
+  // initialize the memory while Reset(rows, columns, true) does.
+  //
+  // __msan_test_shadow(const void *x, uptr size) returns the offset of the
+  // first (at least partially) poisoned byte in the range, or -1 if the whole
+  // range is good.
+  for (int i = 0; i < kRows; ++i) {
+    EXPECT_EQ(__msan_test_shadow(data2d_default_init[i],
+                                 sizeof(data2d_default_init[0][0]) * kColumns),
+              0);
+    EXPECT_EQ(__msan_test_shadow(data2d[i], sizeof(data2d[0][0]) * kColumns),
+              -1);
+    for (int j = 0; j < kColumns; ++j) {
+      EXPECT_EQ(data2d[i][j].x, 0);
+      EXPECT_EQ(data2d[i][j].Y(), 0);
+    }
+  }
+#endif
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/bit_mask_set.h b/src/utils/bit_mask_set.h
new file mode 100644 (file)
index 0000000..7371753
--- /dev/null
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+#define LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// This class is used to check if a given value is equal to one of the several
+// predetermined values using a bit mask instead of a chain of comparisons and
+// ||s. This usually results in fewer instructions.
+//
+// Usage:
+//   constexpr BitMaskSet set(value1, value2);
+//   set.Contains(value1) => returns true.
+//   set.Contains(value3) => returns false.
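+//
+// For example (illustrative), BitMaskSet(2, 5) builds the mask
+// (1 << 2) | (1 << 5) = 0b100100, and Contains(5) tests bit 5:
+// ((0b100100 >> 5) & 1) != 0 => true.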
+class BitMaskSet {
+ public:
+  explicit constexpr BitMaskSet(uint32_t mask) : mask_(mask) {}
+
+  constexpr BitMaskSet(int v1, int v2) : mask_((1U << v1) | (1U << v2)) {}
+
+  constexpr BitMaskSet(int v1, int v2, int v3)
+      : mask_((1U << v1) | (1U << v2) | (1U << v3)) {}
+
+  constexpr BitMaskSet(int v1, int v2, int v3, int v4)
+      : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4)) {}
+
+  constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5)
+      : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5)) {}
+
+  constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6)
+      : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+              (1U << v6)) {}
+
+  constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7)
+      : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+              (1U << v6) | (1U << v7)) {}
+
+  constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+                       int v8, int v9)
+      : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+              (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9)) {}
+
+  constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+                       int v8, int v9, int v10)
+      : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+              (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9) | (1U << v10)) {
+  }
+
+  constexpr bool Contains(uint8_t value) const {
+    return MaskContainsValue(mask_, value);
+  }
+
+  static constexpr bool MaskContainsValue(uint32_t mask, uint8_t value) {
+    return ((mask >> value) & 1) != 0;
+  }
+
+ private:
+  const uint32_t mask_;
+};
+
+}  // namespace libgav1
+#endif  // LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
diff --git a/src/utils/bit_reader.cc b/src/utils/bit_reader.cc
new file mode 100644 (file)
index 0000000..3234128
--- /dev/null
@@ -0,0 +1,117 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/bit_reader.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace {
+
+bool Assign(int* const value, int assignment, bool return_value) {
+  *value = assignment;
+  return return_value;
+}
+
+// 5.9.29.
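+// For example (illustrative), with r = 3 the values v = 0, 1, 2, 3, 4, 5, 6
+// map to 3, 2, 4, 1, 5, 0, 6: v fans out around the reference r and continues
+// linearly once v > 2 * r.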
+int InverseRecenter(int r, int v) {
+  if (v > (r << 1)) {
+    return v;
+  }
+  if ((v & 1) != 0) {
+    return r - ((v + 1) >> 1);
+  }
+  return r + (v >> 1);
+}
+
+}  // namespace
+
+bool BitReader::DecodeSignedSubexpWithReference(int low, int high,
+                                                int reference, int control,
+                                                int* const value) {
+  if (!DecodeUnsignedSubexpWithReference(high - low, reference - low, control,
+                                         value)) {
+    return false;
+  }
+  *value += low;
+  return true;
+}
+
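+// A worked example (illustrative) for n = 5: w = 3 and m = 3, so v is read
+// with 2 bits; v < 3 is returned as-is, while v == 3 consumes one extra bit
+// and yields (3 << 1) - 3 + extra_bit, i.e. 3 or 4.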
+bool BitReader::DecodeUniform(int n, int* const value) {
+  if (n <= 1) {
+    return Assign(value, 0, true);
+  }
+  const int w = FloorLog2(n) + 1;
+  const int m = (1 << w) - n;
+  assert(w - 1 < 32);
+  const int v = static_cast<int>(ReadLiteral(w - 1));
+  if (v == -1) {
+    return Assign(value, 0, false);
+  }
+  if (v < m) {
+    return Assign(value, v, true);
+  }
+  const int extra_bit = ReadBit();
+  if (extra_bit == -1) {
+    return Assign(value, 0, false);
+  }
+  return Assign(value, (v << 1) - m + extra_bit, true);
+}
+
+bool BitReader::DecodeUnsignedSubexpWithReference(int mx, int reference,
+                                                  int control,
+                                                  int* const value) {
+  int v;
+  if (!DecodeSubexp(mx, control, &v)) return false;
+  if ((reference << 1) <= mx) {
+    *value = InverseRecenter(reference, v);
+  } else {
+    *value = mx - 1 - InverseRecenter(mx - 1 - reference, v);
+  }
+  return true;
+}
+
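+// Decodes a subexponential code: symbols are grouped into buckets whose sizes
+// start at 1 << control and double after the second bucket (e.g. with
+// control = 3: 8, 8, 16, 32, ...). A "1" bit skips to the next bucket, a "0"
+// bit selects a value within the current bucket, and the final partial bucket
+// is decoded with DecodeUniform().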
+bool BitReader::DecodeSubexp(int num_symbols, int control, int* const value) {
+  int i = 0;
+  int mk = 0;
+  while (true) {
+    const int b = (i != 0) ? control + i - 1 : control;
+    if (b >= 32) {
+      return Assign(value, 0, false);
+    }
+    const int a = 1 << b;
+    if (num_symbols <= mk + 3 * a) {
+      if (!DecodeUniform(num_symbols - mk, value)) return false;
+      *value += mk;
+      return true;
+    }
+    const int8_t subexp_more_bits = ReadBit();
+    if (subexp_more_bits == -1) return false;
+    if (subexp_more_bits != 0) {
+      ++i;
+      mk += a;
+    } else {
+      const int subexp_bits = static_cast<int>(ReadLiteral(b));
+      if (subexp_bits == -1) {
+        return Assign(value, 0, false);
+      }
+      return Assign(value, subexp_bits + mk, true);
+    }
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/utils/bit_reader.h b/src/utils/bit_reader.h
new file mode 100644 (file)
index 0000000..5a10e12
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BIT_READER_H_
+#define LIBGAV1_SRC_UTILS_BIT_READER_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+class BitReader {
+ public:
+  virtual ~BitReader() = default;
+
+  virtual int ReadBit() = 0;
+  // |num_bits| has to be <= 32. The function returns a value in the range [0,
+  // 2^num_bits - 1] (inclusive) on success and -1 on failure.
+  virtual int64_t ReadLiteral(int num_bits) = 0;
+
+  bool DecodeSignedSubexpWithReference(int low, int high, int reference,
+                                       int control, int* value);  // 5.9.26.
+  // Decodes a nonnegative integer with maximum number of values |n| (i.e.,
+  // output in range 0..n-1) by following the process specified in Section
+  // 4.10.7 ns(n) and Section 4.10.10 NS(n) of the spec.
+  bool DecodeUniform(int n, int* value);
+
+ private:
+  // Helper functions for DecodeSignedSubexpWithReference.
+  bool DecodeUnsignedSubexpWithReference(int mx, int reference, int control,
+                                         int* value);           // 5.9.27.
+  bool DecodeSubexp(int num_symbols, int control, int* value);  // 5.9.28.
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_BIT_READER_H_
diff --git a/src/utils/block_parameters_holder.cc b/src/utils/block_parameters_holder.cc
new file mode 100644 (file)
index 0000000..3bb9f1e
--- /dev/null
@@ -0,0 +1,83 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/block_parameters_holder.h"
+
+#include <algorithm>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+bool BlockParametersHolder::Reset(int rows4x4, int columns4x4) {
+  rows4x4_ = rows4x4;
+  columns4x4_ = columns4x4;
+  index_ = 0;
+  return block_parameters_cache_.Reset(rows4x4_, columns4x4_) &&
+         block_parameters_.Resize(rows4x4_ * columns4x4_);
+}
+
+BlockParameters* BlockParametersHolder::Get(int row4x4, int column4x4,
+                                            BlockSize block_size) {
+  const size_t index = index_.fetch_add(1, std::memory_order_relaxed);
+  if (index >= block_parameters_.size()) return nullptr;
+  auto& bp = block_parameters_.get()[index];
+  if (bp == nullptr) {
+    bp.reset(new (std::nothrow) BlockParameters);
+    if (bp == nullptr) return nullptr;
+  }
+  FillCache(row4x4, column4x4, block_size, bp.get());
+  return bp.get();
+}
+
+void BlockParametersHolder::FillCache(int row4x4, int column4x4,
+                                      BlockSize block_size,
+                                      BlockParameters* const bp) {
+  int rows = std::min(static_cast<int>(kNum4x4BlocksHigh[block_size]),
+                      rows4x4_ - row4x4);
+  const int columns = std::min(static_cast<int>(kNum4x4BlocksWide[block_size]),
+                               columns4x4_ - column4x4);
+  auto* bp_dst = &block_parameters_cache_[row4x4][column4x4];
+  // Specialize columns cases (values in kNum4x4BlocksWide[]) for better
+  // performance.
+  if (columns == 1) {
+    SetBlock<BlockParameters*>(rows, 1, bp, bp_dst, columns4x4_);
+  } else if (columns == 2) {
+    SetBlock<BlockParameters*>(rows, 2, bp, bp_dst, columns4x4_);
+  } else if (columns == 4) {
+    SetBlock<BlockParameters*>(rows, 4, bp, bp_dst, columns4x4_);
+  } else if (columns == 8) {
+    SetBlock<BlockParameters*>(rows, 8, bp, bp_dst, columns4x4_);
+  } else if (columns == 16) {
+    SetBlock<BlockParameters*>(rows, 16, bp, bp_dst, columns4x4_);
+  } else if (columns == 32) {
+    SetBlock<BlockParameters*>(rows, 32, bp, bp_dst, columns4x4_);
+  } else {
+    do {
+      // The following loop has better performance than using std::fill().
+      // std::fill() has some overhead in checking zero loop count.
+      int x = columns;
+      auto* d = bp_dst;
+      do {
+        *d++ = bp;
+      } while (--x != 0);
+      bp_dst += columns4x4_;
+    } while (--rows != 0);
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/utils/block_parameters_holder.h b/src/utils/block_parameters_holder.h
new file mode 100644 (file)
index 0000000..ca36907
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
+#define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
+
+#include <atomic>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Holds the BlockParameters pointers to each 4x4 block in the frame.
+class BlockParametersHolder {
+ public:
+  BlockParametersHolder() = default;
+
+  // Not copyable or movable.
+  BlockParametersHolder(const BlockParametersHolder&) = delete;
+  BlockParametersHolder& operator=(const BlockParametersHolder&) = delete;
+
+  LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4);
+
+  // Returns a pointer to a BlockParameters object that can be used safely until
+  // the next call to Reset(). Returns nullptr on memory allocation failure. It
+  // also fills the cache matrix for the block starting at |row4x4|, |column4x4|
+  // of size |block_size| with the returned pointer.
+  BlockParameters* Get(int row4x4, int column4x4, BlockSize block_size);
+
+  // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This
+  // is done as a simple look up of the |block_parameters_cache_| matrix.
+  // Returns nullptr if the BlockParameters cannot be found.
+  BlockParameters* Find(int row4x4, int column4x4) const {
+    return block_parameters_cache_[row4x4][column4x4];
+  }
+
+  BlockParameters** Address(int row4x4, int column4x4) {
+    return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4;
+  }
+
+  BlockParameters* const* Address(int row4x4, int column4x4) const {
+    return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4;
+  }
+
+  int columns4x4() const { return columns4x4_; }
+
+ private:
+  // Needs access to FillCache for testing Cdef.
+  template <int bitdepth, typename Pixel>
+  friend class PostFilterApplyCdefTest;
+
+  void FillCache(int row4x4, int column4x4, BlockSize block_size,
+                 BlockParameters* bp);
+
+  int rows4x4_ = 0;
+  int columns4x4_ = 0;
+
+  // Owns the memory of BlockParameters pointers for the entire frame. It can
+  // hold up to |rows4x4_| * |columns4x4_| objects. Each object will be
+  // allocated on demand and re-used across frames.
+  DynamicBuffer<std::unique_ptr<BlockParameters>> block_parameters_;
+
+  // Points to the next available index of |block_parameters_|.
+  std::atomic<int> index_;
+
+  // This is a 2d array of size |rows4x4_| * |columns4x4_|. It is filled in by
+  // FillCache() and used by Find() to resolve each query with exactly one
+  // lookup (instead of traversing a partition tree).
+  Array2D<BlockParameters*> block_parameters_cache_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
diff --git a/src/utils/block_parameters_holder_test.cc b/src/utils/block_parameters_holder_test.cc
new file mode 100644 (file)
index 0000000..212eba5
--- /dev/null
@@ -0,0 +1,76 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/block_parameters_holder.h"
+
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(BlockParametersHolder, TestBasic) {
+  BlockParametersHolder holder;
+  ASSERT_TRUE(holder.Reset(20, 20));
+
+  // Get a BlockParameters object.
+  BlockParameters* const bp1 = holder.Get(10, 10, kBlock32x32);
+  ASSERT_NE(bp1, nullptr);
+  // Ensure that the cache was filled appropriately. Positions (10, 10) through
+  // (17, 17) should all be bp1 (a 32x32 block spans 8 4x4 units, and
+  // 10 + 8 = 18).
+  for (int i = 10; i < 18; ++i) {
+    for (int j = 10; j < 18; ++j) {
+      EXPECT_EQ(holder.Find(i, j), bp1)
+          << "Mismatch in (" << i << ", " << j << ")";
+    }
+  }
+
+  // Get the maximum number of BlockParameters objects.
+  for (int i = 0; i < 399; ++i) {
+    EXPECT_NE(holder.Get(10, 10, kBlock32x32), nullptr)
+        << "Mismatch in index " << i;
+  }
+
+  // Get() should now return nullptr since there are no more BlockParameters
+  // objects available.
+  EXPECT_EQ(holder.Get(10, 10, kBlock32x32), nullptr);
+
+  // Reset the holder to the same size.
+  ASSERT_TRUE(holder.Reset(20, 20));
+
+  // Get a BlockParameters object. This should be the same as bp1 since the
+  // holder was Reset to the same size.
+  BlockParameters* const bp2 = holder.Get(10, 10, kBlock32x32);
+  EXPECT_EQ(bp2, bp1);
+
+  // Reset the holder to a smaller size.
+  ASSERT_TRUE(holder.Reset(20, 10));
+
+  // Get a BlockParameters object. This should be the same as bp1 since the
+  // holder was Reset to a smaller size.
+  BlockParameters* const bp3 = holder.Get(0, 0, kBlock32x32);
+  EXPECT_EQ(bp3, bp1);
+
+  // Reset the holder to a larger size.
+  ASSERT_TRUE(holder.Reset(30, 30));
+
+  // Get a BlockParameters object. This may or may not be the same as bp1 since
+  // the holder was Reset to a larger size.
+  BlockParameters* const bp4 = holder.Get(0, 0, kBlock32x32);
+  EXPECT_NE(bp4, nullptr);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/blocking_counter.h b/src/utils/blocking_counter.h
new file mode 100644 (file)
index 0000000..6d664f8
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
+#define LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
+
+#include <cassert>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <mutex>               // NOLINT (unapproved c++11 header)
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Implementation of a Blocking Counter that is used for the "fork-join"
+// use case. Typical usage would be as follows:
+//   BlockingCounter counter(num_jobs);
+//     - spawn the jobs.
+//     - call counter.Wait() on the master thread.
+//     - worker threads will call counter.Decrement().
+//     - master thread will return from counter.Wait() when all workers are
+//     complete.
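+//
+// A minimal sketch, assuming a ThreadPool-like scheduler and a hypothetical
+// DoWork() job body:
+//   BlockingCounter counter(num_jobs);
+//   for (int i = 0; i < num_jobs; ++i) {
+//     thread_pool->Schedule([&counter]() {
+//       DoWork();
+//       counter.Decrement();
+//     });
+//   }
+//   counter.Wait();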
+template <bool has_failure_status>
+class BlockingCounterImpl {
+ public:
+  explicit BlockingCounterImpl(int initial_count)
+      : count_(initial_count), job_failed_(false) {}
+
+  // Increment the counter by |count|. This must be called before Wait() is
+  // called. This must be called from the same thread that will call Wait().
+  void IncrementBy(int count) {
+    assert(count >= 0);
+    std::unique_lock<std::mutex> lock(mutex_);
+    count_ += count;
+  }
+
+  // Decrement the counter by 1. This function can be called only when
+  // |has_failure_status| is false, i.e., when this class is being used with
+  // the |BlockingCounter| alias.
+  void Decrement() {
+    static_assert(!has_failure_status, "");
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (--count_ == 0) {
+      condition_.notify_one();
+    }
+  }
+
+  // Decrement the counter by 1. This function can be called only when
+  // |has_failure_status| is true, i.e., when this class is being used with
+  // the |BlockingCounterWithStatus| alias. |job_succeeded| is used to update
+  // the state of |job_failed_|.
+  void Decrement(bool job_succeeded) {
+    static_assert(has_failure_status, "");
+    std::unique_lock<std::mutex> lock(mutex_);
+    job_failed_ |= !job_succeeded;
+    if (--count_ == 0) {
+      condition_.notify_one();
+    }
+  }
+
+  // Block until the counter becomes 0. This function can be called only once
+  // per object. If |has_failure_status| is true, true is returned if all the
+  // jobs succeeded and false is returned if any of the jobs failed. If
+  // |has_failure_status| is false, this function always returns true.
+  bool Wait() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    condition_.wait(lock, [this]() { return count_ == 0; });
+    // If |has_failure_status| is false, we simply return true.
+    return has_failure_status ? !job_failed_ : true;
+  }
+
+ private:
+  std::mutex mutex_;
+  std::condition_variable condition_;
+  int count_ LIBGAV1_GUARDED_BY(mutex_);
+  bool job_failed_ LIBGAV1_GUARDED_BY(mutex_);
+};
+
+using BlockingCounterWithStatus = BlockingCounterImpl<true>;
+using BlockingCounter = BlockingCounterImpl<false>;
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
diff --git a/src/utils/blocking_counter_test.cc b/src/utils/blocking_counter_test.cc
new file mode 100644 (file)
index 0000000..1b6e7f5
--- /dev/null
@@ -0,0 +1,127 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/blocking_counter.h"
+
+#include <array>
+#include <memory>
+
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kNumWorkers = 10;
+constexpr int kNumJobs = 20;
+
+TEST(BlockingCounterTest, BasicFunctionality) {
+  std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers);
+  BlockingCounter counter(kNumJobs);
+  std::array<bool, kNumJobs> done = {};
+
+  // Schedule the jobs.
+  for (int i = 0; i < kNumJobs; ++i) {
+    pool->Schedule([&counter, &done, i]() {
+      absl::SleepFor(absl::Seconds(1));
+      done[i] = true;
+      counter.Decrement();
+    });
+  }
+
+  // Wait for the jobs to complete. This should always return true.
+  ASSERT_TRUE(counter.Wait());
+
+  // Make sure the jobs were actually complete.
+  for (const auto& job_done : done) {
+    EXPECT_TRUE(job_done);
+  }
+}
+
+TEST(BlockingCounterTest, IncrementBy) {
+  std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers);
+  BlockingCounter counter(0);
+  std::array<bool, kNumJobs> done = {};
+
+  // Schedule the jobs.
+  for (int i = 0; i < kNumJobs; ++i) {
+    counter.IncrementBy(1);
+    pool->Schedule([&counter, &done, i]() {
+      absl::SleepFor(absl::Seconds(1));
+      done[i] = true;
+      counter.Decrement();
+    });
+  }
+
+  // Wait for the jobs to complete. This should always return true.
+  ASSERT_TRUE(counter.Wait());
+
+  // Make sure the jobs were actually complete.
+  for (const auto& job_done : done) {
+    EXPECT_TRUE(job_done);
+  }
+}
+
+TEST(BlockingCounterWithStatusTest, BasicFunctionality) {
+  std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers);
+  BlockingCounterWithStatus counter(kNumJobs);
+  std::array<bool, kNumJobs> done = {};
+
+  // Schedule the jobs.
+  for (int i = 0; i < kNumJobs; ++i) {
+    pool->Schedule([&counter, &done, i]() {
+      absl::SleepFor(absl::Seconds(1));
+      done[i] = true;
+      counter.Decrement(true);
+    });
+  }
+
+  // Wait for the jobs to complete. This should return true since all the jobs
+  // reported |job_succeeded| as true.
+  ASSERT_TRUE(counter.Wait());
+
+  // Make sure the jobs were actually complete.
+  for (const auto& job_done : done) {
+    EXPECT_TRUE(job_done);
+  }
+}
+
+TEST(BlockingCounterWithStatusTest, BasicFunctionalityWithStatus) {
+  std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers);
+  BlockingCounterWithStatus counter(kNumJobs);
+  std::array<bool, kNumJobs> done = {};
+
+  // Schedule the jobs.
+  for (int i = 0; i < kNumJobs; ++i) {
+    pool->Schedule([&counter, &done, i]() {
+      absl::SleepFor(absl::Seconds(1));
+      done[i] = true;
+      counter.Decrement(i != 10);
+    });
+  }
+
+  // Wait for the jobs to complete. This should return false since one of the
+  // jobs reported |job_succeeded| as false.
+  ASSERT_FALSE(counter.Wait());
+
+  // Make sure the jobs were actually complete.
+  for (const auto& job_done : done) {
+    EXPECT_TRUE(job_done);
+  }
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/common.h b/src/utils/common.h
new file mode 100644 (file)
index 0000000..f75ace8
--- /dev/null
@@ -0,0 +1,555 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_COMMON_H_
+#define LIBGAV1_SRC_UTILS_COMMON_H_
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
+#if defined(_M_X64) || defined(_M_ARM64)
+#pragma intrinsic(_BitScanReverse64)
+#define HAVE_BITSCANREVERSE64
+#endif  // defined(_M_X64) || defined(_M_ARM64)
+#endif  // defined(_MSC_VER)
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <type_traits>
+
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// LIBGAV1_RESTRICT
+// Declares a pointer with the restrict type qualifier if available.
+// This allows code to hint to the compiler that only this pointer references a
+// particular object or memory region within the scope of the block in which it
+// is declared. This may allow for improved optimizations due to the lack of
+// pointer aliasing. See also:
+// https://en.cppreference.com/w/c/language/restrict
+// Note: a template alias is not used, for compatibility with older compilers
+// (e.g., gcc < 10) that do not expand the type when instantiating a template
+// function, either explicitly or in an assignment to a function pointer as is
+// done within the dsp code. RestrictPtr<T>::type is an alternative to this,
+// similar to std::add_const, but for conciseness the macro is preferred.
+#ifdef __GNUC__
+#define LIBGAV1_RESTRICT __restrict__
+#elif defined(_MSC_VER)
+#define LIBGAV1_RESTRICT __restrict
+#else
+#define LIBGAV1_RESTRICT
+#endif
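+
+// A usage sketch (a hypothetical function declaration, not part of this
+// header); annotating both pointers tells the compiler they do not alias:
+//
+//   void CopyRow(uint8_t* LIBGAV1_RESTRICT dst,
+//                const uint8_t* LIBGAV1_RESTRICT src, int width);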
+
+// Aligns |value| to the desired |alignment|. |alignment| must be a power of 2.
+template <typename T>
+inline T Align(T value, T alignment) {
+  assert(alignment != 0);
+  const T alignment_mask = alignment - 1;
+  return (value + alignment_mask) & ~alignment_mask;
+}
+
+// Aligns |addr| to the desired |alignment|. |alignment| must be a power of 2.
+inline uint8_t* AlignAddr(uint8_t* const addr, const uintptr_t alignment) {
+  const auto value = reinterpret_cast<uintptr_t>(addr);
+  return reinterpret_cast<uint8_t*>(Align(value, alignment));
+}
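+
+// For example, Align(13, 8) == 16 and Align(16, 8) == 16; AlignAddr() applies
+// the same rounding to the numeric value of |addr|.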
+
+inline int32_t Clip3(int32_t value, int32_t low, int32_t high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+template <typename Pixel>
+void ExtendLine(void* const line_start, const int width, const int left,
+                const int right) {
+  auto* const start = static_cast<Pixel*>(line_start);
+  const Pixel* src = start;
+  Pixel* dst = start - left;
+  // Copy to left and right borders.
+  Memset(dst, src[0], left);
+  Memset(dst + left + width, src[width - 1], right);
+}
+
+// The following 2 templates set a block of data with uncontiguous memory to
+// |value|. The compilers usually generate several branches to handle different
+// cases of |columns| when inlining memset() and std::fill(), and these branches
+// are unfortunately within the loop of |rows|. So calling these templates
+// directly could be inefficient. It is recommended to specialize common cases
+// of |columns|, such as 1, 2, 4, 8, 16, and 32, before processing the generic
+// case of |columns|. The code size may be larger, but
+// there would be big speed gains.
+// Call template MemSetBlock<> when sizeof(|T|) is 1.
+// Call template SetBlock<> when sizeof(|T|) is larger than 1.
+template <typename T>
+void MemSetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+  static_assert(sizeof(T) == 1, "");
+  do {
+    memset(dst, value, columns);
+    dst += stride;
+  } while (--rows != 0);
+}
+
+template <typename T>
+void SetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+  do {
+    std::fill(dst, dst + columns, value);
+    dst += stride;
+  } while (--rows != 0);
+}
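+
+// A sketch of the specialization recommended above (a hypothetical caller, not
+// part of this header); dispatching on common |columns| values lets the
+// compiler inline memset() with a known size in each case:
+//
+//   switch (columns) {
+//     case 4:  MemSetBlock<uint8_t>(rows, 4, value, dst, stride); break;
+//     case 8:  MemSetBlock<uint8_t>(rows, 8, value, dst, stride); break;
+//     default: MemSetBlock<uint8_t>(rows, columns, value, dst, stride); break;
+//   }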
+
+#if defined(__GNUC__)
+
+inline int CountLeadingZeros(uint32_t n) {
+  assert(n != 0);
+  return __builtin_clz(n);
+}
+
+inline int CountLeadingZeros(uint64_t n) {
+  assert(n != 0);
+  return __builtin_clzll(n);
+}
+
+inline int CountTrailingZeros(uint32_t n) {
+  assert(n != 0);
+  return __builtin_ctz(n);
+}
+
+#elif defined(_MSC_VER)
+
+inline int CountLeadingZeros(uint32_t n) {
+  assert(n != 0);
+  unsigned long first_set_bit;  // NOLINT(runtime/int)
+  const unsigned char bit_set = _BitScanReverse(&first_set_bit, n);
+  assert(bit_set != 0);
+  static_cast<void>(bit_set);
+  return 31 ^ static_cast<int>(first_set_bit);
+}
+
+inline int CountLeadingZeros(uint64_t n) {
+  assert(n != 0);
+  unsigned long first_set_bit;  // NOLINT(runtime/int)
+#if defined(HAVE_BITSCANREVERSE64)
+  const unsigned char bit_set =
+      _BitScanReverse64(&first_set_bit, static_cast<unsigned __int64>(n));
+#else   // !defined(HAVE_BITSCANREVERSE64)
+  const auto n_hi = static_cast<unsigned long>(n >> 32);  // NOLINT(runtime/int)
+  if (n_hi != 0) {
+    const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi);
+    assert(bit_set != 0);
+    static_cast<void>(bit_set);
+    return 31 ^ static_cast<int>(first_set_bit);
+  }
+  const unsigned char bit_set = _BitScanReverse(
+      &first_set_bit, static_cast<unsigned long>(n));  // NOLINT(runtime/int)
+#endif  // defined(HAVE_BITSCANREVERSE64)
+  assert(bit_set != 0);
+  static_cast<void>(bit_set);
+  return 63 ^ static_cast<int>(first_set_bit);
+}
+
+#undef HAVE_BITSCANREVERSE64
+
+inline int CountTrailingZeros(uint32_t n) {
+  assert(n != 0);
+  unsigned long first_set_bit;  // NOLINT(runtime/int)
+  const unsigned char bit_set = _BitScanForward(&first_set_bit, n);
+  assert(bit_set != 0);
+  static_cast<void>(bit_set);
+  return static_cast<int>(first_set_bit);
+}
+
+#else  // !defined(__GNUC__) && !defined(_MSC_VER)
+
+template <const int kMSB, typename T>
+inline int CountLeadingZeros(T n) {
+  assert(n != 0);
+  const T msb = T{1} << kMSB;
+  int count = 0;
+  while ((n & msb) == 0) {
+    ++count;
+    n <<= 1;
+  }
+  return count;
+}
+
+inline int CountLeadingZeros(uint32_t n) { return CountLeadingZeros<31>(n); }
+
+inline int CountLeadingZeros(uint64_t n) { return CountLeadingZeros<63>(n); }
+
+// This is the algorithm on the left in Figure 5-23, Hacker's Delight, Second
+// Edition, page 109. The book says:
+//   If the number of trailing 0's is expected to be small or large, then the
+//   simple loops shown in Figure 5-23 are quite fast.
+inline int CountTrailingZeros(uint32_t n) {
+  assert(n != 0);
+  // Create a word with 1's at the positions of the trailing 0's in |n|, and
+  // 0's elsewhere (e.g., 01011000 => 00000111).
+  n = ~n & (n - 1);
+  int count = 0;
+  while (n != 0) {
+    ++count;
+    n >>= 1;
+  }
+  return count;
+}
+
+#endif  // defined(__GNUC__)
+
+inline int FloorLog2(int32_t n) {
+  assert(n > 0);
+  return 31 ^ CountLeadingZeros(static_cast<uint32_t>(n));
+}
+
+inline int FloorLog2(uint32_t n) {
+  assert(n > 0);
+  return 31 ^ CountLeadingZeros(n);
+}
+
+inline int FloorLog2(int64_t n) {
+  assert(n > 0);
+  return 63 ^ CountLeadingZeros(static_cast<uint64_t>(n));
+}
+
+inline int FloorLog2(uint64_t n) {
+  assert(n > 0);
+  return 63 ^ CountLeadingZeros(n);
+}
+
+inline int CeilLog2(unsigned int n) {
+  // The expression FloorLog2(n - 1) + 1 is undefined not only for n == 0 but
+  // also for n == 1, so this expression must be guarded by the n < 2 test. An
+  // alternative implementation is:
+  // return (n == 0) ? 0 : FloorLog2(n) + static_cast<int>((n & (n - 1)) != 0);
+  return (n < 2) ? 0 : FloorLog2(n - 1) + 1;
+}
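+
+// For example, FloorLog2(9) == 3 while CeilLog2(9) == 4; for exact powers of
+// two the two functions agree, e.g. FloorLog2(8) == CeilLog2(8) == 3.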
+
+inline int RightShiftWithCeiling(int value, int bits) {
+  assert(bits > 0);
+  return (value + (1 << bits) - 1) >> bits;
+}
+
+inline int32_t RightShiftWithRounding(int32_t value, int bits) {
+  assert(bits >= 0);
+  return (value + ((1 << bits) >> 1)) >> bits;
+}
+
+inline uint32_t RightShiftWithRounding(uint32_t value, int bits) {
+  assert(bits >= 0);
+  return (value + ((1 << bits) >> 1)) >> bits;
+}
+
+// This variant is used when |value| can exceed 32 bits, although the final
+// result must always fit into int32_t.
+inline int32_t RightShiftWithRounding(int64_t value, int bits) {
+  assert(bits >= 0);
+  return static_cast<int32_t>((value + ((int64_t{1} << bits) >> 1)) >> bits);
+}
+
+inline int32_t RightShiftWithRoundingSigned(int32_t value, int bits) {
+  assert(bits > 0);
+  // The next line is equivalent to:
+  // return (value >= 0) ? RightShiftWithRounding(value, bits)
+  //                     : -RightShiftWithRounding(-value, bits);
+  return RightShiftWithRounding(value + (value >> 31), bits);
+}
+
+// This variant is used when |value| can exceed 32 bits, although the final
+// result must always fit into int32_t.
+inline int32_t RightShiftWithRoundingSigned(int64_t value, int bits) {
+  assert(bits > 0);
+  // The next line is equivalent to:
+  // return (value >= 0) ? RightShiftWithRounding(value, bits)
+  //                     : -RightShiftWithRounding(-value, bits);
+  return RightShiftWithRounding(value + (value >> 63), bits);
+}
+
+constexpr int DivideBy2(int n) { return n >> 1; }
+constexpr int DivideBy4(int n) { return n >> 2; }
+constexpr int DivideBy8(int n) { return n >> 3; }
+constexpr int DivideBy16(int n) { return n >> 4; }
+constexpr int DivideBy32(int n) { return n >> 5; }
+constexpr int DivideBy64(int n) { return n >> 6; }
+constexpr int DivideBy128(int n) { return n >> 7; }
+
+// Convert |value| to unsigned before shifting to avoid undefined behavior with
+// negative values.
+inline int LeftShift(int value, int bits) {
+  assert(bits >= 0);
+  assert(value >= -(int64_t{1} << (31 - bits)));
+  assert(value <= (int64_t{1} << (31 - bits)) - ((bits == 0) ? 1 : 0));
+  return static_cast<int>(static_cast<uint32_t>(value) << bits);
+}
+inline int MultiplyBy2(int n) { return LeftShift(n, 1); }
+inline int MultiplyBy4(int n) { return LeftShift(n, 2); }
+inline int MultiplyBy8(int n) { return LeftShift(n, 3); }
+inline int MultiplyBy16(int n) { return LeftShift(n, 4); }
+inline int MultiplyBy32(int n) { return LeftShift(n, 5); }
+inline int MultiplyBy64(int n) { return LeftShift(n, 6); }
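+
+// For example, MultiplyBy4(-3) == -12; the cast to uint32_t in LeftShift()
+// avoids the undefined behavior of left-shifting a negative value directly.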
+
+constexpr int Mod32(int n) { return n & 0x1f; }
+constexpr int Mod64(int n) { return n & 0x3f; }
+
+//------------------------------------------------------------------------------
+// Bitstream functions
+
+constexpr bool IsIntraFrame(FrameType type) {
+  return type == kFrameKey || type == kFrameIntraOnly;
+}
+
+inline TransformClass GetTransformClass(TransformType tx_type) {
+  constexpr BitMaskSet kTransformClassVerticalMask(
+      kTransformTypeIdentityDct, kTransformTypeIdentityAdst,
+      kTransformTypeIdentityFlipadst);
+  if (kTransformClassVerticalMask.Contains(tx_type)) {
+    return kTransformClassVertical;
+  }
+  constexpr BitMaskSet kTransformClassHorizontalMask(
+      kTransformTypeDctIdentity, kTransformTypeAdstIdentity,
+      kTransformTypeFlipadstIdentity);
+  if (kTransformClassHorizontalMask.Contains(tx_type)) {
+    return kTransformClassHorizontal;
+  }
+  return kTransformClass2D;
+}
+
+inline int RowOrColumn4x4ToPixel(int row_or_column4x4, Plane plane,
+                                 int8_t subsampling) {
+  return MultiplyBy4(row_or_column4x4) >> (plane == kPlaneY ? 0 : subsampling);
+}
+
+constexpr PlaneType GetPlaneType(Plane plane) {
+  return static_cast<PlaneType>(plane != kPlaneY);
+}
+
+// 5.11.44.
+constexpr bool IsDirectionalMode(PredictionMode mode) {
+  return mode >= kPredictionModeVertical && mode <= kPredictionModeD67;
+}
+
+// 5.9.3.
+//
+// |a| and |b| are order hints, treated as unsigned order_hint_bits-bit
+// integers. |order_hint_shift_bits| equals (32 - order_hint_bits) % 32.
+// order_hint_bits is at most 8, so |order_hint_shift_bits| is zero or a
+// value between 24 and 31 (inclusive).
+//
+// If |order_hint_shift_bits| is zero, |a| and |b| are both zeros, and the
+// result is zero. If |order_hint_shift_bits| is not zero, returns the
+// signed difference |a| - |b| using "modular arithmetic". More precisely, the
+// signed difference |a| - |b| is treated as a signed order_hint_bits-bit
+// integer and cast to an int. The returned difference is between
+// -(1 << (order_hint_bits - 1)) and (1 << (order_hint_bits - 1)) - 1
+// (inclusive).
+//
+// NOTE: |a| and |b| are the order_hint_bits least significant bits of the
+// actual values. This function returns the signed difference between the
+// actual values. The returned difference is correct as long as the actual
+// values are not more than 1 << (order_hint_bits - 1) - 1 apart.
+//
+// Example: Suppose order_hint_bits is 4 and |order_hint_shift_bits|
+// is 28. Then |a| and |b| are in the range [0, 15], and the actual values for
+// |a| and |b| must not be more than 7 apart. (If the actual values for |a| and
+// |b| are exactly 8 apart, this function cannot tell whether the actual value
+// for |a| is before or after the actual value for |b|.)
+//
+// First, consider the order hints 2 and 6. For this simple case, we have
+//   GetRelativeDistance(2, 6, 28) = 2 - 6 = -4, and
+//   GetRelativeDistance(6, 2, 28) = 6 - 2 = 4.
+//
+// On the other hand, consider the order hints 2 and 14. The order hints are
+// 12 (> 7) apart, so we need to use the actual values instead. The actual
+// values may be 34 (= 2 mod 16) and 30 (= 14 mod 16), respectively. Therefore
+// we have
+//   GetRelativeDistance(2, 14, 28) = 34 - 30 = 4, and
+//   GetRelativeDistance(14, 2, 28) = 30 - 34 = -4.
+//
+// The following comments apply only to specific CPUs' SIMD implementations,
+// such as intrinsics code.
+// For the 2 shift operations in this function, if the SIMD packed data is
+// 16-bit wide, try to use |order_hint_shift_bits| - 16 as the number of bits to
+// shift; if the SIMD packed data is 8-bit wide, try to use
+// |order_hint_shift_bits| - 24 as the number of bits to shift.
+// |order_hint_shift_bits| - 16 and |order_hint_shift_bits| - 24 could be -16 or
+// -24. In these cases diff is 0, and the behavior of left or right shifting -16
+// or -24 bits is defined for x86 SIMD instructions and ARM NEON instructions,
+// and the result of shifting 0 is still 0. There is no guarantee that this
+// behavior and result apply to other CPUs' SIMD instructions.
+inline int GetRelativeDistance(const unsigned int a, const unsigned int b,
+                               const unsigned int order_hint_shift_bits) {
+  const int diff = static_cast<int>(a) - static_cast<int>(b);
+  assert(order_hint_shift_bits <= 31);
+  if (order_hint_shift_bits == 0) {
+    assert(a == 0);
+    assert(b == 0);
+  } else {
+    assert(order_hint_shift_bits >= 24);  // i.e., order_hint_bits <= 8
+    assert(a < (1u << (32 - order_hint_shift_bits)));
+    assert(b < (1u << (32 - order_hint_shift_bits)));
+    assert(diff < (1 << (32 - order_hint_shift_bits)));
+    assert(diff >= -(1 << (32 - order_hint_shift_bits)));
+  }
+  // Sign extend the result of subtracting the values.
+  // Cast to unsigned int and then left shift to avoid undefined behavior with
+  // negative values. Cast to int to do the sign extension through right shift.
+  // This requires the right shift of a signed integer be an arithmetic shift,
+  // which is true for clang, gcc, and Visual C++.
+  // These two casts do not generate extra instructions.
+  // Don't use LeftShift(diff) since a valid diff may fail its assertions.
+  // For example, GetRelativeDistance(2, 14, 28), diff equals -12 and is less
+  // than the minimum allowed value of LeftShift() which is -8.
+  // The next 3 lines are equivalent to:
+  // const int order_hint_bits = Mod32(32 - order_hint_shift_bits);
+  // const int m = (1 << order_hint_bits) >> 1;
+  // return (diff & (m - 1)) - (diff & m);
+  return static_cast<int>(static_cast<unsigned int>(diff)
+                          << order_hint_shift_bits) >>
+         order_hint_shift_bits;
+}
+
+// Applies |sign| (must be 0 or -1) to |value|, i.e.,
+//   return (sign == 0) ? value : -value;
+// and does so without a branch.
+constexpr int ApplySign(int value, int sign) { return (value ^ sign) - sign; }
+
+// 7.9.3. (without the clamp for numerator and denominator).
+inline void GetMvProjection(const MotionVector& mv, int numerator,
+                            int division_multiplier,
+                            MotionVector* projection_mv) {
+  // Allow |numerator| to be 0 so that this function can be called
+  // unconditionally. When numerator is 0, |projection_mv| will be 0, and this
+  // is what we want.
+  assert(std::abs(numerator) <= kMaxFrameDistance);
+  for (int i = 0; i < 2; ++i) {
+    projection_mv->mv[i] =
+        Clip3(RightShiftWithRoundingSigned(
+                  mv.mv[i] * numerator * division_multiplier, 14),
+              -kProjectionMvClamp, kProjectionMvClamp);
+  }
+}
+
+// 7.9.4.
+constexpr int Project(int value, int delta, int dst_sign) {
+  return value + ApplySign(delta / 64, dst_sign);
+}
+
+inline bool IsBlockSmallerThan8x8(BlockSize size) {
+  return size < kBlock8x8 && size != kBlock4x16;
+}
+
+// Returns true if either the width or the height of the block is equal to
+// four.
+inline bool IsBlockDimension4(BlockSize size) {
+  return size < kBlock8x8 || size == kBlock16x4;
+}
+
+// Converts bitdepth 8, 10, and 12 to array index 0, 1, and 2, respectively.
+constexpr int BitdepthToArrayIndex(int bitdepth) { return (bitdepth - 8) >> 1; }
+
+// Maps a square transform to an index between [0, 4]. kTransformSize4x4 maps
+// to 0, kTransformSize8x8 maps to 1 and so on.
+inline int TransformSizeToSquareTransformIndex(TransformSize tx_size) {
+  assert(kTransformWidth[tx_size] == kTransformHeight[tx_size]);
+
+  // The values of the square transform sizes happen to be in the right
+  // ranges, so we can just divide them by 4 to get the indexes.
+  static_assert(
+      std::is_unsigned<std::underlying_type<TransformSize>::type>::value, "");
+  static_assert(kTransformSize4x4 < 4, "");
+  static_assert(4 <= kTransformSize8x8 && kTransformSize8x8 < 8, "");
+  static_assert(8 <= kTransformSize16x16 && kTransformSize16x16 < 12, "");
+  static_assert(12 <= kTransformSize32x32 && kTransformSize32x32 < 16, "");
+  static_assert(16 <= kTransformSize64x64 && kTransformSize64x64 < 20, "");
+  return DivideBy4(tx_size);
+}
+
+// Gets the corresponding Y/U/V position, to set and get filter masks
+// in deblock filtering.
+// Returns luma_position if it's Y plane, whose subsampling must be 0.
+// Returns the odd position for U/V plane, if there is subsampling.
+constexpr int GetDeblockPosition(const int luma_position,
+                                 const int subsampling) {
+  return luma_position | subsampling;
+}
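+
+// For example, GetDeblockPosition(8, /*subsampling=*/1) == 9 (the odd
+// position), while GetDeblockPosition(8, /*subsampling=*/0) == 8.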
+
+// Returns the size of the residual buffer required to hold the residual values
+// for a block or frame of size |rows| by |columns| (taking into account
+// |subsampling_x|, |subsampling_y| and |residual_size|). |residual_size| is the
+// number of bytes required to represent one residual value.
+inline size_t GetResidualBufferSize(const int rows, const int columns,
+                                    const int subsampling_x,
+                                    const int subsampling_y,
+                                    const size_t residual_size) {
+  // The subsampling multipliers are:
+  //   Both x and y are subsampled: 3 / 2.
+  //   Only x or y is subsampled: 2 / 1 (which is equivalent to 4 / 2).
+  //   Both x and y are not subsampled: 3 / 1 (which is equivalent to 6 / 2).
+  // So we compute the final subsampling multiplier as follows:
+  //   multiplier = (2 + (4 >> subsampling_x >> subsampling_y)) / 2.
+  // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary checks
+  // when parsing quantized coefficients.
+  const int subsampling_multiplier_num =
+      2 + (4 >> subsampling_x >> subsampling_y);
+  const int number_elements =
+      (rows * columns * subsampling_multiplier_num) >> 1;
+  const int tx_padding = 32 * kResidualPaddingVertical;
+  return residual_size * (number_elements + tx_padding);
+}
+
+// This function is equivalent to:
+// std::min({kTransformWidthLog2[tx_size] - 2,
+//           kTransformWidthLog2[left_tx_size] - 2,
+//           2});
+constexpr LoopFilterTransformSizeId GetTransformSizeIdWidth(
+    TransformSize tx_size, TransformSize left_tx_size) {
+  return static_cast<LoopFilterTransformSizeId>(
+      static_cast<int>(tx_size > kTransformSize4x16 &&
+                       left_tx_size > kTransformSize4x16) +
+      static_cast<int>(tx_size > kTransformSize8x32 &&
+                       left_tx_size > kTransformSize8x32));
+}
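+
+// For example, a 16x16 |tx_size| (width log2 of 4) next to an 8x8
+// |left_tx_size| (width log2 of 3) yields min(4 - 2, 3 - 2, 2) = 1.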
+
+// This is used for 7.11.3.4 Block Inter Prediction Process, to select convolve
+// filters.
+inline int GetFilterIndex(const int filter_index, const int length) {
+  if (length <= 4) {
+    if (filter_index == kInterpolationFilterEightTap ||
+        filter_index == kInterpolationFilterEightTapSharp) {
+      return 4;
+    }
+    if (filter_index == kInterpolationFilterEightTapSmooth) {
+      return 5;
+    }
+  }
+  return filter_index;
+}
+
+// This produces results identical to RightShiftWithRounding() since
+// |subsampling| can only be 0 or 1.
+constexpr int SubsampledValue(int value, int subsampling) {
+  return (value + subsampling) >> subsampling;
+}
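+
+// For example, SubsampledValue(5, 1) == 3 and SubsampledValue(4, 1) == 2,
+// matching RightShiftWithRounding(5, 1) and RightShiftWithRounding(4, 1).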
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_COMMON_H_
diff --git a/src/utils/common_test.cc b/src/utils/common_test.cc
new file mode 100644 (file)
index 0000000..fdb218d
--- /dev/null
@@ -0,0 +1,604 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/common.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+int BitLength(int64_t n) {
+  int count = 0;
+  while (n != 0) {
+    ++count;
+    n >>= 1;
+  }
+  return count;
+}
+
+TEST(CommonUtilsTest, Align) {
+  for (int i = 0; i <= 8; ++i) {
+    const int alignment = 1 << i;
+    SCOPED_TRACE("alignment: " + std::to_string(alignment));
+    EXPECT_EQ(Align(0, alignment), 0);
+    EXPECT_EQ(Align(1, alignment), alignment);
+    EXPECT_EQ(Align(alignment + 1, alignment), 2 * alignment);
+    if (i > 1) {
+      EXPECT_EQ(Align(alignment - 1, alignment), alignment);
+      EXPECT_EQ(Align(2 * alignment - 1, alignment), 2 * alignment);
+    }
+  }
+}
+
+TEST(CommonUtilsTest, AlignAddr) {
+  auto buf = MakeAlignedUniquePtr<uint8_t>(/*alignment=*/1024, 512);
+  ASSERT_NE(buf, nullptr);
+  auto* const bufptr = buf.get();
+  ASSERT_EQ(reinterpret_cast<uintptr_t>(bufptr) % 1024, 0);
+
+  for (int i = 0; i <= 8; ++i) {
+    const int alignment = 1 << i;
+    ASSERT_LE(alignment, 1024);
+    SCOPED_TRACE("alignment: " + std::to_string(alignment));
+    EXPECT_EQ(AlignAddr(nullptr, alignment), nullptr);
+    EXPECT_EQ(AlignAddr(bufptr, alignment), bufptr);
+    EXPECT_EQ(AlignAddr(bufptr + 1, alignment), bufptr + alignment);
+    EXPECT_EQ(AlignAddr(bufptr + alignment + 1, alignment),
+              bufptr + 2 * alignment);
+    if (i > 1) {
+      EXPECT_EQ(AlignAddr(bufptr + alignment - 1, alignment),
+                bufptr + alignment);
+      EXPECT_EQ(AlignAddr(bufptr + 2 * alignment - 1, alignment),
+                bufptr + 2 * alignment);
+    }
+  }
+}
+
+TEST(CommonUtilsTest, Clip3) {
+  // Value <= lower boundary.
+  EXPECT_EQ(Clip3(10, 20, 30), 20);
+  EXPECT_EQ(Clip3(20, 20, 30), 20);
+  // Value >= higher boundary.
+  EXPECT_EQ(Clip3(40, 20, 30), 30);
+  EXPECT_EQ(Clip3(30, 20, 30), 30);
+  // Value within boundary.
+  EXPECT_EQ(Clip3(25, 20, 30), 25);
+  // Clipping based on bitdepth (clamp between 0 and 2^bitdepth - 1). Make sure
+  // that the resulting values are always in the pixel range for the
+  // corresponding bitdepth.
+  static constexpr int bitdepths[] = {8, 10, 12};
+  static constexpr int pixels[] = {100, 500, 5000, -100, -500, -5000};
+  for (const auto& bitdepth : bitdepths) {
+    for (const auto& pixel : pixels) {
+      const int clipped_pixel = Clip3(pixel, 0, (1 << bitdepth) - 1);
+      EXPECT_GE(clipped_pixel, 0)
+          << "Clip3 mismatch for bitdepth: " << bitdepth << " pixel: " << pixel;
+      EXPECT_LE(clipped_pixel, (1 << bitdepth) - 1)
+          << "Clip3 mismatch for bitdepth: " << bitdepth << " pixel: " << pixel;
+    }
+  }
+}
+
+template <typename Pixel>
+void TestExtendLine(int width, const int left, int right, Pixel left_value,
+                    Pixel right_value) {
+  constexpr int size = 1000;
+  ASSERT_LE(width + left + right, size);
+  Pixel line[size];
+  Pixel* line_start = line + left;
+  line_start[0] = left_value;
+  line_start[width - 1] = right_value;
+  ExtendLine<Pixel>(line_start, width, left, right);
+  for (int x = 0; x < left; x++) {
+    EXPECT_EQ(left_value, line[x]) << "Left side mismatch at x: " << x;
+  }
+  for (int x = 0; x < right; x++) {
+    EXPECT_EQ(right_value, line[left + width + x])
+        << "Right side mismatch at x: " << x;
+  }
+}
+
+TEST(CommonUtilsTest, ExtendLine) {
+  TestExtendLine<uint8_t>(300, 0, 0, 31, 13);
+  TestExtendLine<uint8_t>(100, 10, 20, 31, 13);
+  TestExtendLine<uint8_t>(257, 31, 77, 59, 255);
+  TestExtendLine<uint16_t>(600, 0, 0, 1234, 4321);
+  TestExtendLine<uint16_t>(200, 55, 88, 12345, 54321);
+  TestExtendLine<uint16_t>(2, 99, 333, 257, 513);
+}
+
+template <typename T>
+void TestMemSetBlock(int rows, int columns, ptrdiff_t stride, T value) {
+  constexpr int size = 1000;
+  T block[size];
+  static_assert(sizeof(T) == 1, "");
+  ASSERT_LE(rows * stride, size);
+  ASSERT_LE(columns, stride);
+  MemSetBlock<T>(rows, columns, value, block, stride);
+  for (int y = 0; y < rows; y++) {
+    for (int x = 0; x < columns; x++) {
+      EXPECT_EQ(value, block[y * stride + x])
+          << "Mismatch at y: " << y << " x: " << x;
+    }
+  }
+}
+
+TEST(CommonUtilsTest, MemSetBlock) {
+  TestMemSetBlock<bool>(15, 28, 29, true);
+  TestMemSetBlock<bool>(17, 1, 24, false);
+  TestMemSetBlock<bool>(7, 2, 13, true);
+  TestMemSetBlock<int8_t>(35, 17, 19, 123);
+  TestMemSetBlock<uint8_t>(19, 16, 16, 234);
+}
+
+template <typename T>
+void TestSetBlock(int rows, int columns, ptrdiff_t stride, T value) {
+  constexpr int size = 1000;
+  T block[size];
+  ASSERT_LE(rows * stride, size);
+  ASSERT_LE(columns, stride);
+  SetBlock<T>(rows, columns, value, block, stride);
+  for (int y = 0; y < rows; y++) {
+    for (int x = 0; x < columns; x++) {
+      EXPECT_EQ(value, block[y * stride + x])
+          << "Mismatch at y: " << y << " x: " << x;
+    }
+  }
+}
+
+TEST(CommonUtilsTest, SetBlock) {
+  // Test 1-byte block set.
+  TestSetBlock<bool>(15, 28, 29, true);
+  TestSetBlock<bool>(17, 1, 24, false);
+  TestSetBlock<bool>(7, 2, 13, true);
+  TestSetBlock<int8_t>(35, 17, 19, 123);
+  TestSetBlock<uint8_t>(19, 16, 16, 234);
+  // Test 2-byte block set.
+  TestSetBlock<int16_t>(23, 27, 28, 1234);
+  TestSetBlock<uint16_t>(13, 39, 44, 4321);
+  // Test 4-byte block set.
+  TestSetBlock<int>(14, 7, 7, 12345);
+  TestSetBlock<int>(33, 4, 15, 54321);
+  // Test pointer block set.
+  int data;
+  TestSetBlock<int*>(23, 8, 25, &data);
+}
+
+TEST(CommonUtilsTest, CountTrailingZeros) {
+  EXPECT_EQ(CountTrailingZeros(0x1), 0);
+  EXPECT_EQ(CountTrailingZeros(0x3), 0);
+  EXPECT_EQ(CountTrailingZeros(0x7), 0);
+  EXPECT_EQ(CountTrailingZeros(0xF), 0);
+  EXPECT_EQ(CountTrailingZeros(0x2), 1);
+  EXPECT_EQ(CountTrailingZeros(0x6), 1);
+  EXPECT_EQ(CountTrailingZeros(0xE), 1);
+  EXPECT_EQ(CountTrailingZeros(0x4), 2);
+  EXPECT_EQ(CountTrailingZeros(0xC), 2);
+  EXPECT_EQ(CountTrailingZeros(0x8), 3);
+  EXPECT_EQ(CountTrailingZeros(0x10), 4);
+  EXPECT_EQ(CountTrailingZeros(0x30), 4);
+  EXPECT_EQ(CountTrailingZeros(0x70), 4);
+  EXPECT_EQ(CountTrailingZeros(0xF0), 4);
+  EXPECT_EQ(CountTrailingZeros(0x20), 5);
+  EXPECT_EQ(CountTrailingZeros(0x60), 5);
+  EXPECT_EQ(CountTrailingZeros(0xE0), 5);
+  EXPECT_EQ(CountTrailingZeros(0x40), 6);
+  EXPECT_EQ(CountTrailingZeros(0xC0), 6);
+  EXPECT_EQ(CountTrailingZeros(0x80), 7);
+  EXPECT_EQ(CountTrailingZeros(0x31), 0);
+  EXPECT_EQ(CountTrailingZeros(0x32), 1);
+  EXPECT_EQ(CountTrailingZeros(0x34), 2);
+  EXPECT_EQ(CountTrailingZeros(0x38), 3);
+  EXPECT_EQ(CountTrailingZeros(0x310), 4);
+  EXPECT_EQ(CountTrailingZeros(0x320), 5);
+  EXPECT_EQ(CountTrailingZeros(0x340), 6);
+  EXPECT_EQ(CountTrailingZeros(0x380), 7);
+}
+
+TEST(CommonUtilsTest, FloorLog2) {
+  // Powers of 2.
+  EXPECT_EQ(FloorLog2(1), 0);
+  EXPECT_EQ(FloorLog2(2), 1);
+  EXPECT_EQ(FloorLog2(8), 3);
+  EXPECT_EQ(FloorLog2(64), 6);
+  // Powers of 2 +/- 1.
+  EXPECT_EQ(FloorLog2(9), 3);
+  EXPECT_EQ(FloorLog2(15), 3);
+  EXPECT_EQ(FloorLog2(63), 5);
+  // Large values that fit in 32 bits.
+  EXPECT_EQ(FloorLog2(0x7fffffff), 30);
+  EXPECT_EQ(FloorLog2(0x80000000), 31);
+  // Values larger than 32 bits.
+  EXPECT_EQ(FloorLog2(uint64_t{0x7fffffffffffffff}), 62);
+  EXPECT_EQ(FloorLog2(uint64_t{0x8000000000000000}), 63);
+  EXPECT_EQ(FloorLog2(uint64_t{0xffffffffffffffff}), 63);
+}
+
+TEST(CommonUtilsTest, CeilLog2) {
+  // Even though log2(0) is -inf, here we explicitly define it to be 0.
+  EXPECT_EQ(CeilLog2(0), 0);
+  // Powers of 2.
+  EXPECT_EQ(CeilLog2(1), 0);
+  EXPECT_EQ(CeilLog2(2), 1);
+  EXPECT_EQ(CeilLog2(8), 3);
+  EXPECT_EQ(CeilLog2(64), 6);
+  // Powers of 2 +/- 1.
+  EXPECT_EQ(CeilLog2(9), 4);
+  EXPECT_EQ(CeilLog2(15), 4);
+  EXPECT_EQ(CeilLog2(63), 6);
+  // Large value.
+  EXPECT_EQ(CeilLog2(0x7fffffff), 31);
+}
+
+TEST(CommonUtilsTest, RightShiftWithCeiling) {
+  // Shift 1 bit.
+  EXPECT_EQ(RightShiftWithCeiling(1, 1), 1);
+  EXPECT_EQ(RightShiftWithCeiling(2, 1), 1);
+  EXPECT_EQ(RightShiftWithCeiling(3, 1), 2);
+  EXPECT_EQ(RightShiftWithCeiling(4, 1), 2);
+  EXPECT_EQ(RightShiftWithCeiling(5, 1), 3);
+  // Shift 2 bits.
+  EXPECT_EQ(RightShiftWithCeiling(1, 2), 1);
+  EXPECT_EQ(RightShiftWithCeiling(2, 2), 1);
+  EXPECT_EQ(RightShiftWithCeiling(3, 2), 1);
+  EXPECT_EQ(RightShiftWithCeiling(4, 2), 1);
+  EXPECT_EQ(RightShiftWithCeiling(5, 2), 2);
+  // Shift 20 bits.
+  EXPECT_EQ(RightShiftWithCeiling(1, 20), 1);
+  EXPECT_EQ(RightShiftWithCeiling((1 << 20) - 1, 20), 1);
+  EXPECT_EQ(RightShiftWithCeiling(1 << 20, 20), 1);
+  EXPECT_EQ(RightShiftWithCeiling((1 << 20) + 1, 20), 2);
+  EXPECT_EQ(RightShiftWithCeiling((1 << 21) - 1, 20), 2);
+}
+
+template <typename Input, typename Output>
+void VerifyRightShiftWithRounding(const Input* const values,
+                                  const int* const bits,
+                                  const Output* const rounded_values,
+                                  size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    const Output rounded_value = RightShiftWithRounding(values[i], bits[i]);
+    EXPECT_EQ(rounded_value, rounded_values[i]) << "Mismatch at index " << i;
+    // Rounding reduces the bit length by at least |bits[i]| - 1.
+    EXPECT_LE(BitLength(rounded_value), BitLength(values[i]) - (bits[i] - 1))
+        << "Mismatch at index " << i;
+  }
+}
+
+TEST(CommonUtilTest, RightShiftWithRoundingInt32) {
+  static constexpr int32_t values[] = {5, 203, 204, 255, 40000, 50000};
+  static constexpr int bits[] = {0, 3, 3, 3, 12, 12};
+  static constexpr int32_t rounded_values[] = {5, 25, 26, 32, 10, 12};
+  static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), "");
+  static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), "");
+  VerifyRightShiftWithRounding<int32_t, int32_t>(values, bits, rounded_values,
+                                                 ABSL_ARRAYSIZE(values));
+}
+
+TEST(CommonUtilTest, RightShiftWithRoundingUint32) {
+  static constexpr uint32_t values[] = {5,     203,   204,       255,
+                                        40000, 50000, 0x7fffffff};
+  static constexpr int bits[] = {0, 3, 3, 3, 12, 12, 20};
+  static constexpr uint32_t rounded_values[] = {5, 25, 26, 32, 10, 12, 2048};
+  static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), "");
+  static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), "");
+  VerifyRightShiftWithRounding<uint32_t, uint32_t>(values, bits, rounded_values,
+                                                   ABSL_ARRAYSIZE(values));
+}
+
+TEST(CommonUtilTest, RightShiftWithRoundingInt64) {
+  static constexpr int64_t values[] = {5,     203,   204,        255,
+                                       40000, 50000, 0x7fffffff, 0x8fffffff};
+  static constexpr int bits[] = {0, 3, 3, 3, 12, 12, 20, 20};
+  static constexpr int32_t rounded_values[] = {5,  25, 26,   32,
+                                               10, 12, 2048, 2304};
+  static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), "");
+  static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), "");
+  VerifyRightShiftWithRounding<int64_t, int32_t>(values, bits, rounded_values,
+                                                 ABSL_ARRAYSIZE(values));
+}
+
+template <typename Input>
+void VerifyRightShiftWithRoundingSigned(const Input* const values,
+                                        const int* const bits,
+                                        const int32_t* const rounded_values,
+                                        int count) {
+  for (int i = 0; i < count; ++i) {
+    int32_t rounded_value = RightShiftWithRoundingSigned(values[i], bits[i]);
+    EXPECT_EQ(rounded_value, rounded_values[i]) << "Mismatch at index " << i;
+    rounded_value = RightShiftWithRoundingSigned(-values[i], bits[i]);
+    EXPECT_EQ(rounded_value, -rounded_values[i]) << "Mismatch at index " << i;
+  }
+}
+
+TEST(CommonUtilTest, RightShiftWithRoundingSignedInt32) {
+  static constexpr int32_t values[] = {203, 204, 255, 40000, 50000};
+  static constexpr int bits[] = {3, 3, 3, 12, 12};
+  static constexpr int32_t rounded_values[] = {25, 26, 32, 10, 12};
+  static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), "");
+  static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), "");
+  VerifyRightShiftWithRoundingSigned<int32_t>(values, bits, rounded_values,
+                                              ABSL_ARRAYSIZE(values));
+}
+
+TEST(CommonUtilTest, RightShiftWithRoundingSignedInt64) {
+  static constexpr int64_t values[] = {203,   204,        255,       40000,
+                                       50000, 0x7fffffff, 0x8fffffff};
+  static constexpr int bits[] = {3, 3, 3, 12, 12, 20, 20};
+  static constexpr int32_t rounded_values[] = {25, 26, 32, 10, 12, 2048, 2304};
+  static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), "");
+  static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), "");
+  VerifyRightShiftWithRoundingSigned<int64_t>(values, bits, rounded_values,
+                                              ABSL_ARRAYSIZE(values));
+}
+
+TEST(CommonUtilTest, GetResidualBufferSize) {
+  // No subsampling.
+  EXPECT_EQ(GetResidualBufferSize(64, 64, 0, 0, 2),
+            /* 2*(64*64*3/1 + 32*4) = */ 24832);
+  // Only X is subsampled.
+  EXPECT_EQ(GetResidualBufferSize(64, 64, 1, 0, 2),
+            /* 2*(64*64*2/1 + 32*4) = */ 16640);
+  // Only Y is subsampled.
+  EXPECT_EQ(GetResidualBufferSize(64, 64, 0, 1, 2),
+            /* 2*(64*64*2/1 + 32*4) = */ 16640);
+  // Both X and Y are subsampled.
+  EXPECT_EQ(GetResidualBufferSize(64, 64, 1, 1, 2),
+            /* 2*(64*64*3/2 + 32*4) = */ 12544);
+}
+
+//------------------------------------------------------------------------------
+// Tests for bitstream util functions
+
+TEST(BitstreamUtilTest, IsIntraFrame) {
+  EXPECT_TRUE(IsIntraFrame(kFrameKey));
+  EXPECT_TRUE(IsIntraFrame(kFrameIntraOnly));
+  EXPECT_FALSE(IsIntraFrame(kFrameInter));
+  EXPECT_FALSE(IsIntraFrame(kFrameSwitch));
+}
+
+TEST(BitstreamUtilTest, GetTransformClass) {
+  static constexpr TransformClass expected_classes[kNumTransformTypes] = {
+      kTransformClass2D,       kTransformClass2D,
+      kTransformClass2D,       kTransformClass2D,
+      kTransformClass2D,       kTransformClass2D,
+      kTransformClass2D,       kTransformClass2D,
+      kTransformClass2D,       kTransformClass2D,
+      kTransformClassVertical, kTransformClassHorizontal,
+      kTransformClassVertical, kTransformClassHorizontal,
+      kTransformClassVertical, kTransformClassHorizontal,
+  };
+  for (int i = 0; i < kNumTransformTypes; ++i) {
+    EXPECT_EQ(GetTransformClass(static_cast<TransformType>(i)),
+              expected_classes[i])
+        << "Mismatch at index " << i;
+  }
+}
+
+TEST(BitstreamUtilTest, RowOrColumn4x4ToPixel) {
+  EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneY, 0), 40);
+  EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneY, 1),
+            40);  // Subsampling should have no effect on Y plane.
+  EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneU, 0), 40);
+  EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneU, 1), 20);
+  EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneV, 0), 40);
+  EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneV, 1), 20);
+}
+
+TEST(BitstreamUtilTest, GetPlaneType) {
+  EXPECT_EQ(GetPlaneType(kPlaneY), kPlaneTypeY);
+  EXPECT_EQ(GetPlaneType(kPlaneU), kPlaneTypeUV);
+  EXPECT_EQ(GetPlaneType(kPlaneV), kPlaneTypeUV);
+}
+
+TEST(BitstreamUtils, IsDirectionalMode) {
+  static constexpr bool is_directional_modes[kNumPredictionModes] = {
+      false, true,  true,  true,  true,  true,  true,  true,  true,
+      false, false, false, false, false, false, false, false, false,
+      false, false, false, false, false, false, false, false,
+  };
+  for (int i = 0; i < kNumPredictionModes; ++i) {
+    EXPECT_EQ(IsDirectionalMode(static_cast<PredictionMode>(i)),
+              is_directional_modes[i])
+        << "Mismatch at index " << i;
+  }
+}
+
+TEST(BitstreamUtils, GetRelativeDistance) {
+  // Both order_hint_bits and order_hint_shift_bits are zero. (a and b must be
+  // zero.)
+  EXPECT_EQ(GetRelativeDistance(0, 0, 0), 0);
+  EXPECT_EQ(GetRelativeDistance(10, 20, 27), -10);
+
+  EXPECT_EQ(GetRelativeDistance(2, 1, 30), 1);
+  EXPECT_EQ(GetRelativeDistance(2, 1, 29), 1);
+
+  EXPECT_EQ(GetRelativeDistance(1, 2, 30), -1);
+  EXPECT_EQ(GetRelativeDistance(1, 2, 29), -1);
+
+  // With an order_hint_bits of 4 and an order_hint_shift_bits of 28, 16 is the
+  // same as 0, 17 is the same as 1, etc. The most positive distance is 7, and
+  // the most negative distance is -8.
+
+  EXPECT_EQ(GetRelativeDistance(2, 6, 28), -4);
+  EXPECT_EQ(GetRelativeDistance(6, 2, 28), 4);
+  // 18 - 14 = 4.
+  EXPECT_EQ(GetRelativeDistance(2, 14, 28), 4);
+  // 14 - 18 = -4.
+  EXPECT_EQ(GetRelativeDistance(14, 2, 28), -4);
+  // If a and b are exactly 8 apart, GetRelativeDistance() cannot tell whether
+  // a is before or after b. GetRelativeDistance(a, b) and
+  // GetRelativeDistance(b, a) are both -8.
+  // 1 - 9 = -8.
+  EXPECT_EQ(GetRelativeDistance(1, 9, 28), -8);
+  // 9 - 17 = -8.
+  EXPECT_EQ(GetRelativeDistance(9, 1, 28), -8);
+
+  // With an order_hint_bits of 5 and an order_hint_shift_bits of 27, 32 is the
+  // same as 0, 33 is the same as 1, etc. The most positive distance is 15, and
+  // the most negative distance is -16.
+
+  // 31 - 32 = -1.
+  EXPECT_EQ(GetRelativeDistance(31, 0, 27), -1);
+  // 32 - 31 = 1.
+  EXPECT_EQ(GetRelativeDistance(0, 31, 27), 1);
+  // 30 - 33 = -3.
+  EXPECT_EQ(GetRelativeDistance(30, 1, 27), -3);
+  // 33 - 30 = 3.
+  EXPECT_EQ(GetRelativeDistance(1, 30, 27), 3);
+  // 25 - 36 = -11.
+  EXPECT_EQ(GetRelativeDistance(25, 4, 27), -11);
+  // 36 - 25 = 11.
+  EXPECT_EQ(GetRelativeDistance(4, 25, 27), 11);
+  // 15 - 0 = 15.
+  EXPECT_EQ(GetRelativeDistance(15, 0, 27), 15);
+  // If a and b are exactly 16 apart, GetRelativeDistance() cannot tell whether
+  // a is before or after b. GetRelativeDistance(a, b) and
+  // GetRelativeDistance(b, a) are both -16.
+  // 16 - 32 = -16.
+  EXPECT_EQ(GetRelativeDistance(16, 0, 27), -16);
+  // 0 - 16 = -16.
+  EXPECT_EQ(GetRelativeDistance(0, 16, 27), -16);
+}
+
+TEST(BitstreamUtils, ApplySign) {
+  // ApplyPositive(0) = 0
+  EXPECT_EQ(ApplySign(0, 0), 0);
+  // ApplyNegative(0) = 0
+  EXPECT_EQ(ApplySign(0, -1), 0);
+
+  // ApplyPositive(1) = 1
+  EXPECT_EQ(ApplySign(1, 0), 1);
+  // ApplyNegative(1) = -1
+  EXPECT_EQ(ApplySign(1, -1), -1);
+
+  // ApplyPositive(-1) = -1
+  EXPECT_EQ(ApplySign(-1, 0), -1);
+  // ApplyNegative(-1) = 1
+  EXPECT_EQ(ApplySign(-1, -1), 1);
+
+  // ApplyPositive(1234) = 1234
+  EXPECT_EQ(ApplySign(1234, 0), 1234);
+  // ApplyNegative(1234) = -1234
+  EXPECT_EQ(ApplySign(1234, -1), -1234);
+
+  // ApplyPositive(-1234) = -1234
+  EXPECT_EQ(ApplySign(-1234, 0), -1234);
+  // ApplyNegative(-1234) = 1234
+  EXPECT_EQ(ApplySign(-1234, -1), 1234);
+}
+
+// 7.9.3. (without the clamp for numerator and denominator).
+int SpecGetMvProjectionKernel(int mv, int numerator, int denominator) {
+  int value = mv * numerator * kProjectionMvDivisionLookup[denominator];
+  if (value >= 0) {
+    value += 1 << 13;
+    value >>= 14;
+  } else {
+    value = -value;
+    value += 1 << 13;
+    value >>= 14;
+    value = -value;
+  }
+  if (value < (-(1 << 14) + 1)) value = -(1 << 14) + 1;
+  if (value > (1 << 14) - 1) value = (1 << 14) - 1;
+  return value;
+}
+
+void SpecGetMvProjectionNoClamp(const MotionVector& mv, int numerator,
+                                int denominator, MotionVector* projection_mv) {
+  for (int i = 0; i < 2; ++i) {
+    projection_mv->mv[i] =
+        SpecGetMvProjectionKernel(mv.mv[i], numerator, denominator);
+  }
+}
+
+TEST(BitstreamUtils, GetMvProjection) {
+  const int16_t mvs[5][2] = {
+      {0, 0}, {11, 73}, {-84, 272}, {733, -827}, {-472, -697}};
+  for (auto& mv_value : mvs) {
+    for (int numerator = -kMaxFrameDistance; numerator <= kMaxFrameDistance;
+         ++numerator) {
+      for (int denominator = 0; denominator <= kMaxFrameDistance;
+           ++denominator) {
+        MotionVector mv, projection_mv, spec_projection_mv;
+        mv.mv[0] = mv_value[0];
+        mv.mv[1] = mv_value[1];
+        GetMvProjection(mv, numerator, kProjectionMvDivisionLookup[denominator],
+                        &projection_mv);
+        SpecGetMvProjectionNoClamp(mv, numerator, denominator,
+                                   &spec_projection_mv);
+        EXPECT_EQ(projection_mv.mv32, spec_projection_mv.mv32);
+      }
+    }
+  }
+}
+
+// 7.9.4.
+int SpecProject(int value, int delta, int dst_sign) {
+  constexpr int kMiSizeLog2 = 2;
+  const int sign = (dst_sign == 0) ? 1 : dst_sign;
+  int offset;
+  if (delta >= 0) {
+    offset = delta >> (3 + 1 + kMiSizeLog2);
+  } else {
+    offset = -((-delta) >> (3 + 1 + kMiSizeLog2));
+  }
+  return value + sign * offset;
+}
+
+TEST(BitstreamUtils, Project) {
+  for (int value = -10; value <= 10; ++value) {
+    for (int delta = -256; delta <= 256; ++delta) {
+      for (int dst_sign = -1; dst_sign <= 0; ++dst_sign) {
+        EXPECT_EQ(Project(value, delta, dst_sign),
+                  SpecProject(value, delta, dst_sign));
+      }
+    }
+  }
+}
+
+TEST(BitstreamUtils, IsBlockSmallerThan8x8) {
+  static constexpr bool is_block_smaller_than8x8[kMaxBlockSizes] = {
+      true,  true,  false, true,  false, false, false, false,
+      false, false, false, false, false, false, false, false,
+      false, false, false, false, false, false,
+  };
+  for (int i = 0; i < kMaxBlockSizes; ++i) {
+    EXPECT_EQ(IsBlockSmallerThan8x8(static_cast<BlockSize>(i)),
+              is_block_smaller_than8x8[i])
+        << "Mismatch at index " << i;
+  }
+}
+
+TEST(BitstreamUtils, TransformSizeToSquareTransformIndex) {
+  EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize4x4), 0);
+  EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize8x8), 1);
+  EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize16x16), 2);
+  EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize32x32), 3);
+  EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize64x64), 4);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/compiler_attributes.h b/src/utils/compiler_attributes.h
new file mode 100644 (file)
index 0000000..09f0035
--- /dev/null
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
+#define LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
+
+// A collection of compiler attribute checks and defines to control for
+// compatibility across toolchains.
+
+//------------------------------------------------------------------------------
+// Language version, attribute and feature helpers.
+
+// Detect C++17 support. Visual Studio sets __cplusplus to 199711L by default
+// unless compiled with /Zc:__cplusplus; use the value controlled by /std
+// instead.
+// https://docs.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#define LIBGAV1_CXX17 1
+#else
+#define LIBGAV1_CXX17 0
+#endif
+
+#if defined(__has_attribute)
+#define LIBGAV1_HAS_ATTRIBUTE __has_attribute
+#else
+#define LIBGAV1_HAS_ATTRIBUTE(x) 0
+#endif
+
+#if defined(__has_feature)
+#define LIBGAV1_HAS_FEATURE __has_feature
+#else
+#define LIBGAV1_HAS_FEATURE(x) 0
+#endif
+
+//------------------------------------------------------------------------------
+// Sanitizer attributes.
+
+#if LIBGAV1_HAS_FEATURE(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define LIBGAV1_ASAN 1
+#else
+#define LIBGAV1_ASAN 0
+#endif
+
+#if LIBGAV1_HAS_FEATURE(memory_sanitizer)
+#define LIBGAV1_MSAN 1
+#else
+#define LIBGAV1_MSAN 0
+#endif
+
+#if LIBGAV1_HAS_FEATURE(thread_sanitizer) || defined(__SANITIZE_THREAD__)
+#define LIBGAV1_TSAN 1
+#else
+#define LIBGAV1_TSAN 0
+#endif
+
+//------------------------------------------------------------------------------
+// AddressSanitizer support.
+
+// Define the macros for AddressSanitizer manual memory poisoning. See
+// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning.
+#if LIBGAV1_ASAN
+#include <sanitizer/asan_interface.h>
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+  (static_cast<void>(addr), static_cast<void>(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+  (static_cast<void>(addr), static_cast<void>(size))
+#endif
+
+//------------------------------------------------------------------------------
+// Function attributes.
+// GCC: https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
+// Clang: https://clang.llvm.org/docs/AttributeReference.html
+
+#if defined(__GNUC__)
+#define LIBGAV1_ALWAYS_INLINE __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#define LIBGAV1_ALWAYS_INLINE __forceinline
+#else
+#define LIBGAV1_ALWAYS_INLINE inline
+#endif
+
+// LIBGAV1_MUST_USE_RESULT
+//
+// Tells the compiler to warn about unused results.
+//
+// When annotating a function, it must appear as the first part of the
+// declaration or definition. The compiler will warn if the return value from
+// such a function is unused:
+//
+//   LIBGAV1_MUST_USE_RESULT Sprocket* AllocateSprocket();
+//   AllocateSprocket();  // Triggers a warning.
+//
+// When annotating a class, it is equivalent to annotating every function which
+// returns an instance.
+//
+//   class LIBGAV1_MUST_USE_RESULT Sprocket {};
+//   Sprocket();  // Triggers a warning.
+//
+//   Sprocket MakeSprocket();
+//   MakeSprocket();  // Triggers a warning.
+//
+// Note that references and pointers are not instances:
+//
+//   Sprocket* SprocketPointer();
+//   SprocketPointer();  // Does *not* trigger a warning.
+//
+// LIBGAV1_MUST_USE_RESULT allows using cast-to-void to suppress the unused
+// result warning. For that, warn_unused_result is used only for clang but not
+// for gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425
+#if LIBGAV1_HAS_ATTRIBUTE(nodiscard)
+#define LIBGAV1_MUST_USE_RESULT [[nodiscard]]
+#elif defined(__clang__) && LIBGAV1_HAS_ATTRIBUTE(warn_unused_result)
+#define LIBGAV1_MUST_USE_RESULT __attribute__((warn_unused_result))
+#else
+#define LIBGAV1_MUST_USE_RESULT
+#endif
+
+// LIBGAV1_PRINTF_ATTRIBUTE
+//
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+//
+// Note: As the GCC manual states, "[s]ince non-static C++ methods
+// have an implicit 'this' argument, the arguments of such methods
+// should be counted from two, not one."
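+//
+// Example (a hypothetical logging function; argument 1 is the format string
+// and checking starts at argument 2):
+//
+//   void LogError(const char* format, ...) LIBGAV1_PRINTF_ATTRIBUTE(1, 2);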
+#if LIBGAV1_HAS_ATTRIBUTE(format) || (defined(__GNUC__) && !defined(__clang__))
+#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#else
+#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check)
+#endif
+
+//------------------------------------------------------------------------------
+// Thread annotations.
+
+// LIBGAV1_GUARDED_BY()
+//
+// Documents if a shared field or global variable needs to be protected by a
+// mutex. LIBGAV1_GUARDED_BY() allows the user to specify a particular mutex
+// that should be held when accessing the annotated variable.
+//
+// Although this annotation cannot be applied to local variables, a local
+// variable and its associated mutex can often be combined into a small class
+// or struct, thereby allowing the annotation.
+//
+// Example:
+//
+//   class Foo {
+//     Mutex mu_;
+//     int p1_ LIBGAV1_GUARDED_BY(mu_);
+//     ...
+//   };
+// TODO(b/133245043): this can be reenabled after a local MutexLock
+// implementation is added with proper thread annotations.
+#if 0  // LIBGAV1_HAS_ATTRIBUTE(guarded_by)
+#define LIBGAV1_GUARDED_BY(x) __attribute__((guarded_by(x)))
+#else
+#define LIBGAV1_GUARDED_BY(x)
+#endif
+
+//------------------------------------------------------------------------------
+
+#undef LIBGAV1_HAS_ATTRIBUTE
+#undef LIBGAV1_HAS_FEATURE
+
+#endif  // LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
diff --git a/src/utils/constants.cc b/src/utils/constants.cc
new file mode 100644 (file)
index 0000000..80d7acb
--- /dev/null
@@ -0,0 +1,874 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+const uint8_t k4x4WidthLog2[kMaxBlockSizes] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+                                               2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5};
+
+const uint8_t k4x4HeightLog2[kMaxBlockSizes] = {
+    0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 1, 2, 3, 4, 2, 3, 4, 5, 4, 5};
+
+const uint8_t kNum4x4BlocksWide[kMaxBlockSizes] = {
+    1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32};
+
+const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes] = {
+    1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16, 32, 16, 32};
+
+const uint8_t kBlockWidthPixels[kMaxBlockSizes] = {
+    4,  4,  4,  8,  8,  8,  8,  16, 16, 16,  16,
+    16, 32, 32, 32, 32, 64, 64, 64, 64, 128, 128};
+
+const uint8_t kBlockHeightPixels[kMaxBlockSizes] = {
+    4,  8, 16, 4,  8,  16, 32, 4,  8,   16, 32,
+    64, 8, 16, 32, 64, 16, 32, 64, 128, 64, 128};
+
+// 9.3 -- Partition_Subsize[]
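+// Indexed as kSubSize[partition][block_size]. kBlockInvalid marks
+// combinations that cannot occur; for example,
+// kSubSize[kPartitionSplit][kBlock64x64] is kBlock32x32.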
+const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes] = {
+    // kPartitionNone
+    {kBlock4x4,     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x64,   kBlockInvalid,
+     kBlockInvalid, kBlock128x128},
+    // kPartitionHorizontal
+    {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32,   kBlockInvalid,
+     kBlockInvalid, kBlock128x64},
+    // kPartitionVertical
+    {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64,   kBlockInvalid,
+     kBlockInvalid, kBlock64x128},
+    // kPartitionSplit
+    {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x4,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32,   kBlockInvalid,
+     kBlockInvalid, kBlock64x64},
+    // kPartitionHorizontalWithTopSplit
+    {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32,   kBlockInvalid,
+     kBlockInvalid, kBlock128x64},
+    // kPartitionHorizontalWithBottomSplit
+    {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32,   kBlockInvalid,
+     kBlockInvalid, kBlock128x64},
+    // kPartitionVerticalWithLeftSplit
+    {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64,   kBlockInvalid,
+     kBlockInvalid, kBlock64x128},
+    // kPartitionVerticalWithRightSplit
+    {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64,   kBlockInvalid,
+     kBlockInvalid, kBlock64x128},
+    // kPartitionHorizontal4
+    {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x4,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x8,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x16,   kBlockInvalid,
+     kBlockInvalid, kBlockInvalid},
+    // kPartitionVertical4
+    {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x16,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x32,
+     kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x64,   kBlockInvalid,
+     kBlockInvalid, kBlockInvalid}};
+
+// 5.11.38 (implemented as a simple lookup: the first dimension is the block
+// size, the second and third are subsampling_x and subsampling_y).
+const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2] = {
+    {{kBlock4x4, kBlock4x4}, {kBlock4x4, kBlock4x4}},
+    {{kBlock4x8, kBlock4x4}, {kBlockInvalid, kBlock4x4}},
+    {{kBlock4x16, kBlock4x8}, {kBlockInvalid, kBlock4x8}},
+    {{kBlock8x4, kBlockInvalid}, {kBlock4x4, kBlock4x4}},
+    {{kBlock8x8, kBlock8x4}, {kBlock4x8, kBlock4x4}},
+    {{kBlock8x16, kBlock8x8}, {kBlockInvalid, kBlock4x8}},
+    {{kBlock8x32, kBlock8x16}, {kBlockInvalid, kBlock4x16}},
+    {{kBlock16x4, kBlockInvalid}, {kBlock8x4, kBlock8x4}},
+    {{kBlock16x8, kBlockInvalid}, {kBlock8x8, kBlock8x4}},
+    {{kBlock16x16, kBlock16x8}, {kBlock8x16, kBlock8x8}},
+    {{kBlock16x32, kBlock16x16}, {kBlockInvalid, kBlock8x16}},
+    {{kBlock16x64, kBlock16x32}, {kBlockInvalid, kBlock8x32}},
+    {{kBlock32x8, kBlockInvalid}, {kBlock16x8, kBlock16x4}},
+    {{kBlock32x16, kBlockInvalid}, {kBlock16x16, kBlock16x8}},
+    {{kBlock32x32, kBlock32x16}, {kBlock16x32, kBlock16x16}},
+    {{kBlock32x64, kBlock32x32}, {kBlockInvalid, kBlock16x32}},
+    {{kBlock64x16, kBlockInvalid}, {kBlock32x16, kBlock32x8}},
+    {{kBlock64x32, kBlockInvalid}, {kBlock32x32, kBlock32x16}},
+    {{kBlock64x64, kBlock64x32}, {kBlock32x64, kBlock32x32}},
+    {{kBlock64x128, kBlock64x64}, {kBlockInvalid, kBlock32x64}},
+    {{kBlock128x64, kBlockInvalid}, {kBlock64x64, kBlock64x32}},
+    {{kBlock128x128, kBlock128x64}, {kBlock64x128, kBlock64x64}}};
+
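+// Each entry d (for d >= 1) is (1 << 14) / d, using integer division; entry 0
+// is unused.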
+const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1] = {
+    0,    16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+    1489, 1365,  1260, 1170, 1092, 1024, 963,  910,  862,  819,  780,
+    744,  712,   682,  655,  630,  606,  585,  564,  546,  528};
+
+const uint8_t kTransformWidth[kNumTransformSizes] = {
+    4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 32, 32, 32, 32, 64, 64, 64};
+
+const uint8_t kTransformHeight[kNumTransformSizes] = {
+    4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32, 64, 8, 16, 32, 64, 16, 32, 64};
+
+const uint8_t kTransformWidth4x4[kNumTransformSizes] = {
+    1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16};
+
+const uint8_t kTransformHeight4x4[kNumTransformSizes] = {
+    1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16};
+
+const uint8_t kTransformWidthLog2[kNumTransformSizes] = {
+    2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6};
+
+const uint8_t kTransformHeightLog2[kNumTransformSizes] = {
+    2, 3, 4, 2, 3, 4, 5, 2, 3, 4, 5, 6, 3, 4, 5, 6, 4, 5, 6};
+
+// 9.3 -- Split_Tx_Size[]
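+// For example, splitting kTransformSize64x64 yields kTransformSize32x32.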
+const TransformSize kSplitTransformSize[kNumTransformSizes] = {
+    kTransformSize4x4,   kTransformSize4x4,   kTransformSize4x8,
+    kTransformSize4x4,   kTransformSize4x4,   kTransformSize8x8,
+    kTransformSize8x16,  kTransformSize8x4,   kTransformSize8x8,
+    kTransformSize8x8,   kTransformSize16x16, kTransformSize16x32,
+    kTransformSize16x8,  kTransformSize16x16, kTransformSize16x16,
+    kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x32};
+
+// Square transform of size min(w,h).
+const TransformSize kTransformSizeSquareMin[kNumTransformSizes] = {
+    kTransformSize4x4,   kTransformSize4x4,   kTransformSize4x4,
+    kTransformSize4x4,   kTransformSize8x8,   kTransformSize8x8,
+    kTransformSize8x8,   kTransformSize4x4,   kTransformSize8x8,
+    kTransformSize16x16, kTransformSize16x16, kTransformSize16x16,
+    kTransformSize8x8,   kTransformSize16x16, kTransformSize32x32,
+    kTransformSize32x32, kTransformSize16x16, kTransformSize32x32,
+    kTransformSize64x64};
+
+// Square transform of size max(w,h).
+const TransformSize kTransformSizeSquareMax[kNumTransformSizes] = {
+    kTransformSize4x4,   kTransformSize8x8,   kTransformSize16x16,
+    kTransformSize8x8,   kTransformSize8x8,   kTransformSize16x16,
+    kTransformSize32x32, kTransformSize16x16, kTransformSize16x16,
+    kTransformSize16x16, kTransformSize32x32, kTransformSize64x64,
+    kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
+    kTransformSize64x64, kTransformSize64x64, kTransformSize64x64,
+    kTransformSize64x64};
+
+const uint8_t kNumTransformTypesInSet[kNumTransformSets] = {1, 7, 5, 16, 12, 2};
+
+const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4] = {
+    {2, 12, 1, 4},  {2, 15, 1, 6},  {2, 18, 1, 8},  {2, 21, 1, 9},
+    {2, 24, 1, 10}, {2, 29, 1, 11}, {2, 36, 1, 12}, {2, 45, 1, 13},
+    {2, 56, 1, 14}, {2, 68, 1, 15}, {0, 0, 1, 5},   {0, 0, 1, 8},
+    {0, 0, 1, 11},  {0, 0, 1, 14},  {2, 30, 0, 0},  {2, 75, 0, 0}};
+
+const int8_t kSgrProjMultiplierMin[2] = {-96, -32};
+
+const int8_t kSgrProjMultiplierMax[2] = {31, 95};
+
+const int8_t kWienerTapsMin[3] = {-5, -23, -17};
+
+const int8_t kWienerTapsMax[3] = {10, 8, 46};
+
+// This was modified from Upscale_Filter as defined in AV1 Section 7.16, in
+// order to support 16-bit packed NEON operations.
+// The sign of each tap is: - + - + + - + -
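+// For example, applying those signs to row 2, {0, 1, 3, 127, 4, 2, 1, 0},
+// gives the signed taps {0, 1, -3, 127, 4, -2, 1, 0}.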
+alignas(16) const uint8_t
+    kUpscaleFilterUnsigned[kSuperResFilterShifts][kSuperResFilterTaps] = {
+        {0, 0, 0, 128, 0, 0, 0, 0},    {0, 0, 1, 128, 2, 1, 0, 0},
+        {0, 1, 3, 127, 4, 2, 1, 0},    {0, 1, 4, 127, 6, 3, 1, 0},
+        {0, 2, 6, 126, 8, 3, 1, 0},    {0, 2, 7, 125, 11, 4, 1, 0},
+        {1, 2, 8, 125, 13, 5, 2, 0},   {1, 3, 9, 124, 15, 6, 2, 0},
+        {1, 3, 10, 123, 18, 6, 2, 1},  {1, 3, 11, 122, 20, 7, 3, 1},
+        {1, 4, 12, 121, 22, 8, 3, 1},  {1, 4, 13, 120, 25, 9, 3, 1},
+        {1, 4, 14, 118, 28, 9, 3, 1},  {1, 4, 15, 117, 30, 10, 4, 1},
+        {1, 5, 16, 116, 32, 11, 4, 1}, {1, 5, 16, 114, 35, 12, 4, 1},
+        {1, 5, 17, 112, 38, 12, 4, 1}, {1, 5, 18, 111, 40, 13, 5, 1},
+        {1, 5, 18, 109, 43, 14, 5, 1}, {1, 6, 19, 107, 45, 14, 5, 1},
+        {1, 6, 19, 105, 48, 15, 5, 1}, {1, 6, 19, 103, 51, 16, 5, 1},
+        {1, 6, 20, 101, 53, 16, 6, 1}, {1, 6, 20, 99, 56, 17, 6, 1},
+        {1, 6, 20, 97, 58, 17, 6, 1},  {1, 6, 20, 95, 61, 18, 6, 1},
+        {2, 7, 20, 93, 64, 18, 6, 2},  {2, 7, 20, 91, 66, 19, 6, 1},
+        {2, 7, 20, 88, 69, 19, 6, 1},  {2, 7, 20, 86, 71, 19, 6, 1},
+        {2, 7, 20, 84, 74, 20, 7, 2},  {2, 7, 20, 81, 76, 20, 7, 1},
+        {2, 7, 20, 79, 79, 20, 7, 2},  {1, 7, 20, 76, 81, 20, 7, 2},
+        {2, 7, 20, 74, 84, 20, 7, 2},  {1, 6, 19, 71, 86, 20, 7, 2},
+        {1, 6, 19, 69, 88, 20, 7, 2},  {1, 6, 19, 66, 91, 20, 7, 2},
+        {2, 6, 18, 64, 93, 20, 7, 2},  {1, 6, 18, 61, 95, 20, 6, 1},
+        {1, 6, 17, 58, 97, 20, 6, 1},  {1, 6, 17, 56, 99, 20, 6, 1},
+        {1, 6, 16, 53, 101, 20, 6, 1}, {1, 5, 16, 51, 103, 19, 6, 1},
+        {1, 5, 15, 48, 105, 19, 6, 1}, {1, 5, 14, 45, 107, 19, 6, 1},
+        {1, 5, 14, 43, 109, 18, 5, 1}, {1, 5, 13, 40, 111, 18, 5, 1},
+        {1, 4, 12, 38, 112, 17, 5, 1}, {1, 4, 12, 35, 114, 16, 5, 1},
+        {1, 4, 11, 32, 116, 16, 5, 1}, {1, 4, 10, 30, 117, 15, 4, 1},
+        {1, 3, 9, 28, 118, 14, 4, 1},  {1, 3, 9, 25, 120, 13, 4, 1},
+        {1, 3, 8, 22, 121, 12, 4, 1},  {1, 3, 7, 20, 122, 11, 3, 1},
+        {1, 2, 6, 18, 123, 10, 3, 1},  {0, 2, 6, 15, 124, 9, 3, 1},
+        {0, 2, 5, 13, 125, 8, 2, 1},   {0, 1, 4, 11, 125, 7, 2, 0},
+        {0, 1, 3, 8, 126, 6, 2, 0},    {0, 1, 3, 6, 127, 4, 1, 0},
+        {0, 1, 2, 4, 127, 3, 1, 0},    {0, 0, 1, 2, 128, 1, 0, 0},
+};
+
+alignas(8) const int8_t
+    kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8] = {
+        // [-1, 0).
+        {0, 0, 127, 1, 0, 0, 0, 0},
+        {0, -1, 127, 2, 0, 0, 0, 0},
+        {1, -3, 127, 4, -1, 0, 0, 0},
+        {1, -4, 126, 6, -2, 1, 0, 0},
+        {1, -5, 126, 8, -3, 1, 0, 0},
+        {1, -6, 125, 11, -4, 1, 0, 0},
+        {1, -7, 124, 13, -4, 1, 0, 0},
+        {2, -8, 123, 15, -5, 1, 0, 0},
+        {2, -9, 122, 18, -6, 1, 0, 0},
+        {2, -10, 121, 20, -6, 1, 0, 0},
+        {2, -11, 120, 22, -7, 2, 0, 0},
+        {2, -12, 119, 25, -8, 2, 0, 0},
+        {3, -13, 117, 27, -8, 2, 0, 0},
+        {3, -13, 116, 29, -9, 2, 0, 0},
+        {3, -14, 114, 32, -10, 3, 0, 0},
+        {3, -15, 113, 35, -10, 2, 0, 0},
+        {3, -15, 111, 37, -11, 3, 0, 0},
+        {3, -16, 109, 40, -11, 3, 0, 0},
+        {3, -16, 108, 42, -12, 3, 0, 0},
+        {4, -17, 106, 45, -13, 3, 0, 0},
+        {4, -17, 104, 47, -13, 3, 0, 0},
+        {4, -17, 102, 50, -14, 3, 0, 0},
+        {4, -17, 100, 52, -14, 3, 0, 0},
+        {4, -18, 98, 55, -15, 4, 0, 0},
+        {4, -18, 96, 58, -15, 3, 0, 0},
+        {4, -18, 94, 60, -16, 4, 0, 0},
+        {4, -18, 91, 63, -16, 4, 0, 0},
+        {4, -18, 89, 65, -16, 4, 0, 0},
+        {4, -18, 87, 68, -17, 4, 0, 0},
+        {4, -18, 85, 70, -17, 4, 0, 0},
+        {4, -18, 82, 73, -17, 4, 0, 0},
+        {4, -18, 80, 75, -17, 4, 0, 0},
+        {4, -18, 78, 78, -18, 4, 0, 0},
+        {4, -17, 75, 80, -18, 4, 0, 0},
+        {4, -17, 73, 82, -18, 4, 0, 0},
+        {4, -17, 70, 85, -18, 4, 0, 0},
+        {4, -17, 68, 87, -18, 4, 0, 0},
+        {4, -16, 65, 89, -18, 4, 0, 0},
+        {4, -16, 63, 91, -18, 4, 0, 0},
+        {4, -16, 60, 94, -18, 4, 0, 0},
+        {3, -15, 58, 96, -18, 4, 0, 0},
+        {4, -15, 55, 98, -18, 4, 0, 0},
+        {3, -14, 52, 100, -17, 4, 0, 0},
+        {3, -14, 50, 102, -17, 4, 0, 0},
+        {3, -13, 47, 104, -17, 4, 0, 0},
+        {3, -13, 45, 106, -17, 4, 0, 0},
+        {3, -12, 42, 108, -16, 3, 0, 0},
+        {3, -11, 40, 109, -16, 3, 0, 0},
+        {3, -11, 37, 111, -15, 3, 0, 0},
+        {2, -10, 35, 113, -15, 3, 0, 0},
+        {3, -10, 32, 114, -14, 3, 0, 0},
+        {2, -9, 29, 116, -13, 3, 0, 0},
+        {2, -8, 27, 117, -13, 3, 0, 0},
+        {2, -8, 25, 119, -12, 2, 0, 0},
+        {2, -7, 22, 120, -11, 2, 0, 0},
+        {1, -6, 20, 121, -10, 2, 0, 0},
+        {1, -6, 18, 122, -9, 2, 0, 0},
+        {1, -5, 15, 123, -8, 2, 0, 0},
+        {1, -4, 13, 124, -7, 1, 0, 0},
+        {1, -4, 11, 125, -6, 1, 0, 0},
+        {1, -3, 8, 126, -5, 1, 0, 0},
+        {1, -2, 6, 126, -4, 1, 0, 0},
+        {0, -1, 4, 127, -3, 1, 0, 0},
+        {0, 0, 2, 127, -1, 0, 0, 0},
+        // [0, 1).
+        {0, 0, 0, 127, 1, 0, 0, 0},
+        {0, 0, -1, 127, 2, 0, 0, 0},
+        {0, 1, -3, 127, 4, -2, 1, 0},
+        {0, 1, -5, 127, 6, -2, 1, 0},
+        {0, 2, -6, 126, 8, -3, 1, 0},
+        {-1, 2, -7, 126, 11, -4, 2, -1},
+        {-1, 3, -8, 125, 13, -5, 2, -1},
+        {-1, 3, -10, 124, 16, -6, 3, -1},
+        {-1, 4, -11, 123, 18, -7, 3, -1},
+        {-1, 4, -12, 122, 20, -7, 3, -1},
+        {-1, 4, -13, 121, 23, -8, 3, -1},
+        {-2, 5, -14, 120, 25, -9, 4, -1},
+        {-1, 5, -15, 119, 27, -10, 4, -1},
+        {-1, 5, -16, 118, 30, -11, 4, -1},
+        {-2, 6, -17, 116, 33, -12, 5, -1},
+        {-2, 6, -17, 114, 35, -12, 5, -1},
+        {-2, 6, -18, 113, 38, -13, 5, -1},
+        {-2, 7, -19, 111, 41, -14, 6, -2},
+        {-2, 7, -19, 110, 43, -15, 6, -2},
+        {-2, 7, -20, 108, 46, -15, 6, -2},
+        {-2, 7, -20, 106, 49, -16, 6, -2},
+        {-2, 7, -21, 104, 51, -16, 7, -2},
+        {-2, 7, -21, 102, 54, -17, 7, -2},
+        {-2, 8, -21, 100, 56, -18, 7, -2},
+        {-2, 8, -22, 98, 59, -18, 7, -2},
+        {-2, 8, -22, 96, 62, -19, 7, -2},
+        {-2, 8, -22, 94, 64, -19, 7, -2},
+        {-2, 8, -22, 91, 67, -20, 8, -2},
+        {-2, 8, -22, 89, 69, -20, 8, -2},
+        {-2, 8, -22, 87, 72, -21, 8, -2},
+        {-2, 8, -21, 84, 74, -21, 8, -2},
+        {-2, 8, -22, 82, 77, -21, 8, -2},
+        {-2, 8, -21, 79, 79, -21, 8, -2},
+        {-2, 8, -21, 77, 82, -22, 8, -2},
+        {-2, 8, -21, 74, 84, -21, 8, -2},
+        {-2, 8, -21, 72, 87, -22, 8, -2},
+        {-2, 8, -20, 69, 89, -22, 8, -2},
+        {-2, 8, -20, 67, 91, -22, 8, -2},
+        {-2, 7, -19, 64, 94, -22, 8, -2},
+        {-2, 7, -19, 62, 96, -22, 8, -2},
+        {-2, 7, -18, 59, 98, -22, 8, -2},
+        {-2, 7, -18, 56, 100, -21, 8, -2},
+        {-2, 7, -17, 54, 102, -21, 7, -2},
+        {-2, 7, -16, 51, 104, -21, 7, -2},
+        {-2, 6, -16, 49, 106, -20, 7, -2},
+        {-2, 6, -15, 46, 108, -20, 7, -2},
+        {-2, 6, -15, 43, 110, -19, 7, -2},
+        {-2, 6, -14, 41, 111, -19, 7, -2},
+        {-1, 5, -13, 38, 113, -18, 6, -2},
+        {-1, 5, -12, 35, 114, -17, 6, -2},
+        {-1, 5, -12, 33, 116, -17, 6, -2},
+        {-1, 4, -11, 30, 118, -16, 5, -1},
+        {-1, 4, -10, 27, 119, -15, 5, -1},
+        {-1, 4, -9, 25, 120, -14, 5, -2},
+        {-1, 3, -8, 23, 121, -13, 4, -1},
+        {-1, 3, -7, 20, 122, -12, 4, -1},
+        {-1, 3, -7, 18, 123, -11, 4, -1},
+        {-1, 3, -6, 16, 124, -10, 3, -1},
+        {-1, 2, -5, 13, 125, -8, 3, -1},
+        {-1, 2, -4, 11, 126, -7, 2, -1},
+        {0, 1, -3, 8, 126, -6, 2, 0},
+        {0, 1, -2, 6, 127, -5, 1, 0},
+        {0, 1, -2, 4, 127, -3, 1, 0},
+        {0, 0, 0, 2, 127, -1, 0, 0},
+        // [1, 2).
+        {0, 0, 0, 1, 127, 0, 0, 0},
+        {0, 0, 0, -1, 127, 2, 0, 0},
+        {0, 0, 1, -3, 127, 4, -1, 0},
+        {0, 0, 1, -4, 126, 6, -2, 1},
+        {0, 0, 1, -5, 126, 8, -3, 1},
+        {0, 0, 1, -6, 125, 11, -4, 1},
+        {0, 0, 1, -7, 124, 13, -4, 1},
+        {0, 0, 2, -8, 123, 15, -5, 1},
+        {0, 0, 2, -9, 122, 18, -6, 1},
+        {0, 0, 2, -10, 121, 20, -6, 1},
+        {0, 0, 2, -11, 120, 22, -7, 2},
+        {0, 0, 2, -12, 119, 25, -8, 2},
+        {0, 0, 3, -13, 117, 27, -8, 2},
+        {0, 0, 3, -13, 116, 29, -9, 2},
+        {0, 0, 3, -14, 114, 32, -10, 3},
+        {0, 0, 3, -15, 113, 35, -10, 2},
+        {0, 0, 3, -15, 111, 37, -11, 3},
+        {0, 0, 3, -16, 109, 40, -11, 3},
+        {0, 0, 3, -16, 108, 42, -12, 3},
+        {0, 0, 4, -17, 106, 45, -13, 3},
+        {0, 0, 4, -17, 104, 47, -13, 3},
+        {0, 0, 4, -17, 102, 50, -14, 3},
+        {0, 0, 4, -17, 100, 52, -14, 3},
+        {0, 0, 4, -18, 98, 55, -15, 4},
+        {0, 0, 4, -18, 96, 58, -15, 3},
+        {0, 0, 4, -18, 94, 60, -16, 4},
+        {0, 0, 4, -18, 91, 63, -16, 4},
+        {0, 0, 4, -18, 89, 65, -16, 4},
+        {0, 0, 4, -18, 87, 68, -17, 4},
+        {0, 0, 4, -18, 85, 70, -17, 4},
+        {0, 0, 4, -18, 82, 73, -17, 4},
+        {0, 0, 4, -18, 80, 75, -17, 4},
+        {0, 0, 4, -18, 78, 78, -18, 4},
+        {0, 0, 4, -17, 75, 80, -18, 4},
+        {0, 0, 4, -17, 73, 82, -18, 4},
+        {0, 0, 4, -17, 70, 85, -18, 4},
+        {0, 0, 4, -17, 68, 87, -18, 4},
+        {0, 0, 4, -16, 65, 89, -18, 4},
+        {0, 0, 4, -16, 63, 91, -18, 4},
+        {0, 0, 4, -16, 60, 94, -18, 4},
+        {0, 0, 3, -15, 58, 96, -18, 4},
+        {0, 0, 4, -15, 55, 98, -18, 4},
+        {0, 0, 3, -14, 52, 100, -17, 4},
+        {0, 0, 3, -14, 50, 102, -17, 4},
+        {0, 0, 3, -13, 47, 104, -17, 4},
+        {0, 0, 3, -13, 45, 106, -17, 4},
+        {0, 0, 3, -12, 42, 108, -16, 3},
+        {0, 0, 3, -11, 40, 109, -16, 3},
+        {0, 0, 3, -11, 37, 111, -15, 3},
+        {0, 0, 2, -10, 35, 113, -15, 3},
+        {0, 0, 3, -10, 32, 114, -14, 3},
+        {0, 0, 2, -9, 29, 116, -13, 3},
+        {0, 0, 2, -8, 27, 117, -13, 3},
+        {0, 0, 2, -8, 25, 119, -12, 2},
+        {0, 0, 2, -7, 22, 120, -11, 2},
+        {0, 0, 1, -6, 20, 121, -10, 2},
+        {0, 0, 1, -6, 18, 122, -9, 2},
+        {0, 0, 1, -5, 15, 123, -8, 2},
+        {0, 0, 1, -4, 13, 124, -7, 1},
+        {0, 0, 1, -4, 11, 125, -6, 1},
+        {0, 0, 1, -3, 8, 126, -5, 1},
+        {0, 0, 1, -2, 6, 126, -4, 1},
+        {0, 0, 0, -1, 4, 127, -3, 1},
+        {0, 0, 0, 0, 2, 127, -1, 0},
+        // dummy, replicate row index 191.
+        {0, 0, 0, 0, 2, 127, -1, 0}};
+
+alignas(16) const int16_t
+    kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8] = {
+        // [-1, 0).
+        {0, 0, 127, 1, 0, 0, 0, 0},
+        {0, -1, 127, 2, 0, 0, 0, 0},
+        {1, -3, 127, 4, -1, 0, 0, 0},
+        {1, -4, 126, 6, -2, 1, 0, 0},
+        {1, -5, 126, 8, -3, 1, 0, 0},
+        {1, -6, 125, 11, -4, 1, 0, 0},
+        {1, -7, 124, 13, -4, 1, 0, 0},
+        {2, -8, 123, 15, -5, 1, 0, 0},
+        {2, -9, 122, 18, -6, 1, 0, 0},
+        {2, -10, 121, 20, -6, 1, 0, 0},
+        {2, -11, 120, 22, -7, 2, 0, 0},
+        {2, -12, 119, 25, -8, 2, 0, 0},
+        {3, -13, 117, 27, -8, 2, 0, 0},
+        {3, -13, 116, 29, -9, 2, 0, 0},
+        {3, -14, 114, 32, -10, 3, 0, 0},
+        {3, -15, 113, 35, -10, 2, 0, 0},
+        {3, -15, 111, 37, -11, 3, 0, 0},
+        {3, -16, 109, 40, -11, 3, 0, 0},
+        {3, -16, 108, 42, -12, 3, 0, 0},
+        {4, -17, 106, 45, -13, 3, 0, 0},
+        {4, -17, 104, 47, -13, 3, 0, 0},
+        {4, -17, 102, 50, -14, 3, 0, 0},
+        {4, -17, 100, 52, -14, 3, 0, 0},
+        {4, -18, 98, 55, -15, 4, 0, 0},
+        {4, -18, 96, 58, -15, 3, 0, 0},
+        {4, -18, 94, 60, -16, 4, 0, 0},
+        {4, -18, 91, 63, -16, 4, 0, 0},
+        {4, -18, 89, 65, -16, 4, 0, 0},
+        {4, -18, 87, 68, -17, 4, 0, 0},
+        {4, -18, 85, 70, -17, 4, 0, 0},
+        {4, -18, 82, 73, -17, 4, 0, 0},
+        {4, -18, 80, 75, -17, 4, 0, 0},
+        {4, -18, 78, 78, -18, 4, 0, 0},
+        {4, -17, 75, 80, -18, 4, 0, 0},
+        {4, -17, 73, 82, -18, 4, 0, 0},
+        {4, -17, 70, 85, -18, 4, 0, 0},
+        {4, -17, 68, 87, -18, 4, 0, 0},
+        {4, -16, 65, 89, -18, 4, 0, 0},
+        {4, -16, 63, 91, -18, 4, 0, 0},
+        {4, -16, 60, 94, -18, 4, 0, 0},
+        {3, -15, 58, 96, -18, 4, 0, 0},
+        {4, -15, 55, 98, -18, 4, 0, 0},
+        {3, -14, 52, 100, -17, 4, 0, 0},
+        {3, -14, 50, 102, -17, 4, 0, 0},
+        {3, -13, 47, 104, -17, 4, 0, 0},
+        {3, -13, 45, 106, -17, 4, 0, 0},
+        {3, -12, 42, 108, -16, 3, 0, 0},
+        {3, -11, 40, 109, -16, 3, 0, 0},
+        {3, -11, 37, 111, -15, 3, 0, 0},
+        {2, -10, 35, 113, -15, 3, 0, 0},
+        {3, -10, 32, 114, -14, 3, 0, 0},
+        {2, -9, 29, 116, -13, 3, 0, 0},
+        {2, -8, 27, 117, -13, 3, 0, 0},
+        {2, -8, 25, 119, -12, 2, 0, 0},
+        {2, -7, 22, 120, -11, 2, 0, 0},
+        {1, -6, 20, 121, -10, 2, 0, 0},
+        {1, -6, 18, 122, -9, 2, 0, 0},
+        {1, -5, 15, 123, -8, 2, 0, 0},
+        {1, -4, 13, 124, -7, 1, 0, 0},
+        {1, -4, 11, 125, -6, 1, 0, 0},
+        {1, -3, 8, 126, -5, 1, 0, 0},
+        {1, -2, 6, 126, -4, 1, 0, 0},
+        {0, -1, 4, 127, -3, 1, 0, 0},
+        {0, 0, 2, 127, -1, 0, 0, 0},
+        // [0, 1).
+        {0, 0, 0, 127, 1, 0, 0, 0},
+        {0, 0, -1, 127, 2, 0, 0, 0},
+        {0, 1, -3, 127, 4, -2, 1, 0},
+        {0, 1, -5, 127, 6, -2, 1, 0},
+        {0, 2, -6, 126, 8, -3, 1, 0},
+        {-1, 2, -7, 126, 11, -4, 2, -1},
+        {-1, 3, -8, 125, 13, -5, 2, -1},
+        {-1, 3, -10, 124, 16, -6, 3, -1},
+        {-1, 4, -11, 123, 18, -7, 3, -1},
+        {-1, 4, -12, 122, 20, -7, 3, -1},
+        {-1, 4, -13, 121, 23, -8, 3, -1},
+        {-2, 5, -14, 120, 25, -9, 4, -1},
+        {-1, 5, -15, 119, 27, -10, 4, -1},
+        {-1, 5, -16, 118, 30, -11, 4, -1},
+        {-2, 6, -17, 116, 33, -12, 5, -1},
+        {-2, 6, -17, 114, 35, -12, 5, -1},
+        {-2, 6, -18, 113, 38, -13, 5, -1},
+        {-2, 7, -19, 111, 41, -14, 6, -2},
+        {-2, 7, -19, 110, 43, -15, 6, -2},
+        {-2, 7, -20, 108, 46, -15, 6, -2},
+        {-2, 7, -20, 106, 49, -16, 6, -2},
+        {-2, 7, -21, 104, 51, -16, 7, -2},
+        {-2, 7, -21, 102, 54, -17, 7, -2},
+        {-2, 8, -21, 100, 56, -18, 7, -2},
+        {-2, 8, -22, 98, 59, -18, 7, -2},
+        {-2, 8, -22, 96, 62, -19, 7, -2},
+        {-2, 8, -22, 94, 64, -19, 7, -2},
+        {-2, 8, -22, 91, 67, -20, 8, -2},
+        {-2, 8, -22, 89, 69, -20, 8, -2},
+        {-2, 8, -22, 87, 72, -21, 8, -2},
+        {-2, 8, -21, 84, 74, -21, 8, -2},
+        {-2, 8, -22, 82, 77, -21, 8, -2},
+        {-2, 8, -21, 79, 79, -21, 8, -2},
+        {-2, 8, -21, 77, 82, -22, 8, -2},
+        {-2, 8, -21, 74, 84, -21, 8, -2},
+        {-2, 8, -21, 72, 87, -22, 8, -2},
+        {-2, 8, -20, 69, 89, -22, 8, -2},
+        {-2, 8, -20, 67, 91, -22, 8, -2},
+        {-2, 7, -19, 64, 94, -22, 8, -2},
+        {-2, 7, -19, 62, 96, -22, 8, -2},
+        {-2, 7, -18, 59, 98, -22, 8, -2},
+        {-2, 7, -18, 56, 100, -21, 8, -2},
+        {-2, 7, -17, 54, 102, -21, 7, -2},
+        {-2, 7, -16, 51, 104, -21, 7, -2},
+        {-2, 6, -16, 49, 106, -20, 7, -2},
+        {-2, 6, -15, 46, 108, -20, 7, -2},
+        {-2, 6, -15, 43, 110, -19, 7, -2},
+        {-2, 6, -14, 41, 111, -19, 7, -2},
+        {-1, 5, -13, 38, 113, -18, 6, -2},
+        {-1, 5, -12, 35, 114, -17, 6, -2},
+        {-1, 5, -12, 33, 116, -17, 6, -2},
+        {-1, 4, -11, 30, 118, -16, 5, -1},
+        {-1, 4, -10, 27, 119, -15, 5, -1},
+        {-1, 4, -9, 25, 120, -14, 5, -2},
+        {-1, 3, -8, 23, 121, -13, 4, -1},
+        {-1, 3, -7, 20, 122, -12, 4, -1},
+        {-1, 3, -7, 18, 123, -11, 4, -1},
+        {-1, 3, -6, 16, 124, -10, 3, -1},
+        {-1, 2, -5, 13, 125, -8, 3, -1},
+        {-1, 2, -4, 11, 126, -7, 2, -1},
+        {0, 1, -3, 8, 126, -6, 2, 0},
+        {0, 1, -2, 6, 127, -5, 1, 0},
+        {0, 1, -2, 4, 127, -3, 1, 0},
+        {0, 0, 0, 2, 127, -1, 0, 0},
+        // [1, 2).
+        {0, 0, 0, 1, 127, 0, 0, 0},
+        {0, 0, 0, -1, 127, 2, 0, 0},
+        {0, 0, 1, -3, 127, 4, -1, 0},
+        {0, 0, 1, -4, 126, 6, -2, 1},
+        {0, 0, 1, -5, 126, 8, -3, 1},
+        {0, 0, 1, -6, 125, 11, -4, 1},
+        {0, 0, 1, -7, 124, 13, -4, 1},
+        {0, 0, 2, -8, 123, 15, -5, 1},
+        {0, 0, 2, -9, 122, 18, -6, 1},
+        {0, 0, 2, -10, 121, 20, -6, 1},
+        {0, 0, 2, -11, 120, 22, -7, 2},
+        {0, 0, 2, -12, 119, 25, -8, 2},
+        {0, 0, 3, -13, 117, 27, -8, 2},
+        {0, 0, 3, -13, 116, 29, -9, 2},
+        {0, 0, 3, -14, 114, 32, -10, 3},
+        {0, 0, 3, -15, 113, 35, -10, 2},
+        {0, 0, 3, -15, 111, 37, -11, 3},
+        {0, 0, 3, -16, 109, 40, -11, 3},
+        {0, 0, 3, -16, 108, 42, -12, 3},
+        {0, 0, 4, -17, 106, 45, -13, 3},
+        {0, 0, 4, -17, 104, 47, -13, 3},
+        {0, 0, 4, -17, 102, 50, -14, 3},
+        {0, 0, 4, -17, 100, 52, -14, 3},
+        {0, 0, 4, -18, 98, 55, -15, 4},
+        {0, 0, 4, -18, 96, 58, -15, 3},
+        {0, 0, 4, -18, 94, 60, -16, 4},
+        {0, 0, 4, -18, 91, 63, -16, 4},
+        {0, 0, 4, -18, 89, 65, -16, 4},
+        {0, 0, 4, -18, 87, 68, -17, 4},
+        {0, 0, 4, -18, 85, 70, -17, 4},
+        {0, 0, 4, -18, 82, 73, -17, 4},
+        {0, 0, 4, -18, 80, 75, -17, 4},
+        {0, 0, 4, -18, 78, 78, -18, 4},
+        {0, 0, 4, -17, 75, 80, -18, 4},
+        {0, 0, 4, -17, 73, 82, -18, 4},
+        {0, 0, 4, -17, 70, 85, -18, 4},
+        {0, 0, 4, -17, 68, 87, -18, 4},
+        {0, 0, 4, -16, 65, 89, -18, 4},
+        {0, 0, 4, -16, 63, 91, -18, 4},
+        {0, 0, 4, -16, 60, 94, -18, 4},
+        {0, 0, 3, -15, 58, 96, -18, 4},
+        {0, 0, 4, -15, 55, 98, -18, 4},
+        {0, 0, 3, -14, 52, 100, -17, 4},
+        {0, 0, 3, -14, 50, 102, -17, 4},
+        {0, 0, 3, -13, 47, 104, -17, 4},
+        {0, 0, 3, -13, 45, 106, -17, 4},
+        {0, 0, 3, -12, 42, 108, -16, 3},
+        {0, 0, 3, -11, 40, 109, -16, 3},
+        {0, 0, 3, -11, 37, 111, -15, 3},
+        {0, 0, 2, -10, 35, 113, -15, 3},
+        {0, 0, 3, -10, 32, 114, -14, 3},
+        {0, 0, 2, -9, 29, 116, -13, 3},
+        {0, 0, 2, -8, 27, 117, -13, 3},
+        {0, 0, 2, -8, 25, 119, -12, 2},
+        {0, 0, 2, -7, 22, 120, -11, 2},
+        {0, 0, 1, -6, 20, 121, -10, 2},
+        {0, 0, 1, -6, 18, 122, -9, 2},
+        {0, 0, 1, -5, 15, 123, -8, 2},
+        {0, 0, 1, -4, 13, 124, -7, 1},
+        {0, 0, 1, -4, 11, 125, -6, 1},
+        {0, 0, 1, -3, 8, 126, -5, 1},
+        {0, 0, 1, -2, 6, 126, -4, 1},
+        {0, 0, 0, -1, 4, 127, -3, 1},
+        {0, 0, 0, 0, 2, 127, -1, 0},
+        // dummy, replicate row index 191.
+        {0, 0, 0, 0, 2, 127, -1, 0}};
+
+// Every value in |kSubPixelFilters| is even. Divide by 2 to simplify
+// calculations by reducing the range by 1 bit.
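+// As a result, each filter row below sums to 64, i.e.
+// 1 << (kFilterBits - 1), instead of 128.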
+alignas(8) const int8_t kHalfSubPixelFilters[6][16][8] = {
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {0, 1, -3, 63, 4, -1, 0, 0},
+     {0, 1, -5, 61, 9, -2, 0, 0},
+     {0, 1, -6, 58, 14, -4, 1, 0},
+     {0, 1, -7, 55, 19, -5, 1, 0},
+     {0, 1, -7, 51, 24, -6, 1, 0},
+     {0, 1, -8, 47, 29, -6, 1, 0},
+     {0, 1, -7, 42, 33, -6, 1, 0},
+     {0, 1, -7, 38, 38, -7, 1, 0},
+     {0, 1, -6, 33, 42, -7, 1, 0},
+     {0, 1, -6, 29, 47, -8, 1, 0},
+     {0, 1, -6, 24, 51, -7, 1, 0},
+     {0, 1, -5, 19, 55, -7, 1, 0},
+     {0, 1, -4, 14, 58, -6, 1, 0},
+     {0, 0, -2, 9, 61, -5, 1, 0},
+     {0, 0, -1, 4, 63, -3, 1, 0}},
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {0, 1, 14, 31, 17, 1, 0, 0},
+     {0, 0, 13, 31, 18, 2, 0, 0},
+     {0, 0, 11, 31, 20, 2, 0, 0},
+     {0, 0, 10, 30, 21, 3, 0, 0},
+     {0, 0, 9, 29, 22, 4, 0, 0},
+     {0, 0, 8, 28, 23, 5, 0, 0},
+     {0, -1, 8, 27, 24, 6, 0, 0},
+     {0, -1, 7, 26, 26, 7, -1, 0},
+     {0, 0, 6, 24, 27, 8, -1, 0},
+     {0, 0, 5, 23, 28, 8, 0, 0},
+     {0, 0, 4, 22, 29, 9, 0, 0},
+     {0, 0, 3, 21, 30, 10, 0, 0},
+     {0, 0, 2, 20, 31, 11, 0, 0},
+     {0, 0, 2, 18, 31, 13, 0, 0},
+     {0, 0, 1, 17, 31, 14, 1, 0}},
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {-1, 1, -3, 63, 4, -1, 1, 0},
+     {-1, 3, -6, 62, 8, -3, 2, -1},
+     {-1, 4, -9, 60, 13, -5, 3, -1},
+     {-2, 5, -11, 58, 19, -7, 3, -1},
+     {-2, 5, -11, 54, 24, -9, 4, -1},
+     {-2, 5, -12, 50, 30, -10, 4, -1},
+     {-2, 5, -12, 45, 35, -11, 5, -1},
+     {-2, 6, -12, 40, 40, -12, 6, -2},
+     {-1, 5, -11, 35, 45, -12, 5, -2},
+     {-1, 4, -10, 30, 50, -12, 5, -2},
+     {-1, 4, -9, 24, 54, -11, 5, -2},
+     {-1, 3, -7, 19, 58, -11, 5, -2},
+     {-1, 3, -5, 13, 60, -9, 4, -1},
+     {-1, 2, -3, 8, 62, -6, 3, -1},
+     {0, 1, -1, 4, 63, -3, 1, -1}},
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {0, 0, 0, 60, 4, 0, 0, 0},
+     {0, 0, 0, 56, 8, 0, 0, 0},
+     {0, 0, 0, 52, 12, 0, 0, 0},
+     {0, 0, 0, 48, 16, 0, 0, 0},
+     {0, 0, 0, 44, 20, 0, 0, 0},
+     {0, 0, 0, 40, 24, 0, 0, 0},
+     {0, 0, 0, 36, 28, 0, 0, 0},
+     {0, 0, 0, 32, 32, 0, 0, 0},
+     {0, 0, 0, 28, 36, 0, 0, 0},
+     {0, 0, 0, 24, 40, 0, 0, 0},
+     {0, 0, 0, 20, 44, 0, 0, 0},
+     {0, 0, 0, 16, 48, 0, 0, 0},
+     {0, 0, 0, 12, 52, 0, 0, 0},
+     {0, 0, 0, 8, 56, 0, 0, 0},
+     {0, 0, 0, 4, 60, 0, 0, 0}},
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {0, 0, -2, 63, 4, -1, 0, 0},
+     {0, 0, -4, 61, 9, -2, 0, 0},
+     {0, 0, -5, 58, 14, -3, 0, 0},
+     {0, 0, -6, 55, 19, -4, 0, 0},
+     {0, 0, -6, 51, 24, -5, 0, 0},
+     {0, 0, -7, 47, 29, -5, 0, 0},
+     {0, 0, -6, 42, 33, -5, 0, 0},
+     {0, 0, -6, 38, 38, -6, 0, 0},
+     {0, 0, -5, 33, 42, -6, 0, 0},
+     {0, 0, -5, 29, 47, -7, 0, 0},
+     {0, 0, -5, 24, 51, -6, 0, 0},
+     {0, 0, -4, 19, 55, -6, 0, 0},
+     {0, 0, -3, 14, 58, -5, 0, 0},
+     {0, 0, -2, 9, 61, -4, 0, 0},
+     {0, 0, -1, 4, 63, -2, 0, 0}},
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {0, 0, 15, 31, 17, 1, 0, 0},
+     {0, 0, 13, 31, 18, 2, 0, 0},
+     {0, 0, 11, 31, 20, 2, 0, 0},
+     {0, 0, 10, 30, 21, 3, 0, 0},
+     {0, 0, 9, 29, 22, 4, 0, 0},
+     {0, 0, 8, 28, 23, 5, 0, 0},
+     {0, 0, 7, 27, 24, 6, 0, 0},
+     {0, 0, 6, 26, 26, 6, 0, 0},
+     {0, 0, 6, 24, 27, 7, 0, 0},
+     {0, 0, 5, 23, 28, 8, 0, 0},
+     {0, 0, 4, 22, 29, 9, 0, 0},
+     {0, 0, 3, 21, 30, 10, 0, 0},
+     {0, 0, 2, 20, 31, 11, 0, 0},
+     {0, 0, 2, 18, 31, 13, 0, 0},
+     {0, 0, 1, 17, 31, 15, 0, 0}}};
+
+// Absolute values of |kHalfSubPixelFilters|. Used in situations where we know
+// the pattern of the signs and account for it in other ways.
+const uint8_t kAbsHalfSubPixelFilters[6][16][8] = {
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {0, 1, 3, 63, 4, 1, 0, 0},
+     {0, 1, 5, 61, 9, 2, 0, 0},
+     {0, 1, 6, 58, 14, 4, 1, 0},
+     {0, 1, 7, 55, 19, 5, 1, 0},
+     {0, 1, 7, 51, 24, 6, 1, 0},
+     {0, 1, 8, 47, 29, 6, 1, 0},
+     {0, 1, 7, 42, 33, 6, 1, 0},
+     {0, 1, 7, 38, 38, 7, 1, 0},
+     {0, 1, 6, 33, 42, 7, 1, 0},
+     {0, 1, 6, 29, 47, 8, 1, 0},
+     {0, 1, 6, 24, 51, 7, 1, 0},
+     {0, 1, 5, 19, 55, 7, 1, 0},
+     {0, 1, 4, 14, 58, 6, 1, 0},
+     {0, 0, 2, 9, 61, 5, 1, 0},
+     {0, 0, 1, 4, 63, 3, 1, 0}},
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {0, 1, 14, 31, 17, 1, 0, 0},
+     {0, 0, 13, 31, 18, 2, 0, 0},
+     {0, 0, 11, 31, 20, 2, 0, 0},
+     {0, 0, 10, 30, 21, 3, 0, 0},
+     {0, 0, 9, 29, 22, 4, 0, 0},
+     {0, 0, 8, 28, 23, 5, 0, 0},
+     {0, 1, 8, 27, 24, 6, 0, 0},
+     {0, 1, 7, 26, 26, 7, 1, 0},
+     {0, 0, 6, 24, 27, 8, 1, 0},
+     {0, 0, 5, 23, 28, 8, 0, 0},
+     {0, 0, 4, 22, 29, 9, 0, 0},
+     {0, 0, 3, 21, 30, 10, 0, 0},
+     {0, 0, 2, 20, 31, 11, 0, 0},
+     {0, 0, 2, 18, 31, 13, 0, 0},
+     {0, 0, 1, 17, 31, 14, 1, 0}},
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {1, 1, 3, 63, 4, 1, 1, 0},
+     {1, 3, 6, 62, 8, 3, 2, 1},
+     {1, 4, 9, 60, 13, 5, 3, 1},
+     {2, 5, 11, 58, 19, 7, 3, 1},
+     {2, 5, 11, 54, 24, 9, 4, 1},
+     {2, 5, 12, 50, 30, 10, 4, 1},
+     {2, 5, 12, 45, 35, 11, 5, 1},
+     {2, 6, 12, 40, 40, 12, 6, 2},
+     {1, 5, 11, 35, 45, 12, 5, 2},
+     {1, 4, 10, 30, 50, 12, 5, 2},
+     {1, 4, 9, 24, 54, 11, 5, 2},
+     {1, 3, 7, 19, 58, 11, 5, 2},
+     {1, 3, 5, 13, 60, 9, 4, 1},
+     {1, 2, 3, 8, 62, 6, 3, 1},
+     {0, 1, 1, 4, 63, 3, 1, 1}},
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {0, 0, 0, 60, 4, 0, 0, 0},
+     {0, 0, 0, 56, 8, 0, 0, 0},
+     {0, 0, 0, 52, 12, 0, 0, 0},
+     {0, 0, 0, 48, 16, 0, 0, 0},
+     {0, 0, 0, 44, 20, 0, 0, 0},
+     {0, 0, 0, 40, 24, 0, 0, 0},
+     {0, 0, 0, 36, 28, 0, 0, 0},
+     {0, 0, 0, 32, 32, 0, 0, 0},
+     {0, 0, 0, 28, 36, 0, 0, 0},
+     {0, 0, 0, 24, 40, 0, 0, 0},
+     {0, 0, 0, 20, 44, 0, 0, 0},
+     {0, 0, 0, 16, 48, 0, 0, 0},
+     {0, 0, 0, 12, 52, 0, 0, 0},
+     {0, 0, 0, 8, 56, 0, 0, 0},
+     {0, 0, 0, 4, 60, 0, 0, 0}},
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {0, 0, 2, 63, 4, 1, 0, 0},
+     {0, 0, 4, 61, 9, 2, 0, 0},
+     {0, 0, 5, 58, 14, 3, 0, 0},
+     {0, 0, 6, 55, 19, 4, 0, 0},
+     {0, 0, 6, 51, 24, 5, 0, 0},
+     {0, 0, 7, 47, 29, 5, 0, 0},
+     {0, 0, 6, 42, 33, 5, 0, 0},
+     {0, 0, 6, 38, 38, 6, 0, 0},
+     {0, 0, 5, 33, 42, 6, 0, 0},
+     {0, 0, 5, 29, 47, 7, 0, 0},
+     {0, 0, 5, 24, 51, 6, 0, 0},
+     {0, 0, 4, 19, 55, 6, 0, 0},
+     {0, 0, 3, 14, 58, 5, 0, 0},
+     {0, 0, 2, 9, 61, 4, 0, 0},
+     {0, 0, 1, 4, 63, 2, 0, 0}},
+    {{0, 0, 0, 64, 0, 0, 0, 0},
+     {0, 0, 15, 31, 17, 1, 0, 0},
+     {0, 0, 13, 31, 18, 2, 0, 0},
+     {0, 0, 11, 31, 20, 2, 0, 0},
+     {0, 0, 10, 30, 21, 3, 0, 0},
+     {0, 0, 9, 29, 22, 4, 0, 0},
+     {0, 0, 8, 28, 23, 5, 0, 0},
+     {0, 0, 7, 27, 24, 6, 0, 0},
+     {0, 0, 6, 26, 26, 6, 0, 0},
+     {0, 0, 6, 24, 27, 7, 0, 0},
+     {0, 0, 5, 23, 28, 8, 0, 0},
+     {0, 0, 4, 22, 29, 9, 0, 0},
+     {0, 0, 3, 21, 30, 10, 0, 0},
+     {0, 0, 2, 20, 31, 11, 0, 0},
+     {0, 0, 2, 18, 31, 13, 0, 0},
+     {0, 0, 1, 17, 31, 15, 0, 0}}};
+
+// 9.3 -- Dr_Intra_Derivative[]
+// This is a more compact version of the table from the spec. The lookup index
+// is angle / 2 - 1. Note that angle / 3 - 1 would work too, but the
+// calculation is more costly.
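+// For example, angle 45 maps to index 45 / 2 - 1 == 21, which holds 64.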
+const int16_t kDirectionalIntraPredictorDerivative[44] = {
+    //              Approx angle
+    1023, 0,     // 3, ...
+    547,         // 6, ...
+    372,  0, 0,  // 9, ...
+    273,         // 14, ...
+    215,  0,     // 17, ...
+    178,         // 20, ...
+    151,  0,     // 23, ... (113 & 203 are base angles)
+    132,         // 26, ...
+    116,  0,     // 29, ...
+    102,  0,     // 32, ...
+    90,          // 36, ...
+    80,   0,     // 39, ...
+    71,          // 42, ...
+    64,   0,     // 45, ... (45 & 135 are base angles)
+    57,          // 48, ...
+    51,   0,     // 51, ...
+    45,   0,     // 54, ...
+    40,          // 58, ...
+    35,   0,     // 61, ...
+    31,          // 64, ...
+    27,   0,     // 67, ... (67 & 157 are base angles)
+    23,          // 70, ...
+    19,   0,     // 73, ...
+    15,   0,     // 76, ...
+    11,   0,     // 81, ...
+    7,           // 84, ...
+    3,           // 87, ...
+};
+
+const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes] = {
+    {0, 1}, {2, 2}, {3, 3}};
+
+}  // namespace libgav1
diff --git a/src/utils/constants.h b/src/utils/constants.h
new file mode 100644 (file)
index 0000000..8281aad
--- /dev/null
@@ -0,0 +1,815 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CONSTANTS_H_
+#define LIBGAV1_SRC_UTILS_CONSTANTS_H_
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/utils/bit_mask_set.h"
+
+namespace libgav1 {
+
+// Returns the number of elements between begin (inclusive) and end (inclusive).
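+// For example, EnumRangeLength(kReferenceFrameLast, kReferenceFrameGolden)
+// is 4 (see kNumForwardReferenceTypes below).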
+constexpr int EnumRangeLength(int begin, int end) { return end - begin + 1; }
+
+enum {
+// Maximum number of threads that the library will ever create.
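+// The default can be overridden at compile time, e.g. by defining
+// LIBGAV1_MAX_THREADS=4 on the compiler command line.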
+#if defined(LIBGAV1_MAX_THREADS) && LIBGAV1_MAX_THREADS > 0
+  kMaxThreads = LIBGAV1_MAX_THREADS
+#else
+  kMaxThreads = 128
+#endif
+};  // anonymous enum
+
+enum {
+  // Documentation variables.
+  kBitdepth8 = 8,
+  kBitdepth10 = 10,
+  kBitdepth12 = 12,
+  kInvalidMvValue = -32768,
+  kCdfMaxProbability = 32768,
+  kBlockWidthCount = 5,
+  kMaxSegments = 8,
+  kMinQuantizer = 0,
+  kMinLossyQuantizer = 1,
+  kMaxQuantizer = 255,
+  // Quantizer matrix is used only when level < 15.
+  kNumQuantizerLevelsForQuantizerMatrix = 15,
+  kFrameLfCount = 4,
+  kMaxLoopFilterValue = 63,
+  kNum4x4In64x64 = 256,
+  kMaxAngleDelta = 3,
+  kDirectionalIntraModes = 8,
+  kMaxSuperBlockSizeLog2 = 7,
+  kMinSuperBlockSizeLog2 = 6,
+  kGlobalMotionReadControl = 3,
+  kSuperResScaleNumerator = 8,
+  kBooleanSymbolCount = 2,
+  kRestorationTypeSymbolCount = 3,
+  kSgrProjParamsBits = 4,
+  kSgrProjPrecisionBits = 7,
+  // Precision of a division table (mtable).
+  kSgrProjScaleBits = 20,
+  kSgrProjReciprocalBits = 12,
+  // Core self-guided restoration precision bits.
+  kSgrProjSgrBits = 8,
+  // Precision bits of generated values higher than source before projection.
+  kSgrProjRestoreBits = 4,
+  // Padding on left and right side of a restoration block.
+  // 3 is enough, but padding to 4 is more efficient, and makes the temporary
+  // source buffer 8-pixel aligned.
+  kRestorationHorizontalBorder = 4,
+  // Padding on top and bottom side of a restoration block.
+  kRestorationVerticalBorder = 2,
+  kCdefBorder = 2,             // Padding on each side of a cdef block.
+  kConvolveBorderLeftTop = 3,  // Left/top padding of a convolve block.
+  // Right/bottom padding of a convolve block. This needs to be 4 at minimum,
+  // but was increased to simplify the SIMD loads in
+  // ConvolveCompoundScale2D_NEON() and ConvolveScale2D_NEON().
+  kConvolveBorderRight = 8,
+  kConvolveScaleBorderRight = 15,
+  kConvolveBorderBottom = 4,
+  kSubPixelTaps = 8,
+  kWienerFilterBits = 7,
+  kWienerFilterTaps = 7,
+  kMaxPaletteSize = 8,
+  kMinPaletteSize = 2,
+  kMaxPaletteSquare = 64,
+  kBorderPixels = 64,
+  // The final blending process for film grain needs room to overwrite and read
+  // with SIMD instructions. The maximum overwrite is 7 pixels, but the border
+  // is required to be a multiple of 32 by YuvBuffer::Realloc, so that
+  // subsampled chroma borders are 16-aligned.
+  kBorderPixelsFilmGrain = 32,
+  // These constants are the minimum left, right, top, and bottom border sizes
+  // in pixels as an extension of the frame boundary. The minimum border sizes
+  // are derived from the following requirements:
+  // - Warp_C() may read up to 13 pixels before or after a row.
+  // - Warp_NEON() may read up to 13 pixels before a row. It may read up to 14
+  //   pixels after a row, but the value of the last read pixel is not used.
+  // - Warp_C() and Warp_NEON() may read up to 13 pixels above the top row and
+  //   13 pixels below the bottom row.
+  kMinLeftBorderPixels = 13,
+  kMinRightBorderPixels = 13,
+  kMinTopBorderPixels = 13,
+  kMinBottomBorderPixels = 13,
+  kWarpedModelPrecisionBits = 16,
+  kMaxRefMvStackSize = 8,
+  kMaxLeastSquaresSamples = 8,
+  kMaxTemporalMvCandidates = 19,
+  // The SIMD implementations of motion vector projection functions always
+  // process 2 or 4 elements together, so we pad the corresponding buffers to
+  // size 20.
+  kMaxTemporalMvCandidatesWithPadding = 20,
+  kMaxSuperBlockSizeInPixels = 128,
+  kMaxScaledSuperBlockSizeInPixels = 128 * 2,
+  kMaxSuperBlockSizeSquareInPixels = 128 * 128,
+  kNum4x4InLoopFilterUnit = 16,
+  kNum4x4InLoopRestorationUnit = 16,
+  kProjectionMvClamp = (1 << 14) - 1,  // == 16383
+  kProjectionMvMaxHorizontalOffset = 8,
+  kCdefUnitSize = 64,
+  kCdefUnitSizeWithBorders = kCdefUnitSize + 2 * kCdefBorder,
+  kRestorationUnitOffset = 8,
+  // Loop restoration's processing unit size is fixed as 64x64.
+  kRestorationUnitHeight = 64,
+  kRestorationUnitWidth = 256,
+  kRestorationUnitHeightWithBorders =
+      kRestorationUnitHeight + 2 * kRestorationVerticalBorder,
+  kRestorationUnitWidthWithBorders =
+      kRestorationUnitWidth + 2 * kRestorationHorizontalBorder,
+  kSuperResFilterBits = 6,
+  kSuperResFilterShifts = 1 << kSuperResFilterBits,
+  kSuperResFilterTaps = 8,
+  kSuperResScaleBits = 14,
+  kSuperResExtraBits = kSuperResScaleBits - kSuperResFilterBits,
+  kSuperResScaleMask = (1 << 14) - 1,
+  kSuperResHorizontalBorder = 4,
+  kSuperResVerticalBorder = 1,
+  // The SIMD implementations of superres calculate up to 15 extra upscaled
+  // pixels, which will over-read up to 15 downscaled pixels at the end of each
+  // row. Set the padding to 16 for alignment purposes.
+  kSuperResHorizontalPadding = 16,
+  // TODO(chengchen): consider merging these constants:
+  // kFilterBits, kWienerFilterBits, and kSgrProjPrecisionBits, which are all
+  // 7. They are designed to match AV1 convolution, which increases coeff
+  // values by up to 7 bits, so they could be combined into kFilterBits alone.
+  kFilterBits = 7,
+  // A sub pixel in AV1 represents a pixel location that is not at an integer
+  // position. Sub pixels are in units of 1/16 (1 << kSubPixelBits) of an
+  // integer pixel. Sub pixel values are interpolated from adjacent integer
+  // pixel values; the interpolation is a filtering process.
+  kSubPixelBits = 4,
+  kSubPixelMask = (1 << kSubPixelBits) - 1,
+  // Precision bits when computing inter prediction locations.
+  kScaleSubPixelBits = 10,
+  kWarpParamRoundingBits = 6,
+  // Number of fractional bits of lookup in divisor lookup table.
+  kDivisorLookupBits = 8,
+  // Number of fractional bits of entries in divisor lookup table.
+  kDivisorLookupPrecisionBits = 14,
+  // Number of phases used in warped filtering.
+  kWarpedPixelPrecisionShifts = 1 << 6,
+  kResidualPaddingVertical = 4,
+  kWedgeMaskMasterSize = 64,
+  kMaxFrameDistance = 31,
+  kReferenceFrameScalePrecision = 14,
+  kNumWienerCoefficients = 3,
+  kLoopFilterMaxModeDeltas = 2,
+  kMaxCdefStrengths = 8,
+  kCdefLargeValue = 0x4000,  // Used to indicate where CDEF is not available.
+  kMaxTileColumns = 64,
+  kMaxTileRows = 64,
+  kMaxOperatingPoints = 32,
+  // There can be a maximum of 4 spatial layers and 8 temporal layers.
+  kMaxLayers = 32,
+  // The cache line size should ideally be queried at run time. 64 is a common
+  // cache line size of x86 CPUs. Web searches showed the cache line size of ARM
+  // CPUs is 32 or 64 bytes. So aligning to a 64-byte boundary will work for
+  // all CPUs that we care about, even though it is excessive for some ARM
+  // CPUs.
+  //
+  // On Linux, the cache line size can be looked up with the command:
+  //   getconf LEVEL1_DCACHE_LINESIZE
+  kCacheLineSize = 64,
+  // InterRound0, Section 7.11.3.2.
+  kInterRoundBitsHorizontal = 3,  // 8 & 10-bit.
+  kInterRoundBitsHorizontal12bpp = 5,
+  kInterRoundBitsCompoundVertical = 7,  // 8, 10 & 12-bit compound prediction.
+  kInterRoundBitsVertical = 11,         // 8 & 10-bit, single prediction.
+  kInterRoundBitsVertical12bpp = 9,
+  // Offset applied to 10bpp and 12bpp predictors to allow storing them in
+  // uint16_t. Removed before blending.
+  kCompoundOffset = (1 << 14) + (1 << 13),
+};  // anonymous enum
+
+enum FrameType : uint8_t {
+  kFrameKey,
+  kFrameInter,
+  kFrameIntraOnly,
+  kFrameSwitch
+};
+
+enum Plane : uint8_t { kPlaneY, kPlaneU, kPlaneV };
+enum : uint8_t { kMaxPlanesMonochrome = kPlaneY + 1, kMaxPlanes = kPlaneV + 1 };
+
+// The plane types, called luma and chroma in the spec.
+enum PlaneType : uint8_t { kPlaneTypeY, kPlaneTypeUV, kNumPlaneTypes };
+
+enum ReferenceFrameType : int8_t {
+  kReferenceFrameNone = -1,
+  kReferenceFrameIntra,
+  kReferenceFrameLast,
+  kReferenceFrameLast2,
+  kReferenceFrameLast3,
+  kReferenceFrameGolden,
+  kReferenceFrameBackward,
+  kReferenceFrameAlternate2,
+  kReferenceFrameAlternate,
+  kNumReferenceFrameTypes,
+  kNumInterReferenceFrameTypes =
+      EnumRangeLength(kReferenceFrameLast, kReferenceFrameAlternate),
+  kNumForwardReferenceTypes =
+      EnumRangeLength(kReferenceFrameLast, kReferenceFrameGolden),
+  kNumBackwardReferenceTypes =
+      EnumRangeLength(kReferenceFrameBackward, kReferenceFrameAlternate)
+};
+
+enum {
+  // Unidirectional compound reference pairs that are signaled explicitly:
+  // {kReferenceFrameLast, kReferenceFrameLast2},
+  // {kReferenceFrameLast, kReferenceFrameLast3},
+  // {kReferenceFrameLast, kReferenceFrameGolden},
+  // {kReferenceFrameBackward, kReferenceFrameAlternate}
+  kExplicitUnidirectionalCompoundReferences = 4,
+  // Other unidirectional compound reference pairs:
+  // {kReferenceFrameLast2, kReferenceFrameLast3},
+  // {kReferenceFrameLast2, kReferenceFrameGolden},
+  // {kReferenceFrameLast3, kReferenceFrameGolden},
+  // {kReferenceFrameBackward, kReferenceFrameAlternate2},
+  // {kReferenceFrameAlternate2, kReferenceFrameAlternate}
+  kUnidirectionalCompoundReferences =
+      kExplicitUnidirectionalCompoundReferences + 5,
+};  // anonymous enum
+
+enum BlockSize : uint8_t {
+  kBlock4x4,
+  kBlock4x8,
+  kBlock4x16,
+  kBlock8x4,
+  kBlock8x8,
+  kBlock8x16,
+  kBlock8x32,
+  kBlock16x4,
+  kBlock16x8,
+  kBlock16x16,
+  kBlock16x32,
+  kBlock16x64,
+  kBlock32x8,
+  kBlock32x16,
+  kBlock32x32,
+  kBlock32x64,
+  kBlock64x16,
+  kBlock64x32,
+  kBlock64x64,
+  kBlock64x128,
+  kBlock128x64,
+  kBlock128x128,
+  kMaxBlockSizes,
+  kBlockInvalid
+};
+
+//  Partition types.  R: Recursive
+//
+//  None          Horizontal    Vertical      Split
+//  +-------+     +-------+     +---+---+     +---+---+
+//  |       |     |       |     |   |   |     | R | R |
+//  |       |     +-------+     |   |   |     +---+---+
+//  |       |     |       |     |   |   |     | R | R |
+//  +-------+     +-------+     +---+---+     +---+---+
+//
+//  Horizontal    Horizontal    Vertical      Vertical
+//  with top      with bottom   with left     with right
+//  split         split         split         split
+//  +---+---+     +-------+     +---+---+     +---+---+
+//  |   |   |     |       |     |   |   |     |   |   |
+//  +---+---+     +---+---+     +---+   |     |   +---+
+//  |       |     |   |   |     |   |   |     |   |   |
+//  +-------+     +---+---+     +---+---+     +---+---+
+//
+//  Horizontal4   Vertical4
+//  +-----+       +-+-+-+
+//  +-----+       | | | |
+//  +-----+       | | | |
+//  +-----+       +-+-+-+
+enum Partition : uint8_t {
+  kPartitionNone,
+  kPartitionHorizontal,
+  kPartitionVertical,
+  kPartitionSplit,
+  kPartitionHorizontalWithTopSplit,
+  kPartitionHorizontalWithBottomSplit,
+  kPartitionVerticalWithLeftSplit,
+  kPartitionVerticalWithRightSplit,
+  kPartitionHorizontal4,
+  kPartitionVertical4
+};
+enum : uint8_t { kMaxPartitionTypes = kPartitionVertical4 + 1 };
+
+enum PredictionMode : uint8_t {
+  // Intra prediction modes.
+  kPredictionModeDc,
+  kPredictionModeVertical,
+  kPredictionModeHorizontal,
+  kPredictionModeD45,
+  kPredictionModeD135,
+  kPredictionModeD113,
+  kPredictionModeD157,
+  kPredictionModeD203,
+  kPredictionModeD67,
+  kPredictionModeSmooth,
+  kPredictionModeSmoothVertical,
+  kPredictionModeSmoothHorizontal,
+  kPredictionModePaeth,
+  kPredictionModeChromaFromLuma,
+  // Single inter prediction modes.
+  kPredictionModeNearestMv,
+  kPredictionModeNearMv,
+  kPredictionModeGlobalMv,
+  kPredictionModeNewMv,
+  // Compound inter prediction modes.
+  kPredictionModeNearestNearestMv,
+  kPredictionModeNearNearMv,
+  kPredictionModeNearestNewMv,
+  kPredictionModeNewNearestMv,
+  kPredictionModeNearNewMv,
+  kPredictionModeNewNearMv,
+  kPredictionModeGlobalGlobalMv,
+  kPredictionModeNewNewMv,
+  kNumPredictionModes,
+  kNumCompoundInterPredictionModes =
+      EnumRangeLength(kPredictionModeNearestNearestMv, kPredictionModeNewNewMv),
+  kIntraPredictionModesY =
+      EnumRangeLength(kPredictionModeDc, kPredictionModePaeth),
+  kIntraPredictionModesUV =
+      EnumRangeLength(kPredictionModeDc, kPredictionModeChromaFromLuma),
+  kPredictionModeInvalid = 255
+};
+
+enum InterIntraMode : uint8_t {
+  kInterIntraModeDc,
+  kInterIntraModeVertical,
+  kInterIntraModeHorizontal,
+  kInterIntraModeSmooth,
+  kNumInterIntraModes
+};
+
+enum MotionMode : uint8_t {
+  kMotionModeSimple,
+  kMotionModeObmc,  // Overlapped block motion compensation.
+  kMotionModeLocalWarp,
+  kNumMotionModes
+};
+
+enum TxMode : uint8_t {
+  kTxModeOnly4x4,
+  kTxModeLargest,
+  kTxModeSelect,
+  kNumTxModes
+};
+
+// These enums are named as kType1Type2 where Type1 is the transform type for
+// the rows and Type2 is the transform type for the columns.
+enum TransformType : uint8_t {
+  kTransformTypeDctDct,
+  kTransformTypeAdstDct,
+  kTransformTypeDctAdst,
+  kTransformTypeAdstAdst,
+  kTransformTypeFlipadstDct,
+  kTransformTypeDctFlipadst,
+  kTransformTypeFlipadstFlipadst,
+  kTransformTypeAdstFlipadst,
+  kTransformTypeFlipadstAdst,
+  kTransformTypeIdentityIdentity,
+  kTransformTypeIdentityDct,
+  kTransformTypeDctIdentity,
+  kTransformTypeIdentityAdst,
+  kTransformTypeAdstIdentity,
+  kTransformTypeIdentityFlipadst,
+  kTransformTypeFlipadstIdentity,
+  kNumTransformTypes
+};
+
+constexpr BitMaskSet kTransformFlipColumnsMask(kTransformTypeFlipadstDct,
+                                               kTransformTypeFlipadstAdst,
+                                               kTransformTypeFlipadstIdentity,
+                                               kTransformTypeFlipadstFlipadst);
+constexpr BitMaskSet kTransformFlipRowsMask(kTransformTypeDctFlipadst,
+                                            kTransformTypeAdstFlipadst,
+                                            kTransformTypeIdentityFlipadst,
+                                            kTransformTypeFlipadstFlipadst);
+
+enum TransformSize : uint8_t {
+  kTransformSize4x4,
+  kTransformSize4x8,
+  kTransformSize4x16,
+  kTransformSize8x4,
+  kTransformSize8x8,
+  kTransformSize8x16,
+  kTransformSize8x32,
+  kTransformSize16x4,
+  kTransformSize16x8,
+  kTransformSize16x16,
+  kTransformSize16x32,
+  kTransformSize16x64,
+  kTransformSize32x8,
+  kTransformSize32x16,
+  kTransformSize32x32,
+  kTransformSize32x64,
+  kTransformSize64x16,
+  kTransformSize64x32,
+  kTransformSize64x64,
+  kNumTransformSizes
+};
+
+enum TransformSet : uint8_t {
+  // DCT Only (1).
+  kTransformSetDctOnly,
+  // 2D-DCT and 2D-ADST without flip (4) + Identity (1) + 1D Horizontal/Vertical
+  // DCT (2) = Total (7).
+  kTransformSetIntra1,
+  // 2D-DCT and 2D-ADST without flip (4) + Identity (1) = Total (5).
+  kTransformSetIntra2,
+  // All transforms = Total (16).
+  kTransformSetInter1,
+  // 2D-DCT and 2D-ADST with flip (9) + Identity (1) + 1D Horizontal/Vertical
+  // DCT (2) = Total (12).
+  kTransformSetInter2,
+  // DCT (1) + Identity (1) = Total (2).
+  kTransformSetInter3,
+  kNumTransformSets
+};
+
+enum TransformClass : uint8_t {
+  kTransformClass2D,
+  kTransformClassHorizontal,
+  kTransformClassVertical,
+  kNumTransformClasses
+};
+
+enum FilterIntraPredictor : uint8_t {
+  kFilterIntraPredictorDc,
+  kFilterIntraPredictorVertical,
+  kFilterIntraPredictorHorizontal,
+  kFilterIntraPredictorD157,
+  kFilterIntraPredictorPaeth,
+  kNumFilterIntraPredictors
+};
+
+enum ObmcDirection : uint8_t {
+  kObmcDirectionVertical,
+  kObmcDirectionHorizontal,
+  kNumObmcDirections
+};
+
+// In AV1 the name of the filter refers to the direction of filter application.
+// Horizontal refers to the column edge and vertical to the row edge.
+enum LoopFilterType : uint8_t {
+  kLoopFilterTypeVertical,
+  kLoopFilterTypeHorizontal,
+  kNumLoopFilterTypes
+};
+
+enum LoopFilterTransformSizeId : uint8_t {
+  kLoopFilterTransformSizeId4x4,
+  kLoopFilterTransformSizeId8x8,
+  kLoopFilterTransformSizeId16x16,
+  kNumLoopFilterTransformSizeIds
+};
+
+enum LoopRestorationType : uint8_t {
+  kLoopRestorationTypeNone,
+  kLoopRestorationTypeSwitchable,
+  kLoopRestorationTypeWiener,
+  kLoopRestorationTypeSgrProj,  // Self-guided projection filter.
+  kNumLoopRestorationTypes
+};
+
+enum CompoundReferenceType : uint8_t {
+  kCompoundReferenceUnidirectional,
+  kCompoundReferenceBidirectional,
+  kNumCompoundReferenceTypes
+};
+
+enum CompoundPredictionType : uint8_t {
+  kCompoundPredictionTypeWedge,
+  kCompoundPredictionTypeDiffWeighted,
+  kCompoundPredictionTypeAverage,
+  kCompoundPredictionTypeIntra,
+  kCompoundPredictionTypeDistance,
+  kNumCompoundPredictionTypes,
+  // Number of compound prediction types that are explicitly signaled in the
+  // bitstream (in the compound_type syntax element).
+  kNumExplicitCompoundPredictionTypes = 2
+};
+
+enum InterpolationFilter : uint8_t {
+  kInterpolationFilterEightTap,
+  kInterpolationFilterEightTapSmooth,
+  kInterpolationFilterEightTapSharp,
+  kInterpolationFilterBilinear,
+  kInterpolationFilterSwitchable,
+  kNumInterpolationFilters,
+  // Number of interpolation filters that can be explicitly signaled in the
+  // compressed headers (when the uncompressed headers allow switchable
+  // interpolation filters) of the bitstream.
+  kNumExplicitInterpolationFilters = EnumRangeLength(
+      kInterpolationFilterEightTap, kInterpolationFilterEightTapSharp)
+};
+
+enum MvJointType : uint8_t {
+  kMvJointTypeZero,
+  kMvJointTypeHorizontalNonZeroVerticalZero,
+  kMvJointTypeHorizontalZeroVerticalNonZero,
+  kMvJointTypeNonZero,
+  kNumMvJointTypes
+};
+
+enum ObuType : int8_t {
+  kObuInvalid = -1,
+  kObuSequenceHeader = 1,
+  kObuTemporalDelimiter = 2,
+  kObuFrameHeader = 3,
+  kObuTileGroup = 4,
+  kObuMetadata = 5,
+  kObuFrame = 6,
+  kObuRedundantFrameHeader = 7,
+  kObuTileList = 8,
+  kObuPadding = 15,
+};
+
+constexpr BitMaskSet kPredictionModeSmoothMask(kPredictionModeSmooth,
+                                               kPredictionModeSmoothHorizontal,
+                                               kPredictionModeSmoothVertical);
+
+//------------------------------------------------------------------------------
+// ToString()
+//
+// These functions are meant to be used only in debug logging and within tests.
+// They are defined inline to avoid including the strings in the release
+// library when logging is disabled; unreferenced functions will not be added to
+// any object file in that case.
+
+inline const char* ToString(const BlockSize size) {
+  switch (size) {
+    case kBlock4x4:
+      return "kBlock4x4";
+    case kBlock4x8:
+      return "kBlock4x8";
+    case kBlock4x16:
+      return "kBlock4x16";
+    case kBlock8x4:
+      return "kBlock8x4";
+    case kBlock8x8:
+      return "kBlock8x8";
+    case kBlock8x16:
+      return "kBlock8x16";
+    case kBlock8x32:
+      return "kBlock8x32";
+    case kBlock16x4:
+      return "kBlock16x4";
+    case kBlock16x8:
+      return "kBlock16x8";
+    case kBlock16x16:
+      return "kBlock16x16";
+    case kBlock16x32:
+      return "kBlock16x32";
+    case kBlock16x64:
+      return "kBlock16x64";
+    case kBlock32x8:
+      return "kBlock32x8";
+    case kBlock32x16:
+      return "kBlock32x16";
+    case kBlock32x32:
+      return "kBlock32x32";
+    case kBlock32x64:
+      return "kBlock32x64";
+    case kBlock64x16:
+      return "kBlock64x16";
+    case kBlock64x32:
+      return "kBlock64x32";
+    case kBlock64x64:
+      return "kBlock64x64";
+    case kBlock64x128:
+      return "kBlock64x128";
+    case kBlock128x64:
+      return "kBlock128x64";
+    case kBlock128x128:
+      return "kBlock128x128";
+    case kMaxBlockSizes:
+      return "kMaxBlockSizes";
+    case kBlockInvalid:
+      return "kBlockInvalid";
+  }
+  abort();
+}
+
+inline const char* ToString(const InterIntraMode mode) {
+  switch (mode) {
+    case kInterIntraModeDc:
+      return "kInterIntraModeDc";
+    case kInterIntraModeVertical:
+      return "kInterIntraModeVertical";
+    case kInterIntraModeHorizontal:
+      return "kInterIntraModeHorizontal";
+    case kInterIntraModeSmooth:
+      return "kInterIntraModeSmooth";
+    case kNumInterIntraModes:
+      return "kNumInterIntraModes";
+  }
+  abort();
+}
+
+inline const char* ToString(const ObmcDirection direction) {
+  switch (direction) {
+    case kObmcDirectionVertical:
+      return "kObmcDirectionVertical";
+    case kObmcDirectionHorizontal:
+      return "kObmcDirectionHorizontal";
+    case kNumObmcDirections:
+      return "kNumObmcDirections";
+  }
+  abort();
+}
+
+inline const char* ToString(const LoopRestorationType type) {
+  switch (type) {
+    case kLoopRestorationTypeNone:
+      return "kLoopRestorationTypeNone";
+    case kLoopRestorationTypeSwitchable:
+      return "kLoopRestorationTypeSwitchable";
+    case kLoopRestorationTypeWiener:
+      return "kLoopRestorationTypeWiener";
+    case kLoopRestorationTypeSgrProj:
+      return "kLoopRestorationTypeSgrProj";
+    case kNumLoopRestorationTypes:
+      return "kNumLoopRestorationTypes";
+  }
+  abort();
+}
+
+inline const char* ToString(const TransformSize size) {
+  switch (size) {
+    case kTransformSize4x4:
+      return "kTransformSize4x4";
+    case kTransformSize4x8:
+      return "kTransformSize4x8";
+    case kTransformSize4x16:
+      return "kTransformSize4x16";
+    case kTransformSize8x4:
+      return "kTransformSize8x4";
+    case kTransformSize8x8:
+      return "kTransformSize8x8";
+    case kTransformSize8x16:
+      return "kTransformSize8x16";
+    case kTransformSize8x32:
+      return "kTransformSize8x32";
+    case kTransformSize16x4:
+      return "kTransformSize16x4";
+    case kTransformSize16x8:
+      return "kTransformSize16x8";
+    case kTransformSize16x16:
+      return "kTransformSize16x16";
+    case kTransformSize16x32:
+      return "kTransformSize16x32";
+    case kTransformSize16x64:
+      return "kTransformSize16x64";
+    case kTransformSize32x8:
+      return "kTransformSize32x8";
+    case kTransformSize32x16:
+      return "kTransformSize32x16";
+    case kTransformSize32x32:
+      return "kTransformSize32x32";
+    case kTransformSize32x64:
+      return "kTransformSize32x64";
+    case kTransformSize64x16:
+      return "kTransformSize64x16";
+    case kTransformSize64x32:
+      return "kTransformSize64x32";
+    case kTransformSize64x64:
+      return "kTransformSize64x64";
+    case kNumTransformSizes:
+      return "kNumTransformSizes";
+  }
+  abort();
+}
+
+inline const char* ToString(const TransformType type) {
+  switch (type) {
+    case kTransformTypeDctDct:
+      return "kTransformTypeDctDct";
+    case kTransformTypeAdstDct:
+      return "kTransformTypeAdstDct";
+    case kTransformTypeDctAdst:
+      return "kTransformTypeDctAdst";
+    case kTransformTypeAdstAdst:
+      return "kTransformTypeAdstAdst";
+    case kTransformTypeFlipadstDct:
+      return "kTransformTypeFlipadstDct";
+    case kTransformTypeDctFlipadst:
+      return "kTransformTypeDctFlipadst";
+    case kTransformTypeFlipadstFlipadst:
+      return "kTransformTypeFlipadstFlipadst";
+    case kTransformTypeAdstFlipadst:
+      return "kTransformTypeAdstFlipadst";
+    case kTransformTypeFlipadstAdst:
+      return "kTransformTypeFlipadstAdst";
+    case kTransformTypeIdentityIdentity:
+      return "kTransformTypeIdentityIdentity";
+    case kTransformTypeIdentityDct:
+      return "kTransformTypeIdentityDct";
+    case kTransformTypeDctIdentity:
+      return "kTransformTypeDctIdentity";
+    case kTransformTypeIdentityAdst:
+      return "kTransformTypeIdentityAdst";
+    case kTransformTypeAdstIdentity:
+      return "kTransformTypeAdstIdentity";
+    case kTransformTypeIdentityFlipadst:
+      return "kTransformTypeIdentityFlipadst";
+    case kTransformTypeFlipadstIdentity:
+      return "kTransformTypeFlipadstIdentity";
+    // Case to quiet the compiler.
+    case kNumTransformTypes:
+      return "kNumTransformTypes";
+  }
+  abort();
+}
+
+//------------------------------------------------------------------------------
+
+extern const uint8_t k4x4WidthLog2[kMaxBlockSizes];
+
+extern const uint8_t k4x4HeightLog2[kMaxBlockSizes];
+
+extern const uint8_t kNum4x4BlocksWide[kMaxBlockSizes];
+
+extern const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes];
+
+extern const uint8_t kBlockWidthPixels[kMaxBlockSizes];
+
+extern const uint8_t kBlockHeightPixels[kMaxBlockSizes];
+
+extern const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes];
+
+extern const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2];
+
+extern const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1];
+
+extern const uint8_t kTransformWidth[kNumTransformSizes];
+
+extern const uint8_t kTransformHeight[kNumTransformSizes];
+
+extern const uint8_t kTransformWidth4x4[kNumTransformSizes];
+
+extern const uint8_t kTransformHeight4x4[kNumTransformSizes];
+
+extern const uint8_t kTransformWidthLog2[kNumTransformSizes];
+
+extern const uint8_t kTransformHeightLog2[kNumTransformSizes];
+
+extern const TransformSize kSplitTransformSize[kNumTransformSizes];
+
+// Square transform of size min(w,h).
+extern const TransformSize kTransformSizeSquareMin[kNumTransformSizes];
+
+// Square transform of size max(w,h).
+extern const TransformSize kTransformSizeSquareMax[kNumTransformSizes];
+
+extern const uint8_t kNumTransformTypesInSet[kNumTransformSets];
+
+extern const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4];
+
+extern const int8_t kSgrProjMultiplierMin[2];
+
+extern const int8_t kSgrProjMultiplierMax[2];
+
+extern const int8_t kWienerTapsMin[3];
+
+extern const int8_t kWienerTapsMax[3];
+
+extern const uint8_t kUpscaleFilterUnsigned[kSuperResFilterShifts]
+                                           [kSuperResFilterTaps];
+
+// An int8_t version of the kWarpedFilters array.
+// Note: The array could be removed with a performance penalty.
+extern const int8_t kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8];
+
+extern const int16_t kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8];
+
+extern const int8_t kHalfSubPixelFilters[6][16][8];
+
+extern const uint8_t kAbsHalfSubPixelFilters[6][16][8];
+
+extern const int16_t kDirectionalIntraPredictorDerivative[44];
+
+extern const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes];
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_CONSTANTS_H_
diff --git a/src/utils/cpu.cc b/src/utils/cpu.cc
new file mode 100644 (file)
index 0000000..b3c51da
--- /dev/null
@@ -0,0 +1,84 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/cpu.h"
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#include <cpuid.h>
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#include <immintrin.h>  // _xgetbv
+#include <intrin.h>
+#endif
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+    defined(_M_X64)
+namespace {
+
+#if defined(__GNUC__)
+void CpuId(int leaf, uint32_t info[4]) {
+  __cpuid_count(leaf, 0 /*ecx=subleaf*/, info[0], info[1], info[2], info[3]);
+}
+
+uint64_t Xgetbv() {
+  const uint32_t ecx = 0;  // ecx specifies the extended control register
+  uint32_t eax;
+  uint32_t edx;
+  __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx));
+  return (static_cast<uint64_t>(edx) << 32) | eax;
+}
+#else   // _MSC_VER
+void CpuId(int leaf, uint32_t info[4]) {
+  __cpuidex(reinterpret_cast<int*>(info), leaf, 0 /*ecx=subleaf*/);
+}
+
+uint64_t Xgetbv() { return _xgetbv(0); }
+#endif  // __GNUC__
+
+}  // namespace
+
+uint32_t GetCpuInfo() {
+  uint32_t info[4];
+
+  // Get the highest feature value cpuid supports
+  CpuId(0, info);
+  const int max_cpuid_value = info[0];
+  if (max_cpuid_value < 1) return 0;
+
+  CpuId(1, info);
+  uint32_t features = 0;
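+  // CPUID leaf 1 feature bits: EDX bit 26 is SSE2, ECX bit 9 is SSSE3, and
+  // ECX bit 19 is SSE4.1. info[] holds {eax, ebx, ecx, edx}.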
+  if ((info[3] & (1 << 26)) != 0) features |= kSSE2;
+  if ((info[2] & (1 << 9)) != 0) features |= kSSSE3;
+  if ((info[2] & (1 << 19)) != 0) features |= kSSE4_1;
+
+  // Bits 27 (OSXSAVE) & 28 (256-bit AVX)
+  if ((info[2] & (3 << 27)) == (3 << 27)) {
+    // XMM state and YMM state enabled by the OS
+    if ((Xgetbv() & 0x6) == 0x6) {
+      features |= kAVX;
+      if (max_cpuid_value >= 7) {
+        CpuId(7, info);
+        if ((info[1] & (1 << 5)) != 0) features |= kAVX2;
+      }
+    }
+  }
+
+  return features;
+}
+#else
+uint32_t GetCpuInfo() { return 0; }
+#endif  // x86 || x86_64
+
+}  // namespace libgav1
diff --git a/src/utils/cpu.h b/src/utils/cpu.h
new file mode 100644 (file)
index 0000000..aefc2df
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CPU_H_
+#define LIBGAV1_SRC_UTILS_CPU_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__)
+#define LIBGAV1_X86
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#define LIBGAV1_X86
+#define LIBGAV1_X86_MSVC
+#endif
+
+#if defined(LIBGAV1_X86)
+
+#if !defined(LIBGAV1_ENABLE_SSE4_1)
+#define LIBGAV1_ENABLE_SSE4_1 1
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+#if !defined(LIBGAV1_ENABLE_AVX2)
+#define LIBGAV1_ENABLE_AVX2 1
+#endif  // !defined(LIBGAV1_ENABLE_AVX2)
+#else   // !LIBGAV1_ENABLE_SSE4_1
+// Disable AVX2 when SSE4.1 is disabled as it may rely on shared components.
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#endif  // LIBGAV1_ENABLE_SSE4_1
+
+#else  // !LIBGAV1_X86
+
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#undef LIBGAV1_ENABLE_SSE4_1
+#define LIBGAV1_ENABLE_SSE4_1 0
+
+#endif  // LIBGAV1_X86
+
+// For x86 LIBGAV1_TARGETING_* indicate the source being built is targeting
+// (at least) that instruction set. This prevents disabling other instruction
+// sets if the current instruction set isn't a global target, e.g., building
+// *_avx2.cc w/-mavx2, but the remaining files without the flag.
+#if LIBGAV1_ENABLE_AVX2 && defined(__AVX2__)
+#define LIBGAV1_TARGETING_AVX2 1
+#else
+#define LIBGAV1_TARGETING_AVX2 0
+#endif
+
+// Note: LIBGAV1_X86_MSVC isn't completely correct for Visual Studio, but there
+// is no equivalent to __SSE4_1__. LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS will be
+// enabled in dsp.h to compensate for this.
+#if LIBGAV1_ENABLE_SSE4_1 && (defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC))
+#define LIBGAV1_TARGETING_SSE4_1 1
+#else
+#define LIBGAV1_TARGETING_SSE4_1 0
+#endif
+
+#undef LIBGAV1_X86
+
+#if !defined(LIBGAV1_ENABLE_NEON)
+// TODO(jzern): add support for _M_ARM64.
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+    (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENABLE_NEON 0
+#endif
+#endif  // !defined(LIBGAV1_ENABLE_NEON)
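+
+// Usage sketch (illustrative): DSP translation units typically gate their
+// SIMD paths on these macros, e.g.
+//   #if LIBGAV1_ENABLE_NEON
+//   #include <arm_neon.h>
+//   // ... NEON implementation ...
+//   #endif  // LIBGAV1_ENABLE_NEON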
+
+enum CpuFeatures : uint8_t {
+  kSSE2 = 1 << 0,
+#define LIBGAV1_CPU_SSE2 (1 << 0)
+  kSSSE3 = 1 << 1,
+#define LIBGAV1_CPU_SSSE3 (1 << 1)
+  kSSE4_1 = 1 << 2,
+#define LIBGAV1_CPU_SSE4_1 (1 << 2)
+  kAVX = 1 << 3,
+#define LIBGAV1_CPU_AVX (1 << 3)
+  kAVX2 = 1 << 4,
+#define LIBGAV1_CPU_AVX2 (1 << 4)
+  kNEON = 1 << 5,
+#define LIBGAV1_CPU_NEON (1 << 5)
+};
+
+// Returns a bit-wise OR of CpuFeatures supported by this platform.
+uint32_t GetCpuInfo();
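+
+// Usage sketch (illustrative): callers test individual feature bits, e.g.
+//   const uint32_t features = GetCpuInfo();
+//   const bool has_sse4_1 = (features & kSSE4_1) != 0;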
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_CPU_H_
diff --git a/src/utils/cpu_test.cc b/src/utils/cpu_test.cc
new file mode 100644 (file)
index 0000000..3a01b33
--- /dev/null
@@ -0,0 +1,248 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/cpu.h"
+
+#if defined(__linux__)
+#include <unistd.h>
+
+#include <cerrno>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#endif  // defined(__linux__)
+
+#include "gtest/gtest.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+#if defined(__linux__)
+
+// Sample code for getting the number of performance CPU cores. The following
+// sources were consulted:
+// * https://www.kernel.org/doc/html/latest/admin-guide/cputopology.html
+// * cpu-hotplug.txt: CPU hotplug Support in Linux(tm) Kernel
+//   https://lwn.net/Articles/537570/
+// * https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu
+// * Android bionic source code of get_nprocs():
+//   libc/bionic/sysinfo.cpp
+// * glibc 2.30 source code of get_nprocs():
+//   sysdeps/unix/sysv/linux/getsysstats.c
+//
+// Tested on:
+// * Asus Nexus 7 2013: Qualcomm Snapdragon 600, 32-bit Android 6.0.1
+//   (Marshmallow). Brings cores online and offline dynamically. (The tablet
+//   has 4 cores. "0", "0-1", "0-2", and "0-3" have all been observed in the
+//   /sys/devices/system/cpu/online file.) This causes the number of cores
+//   currently online to potentially be lower than the number of cores that can
+//   be brought online quickly.
+// * General Mobile 4G: Qualcomm Snapdragon 410, 32-bit Android 7.1.1 (Nougat).
+// * Motorola Moto G5 Plus: Qualcomm Snapdragon 625, 32-bit Android 8.1.0
+//   (Oreo).
+// * Motorola Moto G7 Play: Qualcomm Snapdragon 632, 32-bit Android 9 (Pie).
+//   All 8 cores have the same cpuinfo_max_freq (1804800), but there are two
+//   values of cpuinfo_min_freq: cores 0-3 have 614400 and cores 4-7 have
+//   633600. We would need to check cpuinfo_min_freq to differentiate the two
+//   kinds of cores (Qualcomm Kryo 250 Gold and Qualcomm Kryo 250 Silver).
+// * Pixel 2 XL: Qualcomm Snapdragon 835, 64-bit Android 9 (Pie).
+// * Pixel 3: Qualcomm Snapdragon 845, 64-bit Android 9 (Pie).
+// * Pixel 3a: Qualcomm Snapdragon 670, 64-bit Android 9 (Pie).
+// * Samsung Galaxy S6: Samsung Exynos 7 Octa (7420), 64-bit Android 7.0
+//   (Nougat).
+// * Samsung Galaxy S8+ (SM-G955FD): Samsung Exynos 8895, 64-bit Android 8.0.0.
+//
+// Note: The sample code needs to use the 'long' type because it is the return
+// type of the Standard C Library function strtol(). The ClangTidy warnings are
+// suppressed with NOLINT(google-runtime-int) comments.
+
+// Returns the number of online processor cores.
+int GetNumberOfProcessorsOnline() {
+  // See https://developer.android.com/ndk/guides/cpu-features.
+  long num_cpus = sysconf(_SC_NPROCESSORS_ONLN);  // NOLINT(google-runtime-int)
+  if (num_cpus < 0) {
+    LIBGAV1_DLOG(ERROR, "sysconf(_SC_NPROCESSORS_ONLN) failed: %s.",
+                 strerror(errno));
+    return 0;
+  }
+  // It is safe to cast num_cpus to int. sysconf(_SC_NPROCESSORS_ONLN) returns
+  // the return value of get_nprocs(), which is an int.
+  return static_cast<int>(num_cpus);
+}
+
+// These CPUs support heterogeneous multiprocessing.
+#if defined(__arm__) || defined(__aarch64__)
+
+// A helper function used by GetNumberOfPerformanceCoresOnline().
+//
+// Returns the cpuinfo_max_freq value (in kHz) of the given CPU. Returns 0 on
+// failure.
+long GetCpuinfoMaxFreq(int cpu_index) {  // NOLINT(google-runtime-int)
+  char buffer[128];
+  const int rv = snprintf(
+      buffer, sizeof(buffer),
+      "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpu_index);
+  if (rv < 0 || static_cast<size_t>(rv) >= sizeof(buffer)) {
+    LIBGAV1_DLOG(ERROR, "snprintf failed, or |buffer| is too small.");
+    return 0;
+  }
+  FILE* file = fopen(buffer, "r");
+  if (file == nullptr) {
+    LIBGAV1_DLOG(ERROR, "fopen(\"%s\", \"r\") failed: %s.", buffer,
+                 strerror(errno));
+    return 0;
+  }
+  char* const str = fgets(buffer, sizeof(buffer), file);
+  fclose(file);
+  if (str == nullptr) {
+    LIBGAV1_DLOG(ERROR, "fgets failed.");
+    return 0;
+  }
+  const long freq = strtol(str, nullptr, 10);  // NOLINT(google-runtime-int)
+  if (freq <= 0 || freq == LONG_MAX) {
+    LIBGAV1_DLOG(ERROR,
+                 "No conversion can be performed, or the converted value is "
+                 "invalid: %ld.",
+                 freq);
+    return 0;
+  }
+  return freq;
+}
+
+// Returns the number of performance CPU cores that are online. The number of
+// efficiency CPU cores is subtracted from the total number of CPU cores. Uses
+// cpuinfo_max_freq to determine whether a CPU is a performance core or an
+// efficiency core.
+//
+// This function is not perfect. For example, the Snapdragon 632 SoC used in
+// Motorola Moto G7 has performance and efficiency cores with the same
+// cpuinfo_max_freq but different cpuinfo_min_freq. This function fails to
+// differentiate the two kinds of cores and reports all the cores as
+// performance cores.
+int GetNumberOfPerformanceCoresOnline() {
+  // Get the online CPU list. Some examples of the online CPU list are:
+  //   "0-7"
+  //   "0"
+  //   "0-1,2,3,4-7"
+  char online[512];
+  FILE* file = fopen("/sys/devices/system/cpu/online", "r");
+  if (file == nullptr) {
+    LIBGAV1_DLOG(ERROR,
+                 "fopen(\"/sys/devices/system/cpu/online\", \"r\") failed: %s.",
+                 strerror(errno));
+    return 0;
+  }
+  char* const str = fgets(online, sizeof(online), file);
+  fclose(file);
+  file = nullptr;
+  if (str == nullptr) {
+    LIBGAV1_DLOG(ERROR, "fgets failed.");
+    return 0;
+  }
+  LIBGAV1_DLOG(INFO, "The online CPU list is %s", online);
+
+  // Count the number of the slowest CPUs. Some SoCs such as Snapdragon 855
+  // have performance cores with different max frequencies, so only the slowest
+  // CPUs are efficiency cores. If we count the number of the fastest CPUs, we
+  // will fail to count the second fastest performance cores.
+  long slowest_cpu_freq = LONG_MAX;  // NOLINT(google-runtime-int)
+  int num_slowest_cpus = 0;
+  int num_cpus = 0;
+  const char* cp = online;
+  int range_begin = -1;
+  while (true) {
+    char* str_end;
+    const int cpu = static_cast<int>(strtol(cp, &str_end, 10));
+    if (str_end == cp) {
+      break;
+    }
+    cp = str_end;
+    if (*cp == '-') {
+      range_begin = cpu;
+    } else {
+      if (range_begin == -1) {
+        range_begin = cpu;
+      }
+
+      num_cpus += cpu - range_begin + 1;
+      for (int i = range_begin; i <= cpu; ++i) {
+        const long freq = GetCpuinfoMaxFreq(i);  // NOLINT(google-runtime-int)
+        if (freq <= 0) {
+          return 0;
+        }
+        LIBGAV1_DLOG(INFO, "cpu%d max frequency is %ld kHz.", i, freq);
+        if (freq < slowest_cpu_freq) {
+          slowest_cpu_freq = freq;
+          num_slowest_cpus = 0;
+        }
+        if (freq == slowest_cpu_freq) {
+          ++num_slowest_cpus;
+        }
+      }
+
+      range_begin = -1;
+    }
+    if (*cp == '\0') {
+      break;
+    }
+    ++cp;
+  }
+
+  LIBGAV1_DLOG(INFO, "There are %d CPU cores.", num_cpus);
+  LIBGAV1_DLOG(INFO,
+               "%d CPU cores are the slowest, with max frequency %ld kHz.",
+               num_slowest_cpus, slowest_cpu_freq);
+  // If there are faster CPU cores than the slowest CPU cores, exclude the
+  // slowest CPU cores.
+  if (num_slowest_cpus < num_cpus) {
+    num_cpus -= num_slowest_cpus;
+  }
+  return num_cpus;
+}
+
+#else
+
+// Assume symmetric multiprocessing.
+int GetNumberOfPerformanceCoresOnline() {
+  return GetNumberOfProcessorsOnline();
+}
+
+#endif
+
+#endif  // defined(__linux__)
+
+/*
+  Run this test with logging enabled on an Android device:
+  64-bit Android:
+    tests/run_android_test.sh --test cpu --enable_asserts
+  32-bit Android:
+    tests/run_android_test.sh --test cpu --arch arm \
+        --enable_asserts
+*/
+TEST(CpuTest, GetNumberOfPerformanceCoresOnline) {
+#if defined(__linux__)
+  const int num_cpus = GetNumberOfProcessorsOnline();
+  ASSERT_NE(num_cpus, 0);
+  LIBGAV1_DLOG(INFO, "There are %d cores online.", num_cpus);
+  const int num_performance_cpus = GetNumberOfPerformanceCoresOnline();
+  ASSERT_NE(num_performance_cpus, 0);
+  LIBGAV1_DLOG(INFO, "There are %d performance cores online.",
+               num_performance_cpus);
+#endif  // defined(__linux__)
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/dynamic_buffer.h b/src/utils/dynamic_buffer.h
new file mode 100644 (file)
index 0000000..0694980
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+#define LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+
+#include <cstddef>
+#include <memory>
+#include <new>
+
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+template <typename T>
+class DynamicBuffer {
+ public:
+  T* get() { return buffer_.get(); }
+  const T* get() const { return buffer_.get(); }
+
+  // Resizes the buffer so that it can hold at least |size| elements. Existing
+  // contents will be destroyed when resizing to a larger size.
+  //
+  // Returns true on success. If Resize() returns false, then subsequent calls
+  // to get() will return nullptr.
+  bool Resize(size_t size) {
+    if (size <= size_) return true;
+    buffer_.reset(new (std::nothrow) T[size]);
+    if (buffer_ == nullptr) {
+      size_ = 0;
+      return false;
+    }
+    size_ = size;
+    return true;
+  }
+
+  size_t size() const { return size_; }
+
+ private:
+  std::unique_ptr<T[]> buffer_;
+  size_t size_ = 0;
+};
+
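+// Usage sketch (illustrative, not part of the upstream header):
+//   DynamicBuffer<int16_t> residuals;
+//   if (!residuals.Resize(64)) return;  // On failure get() returns nullptr.
+//   int16_t* data = residuals.get();
+// AlignedDynamicBuffer<uint8_t, 16> provides the same interface with the
+// storage aligned to the given byte boundary.
+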
+template <typename T, int alignment>
+class AlignedDynamicBuffer {
+ public:
+  T* get() { return buffer_.get(); }
+
+  // Resizes the buffer so that it can hold at least |size| elements. Existing
+  // contents will be destroyed when resizing to a larger size.
+  //
+  // Returns true on success. If Resize() returns false, then subsequent calls
+  // to get() will return nullptr.
+  bool Resize(size_t size) {
+    if (size <= size_) return true;
+    buffer_ = MakeAlignedUniquePtr<T>(alignment, size);
+    if (buffer_ == nullptr) {
+      size_ = 0;
+      return false;
+    }
+    size_ = size;
+    return true;
+  }
+
+ private:
+  AlignedUniquePtr<T> buffer_;
+  size_t size_ = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
diff --git a/src/utils/entropy_decoder.cc b/src/utils/entropy_decoder.cc
new file mode 100644 (file)
index 0000000..3d97e69
--- /dev/null
@@ -0,0 +1,1120 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/entropy_decoder.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+    (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#include <emmintrin.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr uint32_t kReadBitMask = ~255;
+constexpr int kCdfPrecision = 6;
+constexpr int kMinimumProbabilityPerSymbol = 4;
+
+// This function computes the "cur" variable as specified inside the do-while
+// loop in Section 8.2.6 of the spec. This function is monotonically
+// decreasing as the value of |index| increases (note that the |cdf| array is
+// sorted in decreasing order).
+uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
+                  int index, int symbol_count) {
+  return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
+         (kMinimumProbabilityPerSymbol * (symbol_count - index));
+}
+
+void UpdateCdf(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol_count,
+               const int symbol) {
+  const uint16_t count = cdf[symbol_count];
+  // rate is computed in the spec as:
+  //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+  // In this case cdf[N] is |count|.
+  // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
+  // symbol_count > 3. So the equation becomes:
+  //  4 + (count > 15) + (count > 31) + (symbol_count > 3).
+  // Note that the largest value for count is 32 (it is not incremented beyond
+  // 32). So using that information:
+  //  count >> 4 is 0 for count from 0 to 15.
+  //  count >> 4 is 1 for count from 16 to 31.
+  //  count >> 4 is 2 for count == 32.
+  // Now, the equation becomes:
+  //  4 + (count >> 4) + (symbol_count > 3).
+  // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
+  // with bitwise or:
+  //  (4 | (count >> 4)) + (symbol_count > 3).
+  // but using addition will allow the compiler to eliminate an operation when
+  // symbol_count is known and this function is inlined.
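+  // For example, count == 20 and symbol_count == 8 give
+  // rate = (20 >> 4) + 4 + 1 = 6.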
+  const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
+  // Hints for further optimizations:
+  //
+  // 1. clang can vectorize this for loop with width 4, even though the loop
+  // contains an if-else statement. Therefore, it may be advantageous to use
+  // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
+  // (a multiple of 4 that's not too small).
+  //
+  // 2. The for loop can be rewritten in the following form, which would enable
+  // clang to vectorize the loop with width 8:
+  //
+  //   const int rounding = (1 << rate) - 1;
+  //   for (int i = 0; i < symbol_count - 1; ++i) {
+  //     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+  //     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+  //   }
+  //
+  // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
+  // integer arithmetic. The result of the unsigned subtraction is cast to a
+  // signed integer and right-shifted. This requires the right shift of a
+  // signed integer be an arithmetic shift, which is true for clang, gcc, and
+  // Visual C++.
+  assert(symbol_count - 1 > 0);
+  int i = 0;
+  do {
+    if (i < symbol) {
+      cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+    } else {
+      cdf[i] -= cdf[i] >> rate;
+    }
+  } while (++i < symbol_count - 1);
+  cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+}
+
+// Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
+// of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
+// SIMD instruction sets if available.
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+// The UpdateCdf() method contains the following for loop:
+//
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     if (i < symbol) {
+//       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+//     } else {
+//       cdf[i] -= cdf[i] >> rate;
+//     }
+//   }
+//
+// It can be rewritten in the following two forms, which are amenable to SIMD
+// implementations:
+//
+//   const int rounding = (1 << rate) - 1;
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+//     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+//   }
+//
+// or:
+//
+//   const int rounding = (1 << rate) - 1;
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
+//     cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
+//   }
+//
+// The following ARM NEON implementations use a modified version of the first
+// form, using the comparison mask and unsigned rollover to avoid the need to
+// calculate rounding.
+//
+// The cdf array has symbol_count + 1 elements. The first symbol_count elements
+// are the CDF. The last element is a count that is initialized to 0 and may
+// grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
+// cdf[symbol_count - 1] is always 0, the for loop does not update
+// cdf[symbol_count - 1]. However, it would be correct to have the for loop
+// update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
+// for loop would take the else branch when i is symbol_count - 1:
+//      cdf[i] -= cdf[i] >> rate;
+// Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
+// after the update. The ARM NEON implementations take advantage of this in the
+// following two cases:
+// 1. When symbol_count is 8 or 16, the vectorized code updates the first
+//    symbol_count elements in the array.
+// 2. When symbol_count is 7, the vectorized code updates all the 8 elements in
+//    the cdf array. Since an invalid CDF value is written into cdf[7], the
+//    count in cdf[7] needs to be fixed up after the vectorized code.
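+//
+// A scalar sketch of the per-lane arithmetic used below (illustrative, not
+// part of the upstream sources):
+//
+//   const uint16_t mask = (i >= symbol) ? 0xffff : 0;  // vcge result.
+//   const uint16_t a = mask | kCdfMaxProbability;      // 65535 or 32768.
+//   const int16_t diff = static_cast<int16_t>(a - cdf[i]);
+//   // (cdf[i] - mask) is cdf[i] or, modulo 2^16, cdf[i] + 1; the arithmetic
+//   // shift of the signed difference then reproduces both branches of the
+//   // scalar if-else in UpdateCdf().
+//   cdf[i] = static_cast<uint16_t>((cdf[i] - mask) + (diff >> rate));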
+
+void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  uint16x4_t cdf_vec = vld1_u16(cdf);
+  const uint16_t count = cdf[5];
+  const int rate = (count >> 4) + 5;
+  const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+  const uint16x4_t index = vcreate_u16(0x0003000200010000);
+  const uint16x4_t symbol_vec = vdup_n_u16(symbol);
+  const uint16x4_t mask = vcge_u16(index, symbol_vec);
+  // i < symbol: 32768, i >= symbol: 65535.
+  const uint16x4_t a = vorr_u16(mask, cdf_max_probability);
+  // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+  const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec));
+  // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+  const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask);
+  const int16x4_t negative_rate = vdup_n_s16(-rate);
+  // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+  const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+  // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+  // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+  cdf_vec = vadd_u16(cdf_offset, delta);
+  vst1_u16(cdf, cdf_vec);
+  cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+  uint16x8_t cdf_vec = vld1q_u16(cdf);
+  const uint16_t count = cdf[symbol_count];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                        vcreate_u16(0x0007000600050004));
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+  const uint16x8_t delta =
+      vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec);
+  cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
+  const uint16_t count = cdf[11];
+  cdf[11] = count + static_cast<uint16_t>(count < 32);
+  const int rate = (count >> 4) + 5;
+  if (symbol > 1) {
+    cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+    cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+    const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+    const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+    const int16x8_t negative_rate = vdupq_n_s16(-rate);
+    const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
+                                          vcreate_u16(0x0009000800070006));
+    const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+    const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+    const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+    const uint16x8_t delta =
+        vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+    cdf_vec = vaddq_u16(cdf_offset, delta);
+    vst1q_u16(cdf + 2, cdf_vec);
+  } else {
+    if (symbol != 0) {
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    } else {
+      cdf[0] -= cdf[0] >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    }
+    const int16x8_t negative_rate = vdupq_n_s16(-rate);
+    const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate);
+    cdf_vec = vsubq_u16(cdf_vec, delta);
+    vst1q_u16(cdf + 2, cdf_vec);
+  }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  uint16x8_t cdf_vec0 = vld1q_u16(cdf);
+  uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
+  const uint16_t count = cdf[13];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+  uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                  vcreate_u16(0x0007000600050004));
+  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0));
+  uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask);
+  uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec0 = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec0);
+
+  index = vcombine_u16(vcreate_u16(0x0007000600050004),
+                       vcreate_u16(0x000b000a00090008));
+  mask = vcgeq_u16(index, symbol_vec);
+  a = vorrq_u16(mask, cdf_max_probability);
+  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1));
+  cdf_offset = vsubq_u16(cdf_vec1, mask);
+  delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec1 = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf + 4, cdf_vec1);
+
+  cdf[13] = count + static_cast<uint16_t>(count < 32);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  uint16x8_t cdf_vec = vld1q_u16(cdf);
+  const uint16_t count = cdf[16];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+  uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                  vcreate_u16(0x0007000600050004));
+  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+  uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec);
+
+  cdf_vec = vld1q_u16(cdf + 8);
+  index = vcombine_u16(vcreate_u16(0x000b000a00090008),
+                       vcreate_u16(0x000f000e000d000c));
+  mask = vcgeq_u16(index, symbol_vec);
+  a = vorrq_u16(mask, cdf_max_probability);
+  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  cdf_offset = vsubq_u16(cdf_vec, mask);
+  delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf + 8, cdf_vec);
+
+  cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+inline __m128i LoadLo8(const void* a) {
+  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+  return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+  _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+  _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  __m128i cdf_vec = LoadLo8(cdf);
+  const uint16_t count = cdf[5];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+  const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+  const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
+  // i >= symbol.
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  // i < symbol: 32768, i >= symbol: 65535.
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+  // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+  // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+  // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+  cdf_vec = _mm_add_epi16(cdf_offset, delta);
+  StoreLo8(cdf, cdf_vec);
+  cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+  __m128i cdf_vec = LoadUnaligned16(cdf);
+  const uint16_t count = cdf[symbol_count];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+  const __m128i index =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+  const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  cdf_vec = _mm_add_epi16(cdf_offset, delta);
+  StoreUnaligned16(cdf, cdf_vec);
+  cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  __m128i cdf_vec = LoadUnaligned16(cdf + 2);
+  const uint16_t count = cdf[11];
+  cdf[11] = count + static_cast<uint16_t>(count < 32);
+  const int rate = (count >> 4) + 5;
+  if (symbol > 1) {
+    cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+    cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+    const __m128i cdf_max_probability =
+        _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+    const __m128i index =
+        _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003);
+    const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+    const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+    const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+    const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+    const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+    const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+    cdf_vec = _mm_add_epi16(cdf_offset, delta);
+    StoreUnaligned16(cdf + 2, cdf_vec);
+  } else {
+    if (symbol != 0) {
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    } else {
+      cdf[0] -= cdf[0] >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    }
+    const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+    cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+    StoreUnaligned16(cdf + 2, cdf_vec);
+  }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  __m128i cdf_vec0 = LoadLo8(cdf);
+  __m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
+  const uint16_t count = cdf[13];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+  const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+
+  const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+  StoreLo8(cdf, cdf_vec0);
+
+  const __m128i index1 =
+      _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005);
+  const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+  const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+  const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+  const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
+  const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
+  cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
+  StoreUnaligned16(cdf + 4, cdf_vec1);
+
+  cdf[13] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  __m128i cdf_vec0 = LoadUnaligned16(cdf);
+  const uint16_t count = cdf[16];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+  const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+
+  const __m128i index =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+  StoreUnaligned16(cdf, cdf_vec0);
+
+  __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
+  const __m128i index1 =
+      _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009);
+  const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+  const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+  const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+  const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
+  const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
+  cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
+  StoreUnaligned16(cdf + 8, cdf_vec1);
+
+  cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 5, symbol);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 7, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 8, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 9, symbol);
+}
+
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 11, symbol);
+}
+
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 13, symbol);
+}
+
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 16, symbol);
+}
+
+#endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+inline EntropyDecoder::WindowSize HostToBigEndian(
+    const EntropyDecoder::WindowSize x) {
+  static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
+#if defined(__GNUC__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
+#else
+  return x;
+#endif
+#elif defined(_WIN32)
+  // Note Windows targets are assumed to be little endian.
+  return static_cast<EntropyDecoder::WindowSize>(
+      (sizeof(x) == 8) ? _byteswap_uint64(static_cast<unsigned __int64>(x))
+                       : _byteswap_ulong(static_cast<unsigned long>(x)));
+#else
+#error Unknown compiler!
+#endif  // defined(__GNUC__)
+}
+
+}  // namespace
+
+#if !LIBGAV1_CXX17
+constexpr int EntropyDecoder::kWindowSize;  // static.
+#endif
+
+EntropyDecoder::EntropyDecoder(const uint8_t* data, size_t size,
+                               bool allow_update_cdf)
+    : data_(data),
+      data_end_(data + size),
+      data_memcpy_end_((size >= sizeof(WindowSize))
+                           ? data + size - sizeof(WindowSize) + 1
+                           : data),
+      allow_update_cdf_(allow_update_cdf),
+      values_in_range_(kCdfMaxProbability) {
+  if (data_ < data_memcpy_end_) {
+    // This is a simplified version of PopulateBits() which loads 8 extra bits
+    // and skips the unnecessary shifts of value and window_diff_.
+    WindowSize value;
+    memcpy(&value, data_, sizeof(value));
+    data_ += sizeof(value);
+    window_diff_ = HostToBigEndian(value) ^ -1;
+    // Note the initial value of bits_ is larger than kMaxCachedBits as it's
+    // used to restore the most significant 0 bit that would be present after
+    // PopulateBits() when we extract the first symbol value.
+    // As shown in Section 8.2.2 Initialization process for symbol decoder,
+    // which uses a fixed offset to read the symbol values, the most
+    // significant bit is always 0:
+    //   The variable numBits is set equal to Min( sz * 8, 15).
+    //   The variable buf is read using the f(numBits) parsing process.
+    //   The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
+    //   The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
+    bits_ = kWindowSize - 15;
+    return;
+  }
+  window_diff_ = 0;
+  bits_ = -15;
+  PopulateBits();
+}
+
+// This is similar to the ReadSymbol() implementation but it is optimized based
+// on the following facts:
+//   * The probability is fixed at half. So some multiplications can be replaced
+//     with bit operations.
+//   * Symbol count is fixed at 2.
+int EntropyDecoder::ReadBit() {
+  const uint32_t curr =
+      ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  int bit = 1;
+  if (symbol_value >= curr) {
+    values_in_range_ -= curr;
+    window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+    bit = 0;
+  } else {
+    values_in_range_ = curr;
+  }
+  NormalizeRange();
+  return bit;
+}
+
+int64_t EntropyDecoder::ReadLiteral(int num_bits) {
+  assert(num_bits <= 32);
+  assert(num_bits > 0);
+  uint32_t literal = 0;
+  int bit = num_bits - 1;
+  do {
+    // ARM can fold a shift by a constant number of bits into other
+    // operations, such as the OR operation.
+    // Here is an ARM disassembly example:
+    // orr w1, w0, w1, lsl #1
+    // which left shifts register w1 by 1 bit and OR the shift result with
+    // register w0.
+    // The next 2 lines are equivalent to:
+    // literal |= static_cast<uint32_t>(ReadBit()) << bit;
+    literal <<= 1;
+    literal |= static_cast<uint32_t>(ReadBit());
+  } while (--bit >= 0);
+  return literal;
+}
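+
+// Usage sketch (illustrative): reading a 4-bit field, assuming |reader| is an
+// EntropyDecoder instance:
+//   const auto field = static_cast<int>(reader.ReadLiteral(4));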
+
+int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf,
+                               int symbol_count) {
+  const int symbol = ReadSymbolImpl(cdf, symbol_count);
+  if (allow_update_cdf_) {
+    UpdateCdf(cdf, symbol_count, symbol);
+  }
+  return symbol;
+}
+
+bool EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT cdf) {
+  assert(cdf[1] == 0);
+  const bool symbol = ReadSymbolImpl(cdf[0]) != 0;
+  if (allow_update_cdf_) {
+    const uint16_t count = cdf[2];
+    // rate is computed in the spec as:
+    //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+    // In this case N is 2 and cdf[N] is |count|. So the equation becomes:
+    //  4 + (count > 15) + (count > 31)
+    // Note that the largest value for count is 32 (it is not incremented beyond
+    // 32). So using that information:
+    //  count >> 4 is 0 for count from 0 to 15.
+    //  count >> 4 is 1 for count from 16 to 31.
+    //  count >> 4 is 2 for count == 32.
+    // Now, the equation becomes:
+    //  4 + (count >> 4).
+    // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
+    // with bitwise or. So the final equation is:
+    //  4 | (count >> 4).
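+    // For example, count == 16 gives rate = 4 | (16 >> 4) = 5.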
+    const int rate = 4 | (count >> 4);
+    if (symbol) {
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+    } else {
+      cdf[0] -= cdf[0] >> rate;
+    }
+    cdf[2] += static_cast<uint16_t>(count < 32);
+  }
+  return symbol;
+}
+
+bool EntropyDecoder::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
+  return ReadSymbolImpl(cdf) != 0;
+}
+
+template <int symbol_count>
+int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf) {
+  static_assert(symbol_count >= 3 && symbol_count <= 16, "");
+  if (symbol_count == 3 || symbol_count == 4) {
+    return ReadSymbol3Or4(cdf, symbol_count);
+  }
+  int symbol;
+  if (symbol_count == 8) {
+    symbol = ReadSymbolImpl8(cdf);
+  } else if (symbol_count <= 13) {
+    symbol = ReadSymbolImpl(cdf, symbol_count);
+  } else {
+    symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
+  }
+  if (allow_update_cdf_) {
+    if (symbol_count == 5) {
+      UpdateCdf5(cdf, symbol);
+    } else if (symbol_count == 7) {
+      UpdateCdf7(cdf, symbol);
+    } else if (symbol_count == 8) {
+      UpdateCdf8(cdf, symbol);
+    } else if (symbol_count == 9) {
+      UpdateCdf9(cdf, symbol);
+    } else if (symbol_count == 11) {
+      UpdateCdf11(cdf, symbol);
+    } else if (symbol_count == 13) {
+      UpdateCdf13(cdf, symbol);
+    } else if (symbol_count == 16) {
+      UpdateCdf16(cdf, symbol);
+    } else {
+      UpdateCdf(cdf, symbol_count, symbol);
+    }
+  }
+  return symbol;
+}
+
+int EntropyDecoder::ReadSymbolImpl(const uint16_t* LIBGAV1_RESTRICT const cdf,
+                                   int symbol_count) {
+  assert(cdf[symbol_count - 1] == 0);
+  --symbol_count;
+  uint32_t curr = values_in_range_;
+  int symbol = -1;
+  uint32_t prev;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
+  // Search through the |cdf| array to determine where the scaled cdf value and
+  // |symbol_value| cross over.
+  do {
+    prev = curr;
+    curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) +
+           delta;
+    delta -= kMinimumProbabilityPerSymbol;
+  } while (symbol_value < curr);
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return symbol;
+}
+
+int EntropyDecoder::ReadSymbolImplBinarySearch(
+    const uint16_t* LIBGAV1_RESTRICT const cdf, int symbol_count) {
+  assert(cdf[symbol_count - 1] == 0);
+  assert(symbol_count > 1 && symbol_count <= 16);
+  --symbol_count;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  // Search through the |cdf| array to determine where the scaled cdf value and
+  // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
+  // search to do this. Let |symbol| be the index of the first |cdf| array
+  // entry whose scaled cdf value is less than or equal to |symbol_value|. The
+  // binary search maintains the invariant:
+  //   low <= symbol <= high + 1
+  // and terminates when low == high + 1.
+  int low = 0;
+  int high = symbol_count - 1;
+  // The binary search maintains the invariants that |prev| is the scaled cdf
+  // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By
+  // convention, the scaled cdf value for -1 is values_in_range_.) When the
+  // binary search terminates, |prev| is the scaled cdf value for symbol - 1
+  // and |curr| is the scaled cdf value for |symbol|.
+  uint32_t prev = values_in_range_;
+  uint32_t curr = 0;
+  const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+  do {
+    const int mid = DivideBy2(low + high);
+    const uint32_t scaled_cdf =
+        ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count);
+    if (symbol_value < scaled_cdf) {
+      low = mid + 1;
+      prev = scaled_cdf;
+    } else {
+      high = mid - 1;
+      curr = scaled_cdf;
+    }
+  } while (low <= high);
+  assert(low == high + 1);
+  // At this point, |low| is the symbol that has been decoded.
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return low;
+}
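+
+// A rough cost sketch: with symbol_count == 16 the search space above has 15
+// positions, so the loop performs at most four ScaleCdf probes, versus up to
+// 15 iterations of the linear search in ReadSymbolImpl. This is why the
+// binary search variant is used for the largest alphabets.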
+
+int EntropyDecoder::ReadSymbolImpl(uint16_t cdf) {
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  const uint32_t curr =
+      (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) +
+      kMinimumProbabilityPerSymbol;
+  const int symbol = static_cast<int>(symbol_value < curr);
+  if (symbol == 1) {
+    values_in_range_ = curr;
+  } else {
+    values_in_range_ -= curr;
+    window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  }
+  NormalizeRange();
+  return symbol;
+}
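+
+// In this two-symbol case |curr| is (approximately) the scaled probability
+// of symbol 1, so symbol 1 corresponds to scaled |window_diff_| values in
+// [0, curr) and symbol 0 to [curr, values_in_range_).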
+
+// Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf
+// calls inlined.
+int EntropyDecoder::ReadSymbol3Or4(uint16_t* LIBGAV1_RESTRICT const cdf,
+                                   const int symbol_count) {
+  assert(cdf[symbol_count - 1] == 0);
+  uint32_t curr = values_in_range_;
+  uint32_t prev;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1);
+  const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+
+  // Search through the |cdf| array to determine where the scaled cdf value and
+  // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf|
+  // array.
+  //
+  // The original code is:
+  //
+  //  int symbol = -1;
+  //  do {
+  //    prev = curr;
+  //    curr =
+  //        ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+  //        + delta;
+  //    delta -= kMinimumProbabilityPerSymbol;
+  //  } while (symbol_value < curr);
+  //  if (allow_update_cdf_) {
+  //    UpdateCdf(cdf, [3,4], symbol);
+  //  }
+  //
+  // The do-while loop is unrolled with three or four iterations, and the
+  // UpdateCdf call is inlined and merged into the iterations.
+  int symbol = 0;
+  // Iteration 0.
+  prev = curr;
+  curr =
+      ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+  if (symbol_value >= curr) {
+    // symbol == 0.
+    if (allow_update_cdf_) {
+      // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0).
+      const uint16_t count = cdf[symbol_count];
+      cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+      const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+      if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+        // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
+        // NEON code is slower. Consider using the C version if __arm__ is
+        // defined.
+        // 2. The ARM NEON code (compiled for arm64) is slightly slower on
+        // Samsung Galaxy S8+ (SM-G955FD).
+        uint16x4_t cdf_vec = vld1_u16(cdf);
+        const int16x4_t negative_rate = vdup_n_s16(-rate);
+        const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
+        cdf_vec = vsub_u16(cdf_vec, delta);
+        vst1_u16(cdf, cdf_vec);
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+        __m128i cdf_vec = LoadLo8(cdf);
+        const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+        cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+        StoreLo8(cdf, cdf_vec);
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+        cdf[0] -= cdf[0] >> rate;
+        cdf[1] -= cdf[1] >> rate;
+        cdf[2] -= cdf[2] >> rate;
+#endif
+      } else {  // symbol_count == 3.
+        cdf[0] -= cdf[0] >> rate;
+        cdf[1] -= cdf[1] >> rate;
+      }
+    }
+    goto found;
+  }
+  ++symbol;
+  delta -= kMinimumProbabilityPerSymbol;
+  // Iteration 1.
+  prev = curr;
+  curr =
+      ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+  if (symbol_value >= curr) {
+    // symbol == 1.
+    if (allow_update_cdf_) {
+      // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1).
+      const uint16_t count = cdf[symbol_count];
+      cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+      const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] -= cdf[1] >> rate;
+      if (symbol_count == 4) cdf[2] -= cdf[2] >> rate;
+    }
+    goto found;
+  }
+  ++symbol;
+  if (symbol_count == 4) {
+    delta -= kMinimumProbabilityPerSymbol;
+    // Iteration 2.
+    prev = curr;
+    curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) +
+           delta;
+    if (symbol_value >= curr) {
+      // symbol == 2.
+      if (allow_update_cdf_) {
+        // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
+        const uint16_t count = cdf[4];
+        cdf[4] += static_cast<uint16_t>(count < 32);
+        const int rate = (count >> 4) + 5;
+        cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+        cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+        cdf[2] -= cdf[2] >> rate;
+      }
+      goto found;
+    }
+    ++symbol;
+  }
+  // |delta| is 0 for the last iteration.
+  // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4).
+  prev = curr;
+  // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0.
+  curr = 0;
+  // symbol == [2,3].
+  if (allow_update_cdf_) {
+    // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]).
+    const uint16_t count = cdf[symbol_count];
+    cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+    const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count == 4);
+    if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+      // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
+      // code is a tiny bit slower. Consider using the C version if __arm__ is
+      // defined.
+      uint16x4_t cdf_vec = vld1_u16(cdf);
+      const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+      const int16x4_t diff =
+          vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
+      const int16x4_t negative_rate = vdup_n_s16(-rate);
+      const uint16x4_t delta =
+          vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+      cdf_vec = vadd_u16(cdf_vec, delta);
+      vst1_u16(cdf, cdf_vec);
+      cdf[3] = 0;
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+      __m128i cdf_vec = LoadLo8(cdf);
+      const __m128i cdf_max_probability =
+          _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+      const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
+      const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+      cdf_vec = _mm_add_epi16(cdf_vec, delta);
+      StoreLo8(cdf, cdf_vec);
+      cdf[3] = 0;
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+      cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
+#endif
+    } else {  // symbol_count == 3.
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+    }
+  }
+found:
+  // End of unrolled do-while loop.
+
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return symbol;
+}
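+
+// A quick sketch of the adaptation rate used by the inlined updates above:
+// the counter cdf[symbol_count] saturates at 32, so for symbol_count == 4
+// the rate is 5 while the count is below 16, 6 for counts 16-31, and 7 once
+// the counter saturates. Each update moves cdf[i] towards its target by
+// (difference >> rate): adaptation is fastest for the first few symbols and
+// slows as the counter grows.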
+
+int EntropyDecoder::ReadSymbolImpl8(
+    const uint16_t* LIBGAV1_RESTRICT const cdf) {
+  assert(cdf[7] == 0);
+  uint32_t curr = values_in_range_;
+  uint32_t prev;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  uint32_t delta = kMinimumProbabilityPerSymbol * 7;
+  // Search through the |cdf| array to determine where the scaled cdf value and
+  // |symbol_value| cross over.
+  //
+  // The original code is:
+  //
+  // int symbol = -1;
+  // do {
+  //   prev = curr;
+  //   curr =
+  //       (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+  //       + delta;
+  //   delta -= kMinimumProbabilityPerSymbol;
+  // } while (symbol_value < curr);
+  //
+  // The do-while loop is unrolled with eight iterations.
+  int symbol = 0;
+
+#define READ_SYMBOL_ITERATION                                                \
+  prev = curr;                                                               \
+  curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \
+         delta;                                                              \
+  if (symbol_value >= curr) goto found;                                      \
+  ++symbol;                                                                  \
+  delta -= kMinimumProbabilityPerSymbol
+
+  READ_SYMBOL_ITERATION;  // Iteration 0.
+  READ_SYMBOL_ITERATION;  // Iteration 1.
+  READ_SYMBOL_ITERATION;  // Iteration 2.
+  READ_SYMBOL_ITERATION;  // Iteration 3.
+  READ_SYMBOL_ITERATION;  // Iteration 4.
+  READ_SYMBOL_ITERATION;  // Iteration 5.
+
+  // The last two iterations can be simplified, so they don't use the
+  // READ_SYMBOL_ITERATION macro.
+#undef READ_SYMBOL_ITERATION
+
+  // Iteration 6.
+  prev = curr;
+  curr =
+      (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+  if (symbol_value >= curr) goto found;  // symbol == 6.
+  ++symbol;
+  // |delta| is 0 for the last iteration.
+  // Iteration 7.
+  prev = curr;
+  // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0.
+  curr = 0;
+  // symbol == 7.
+found:
+  // End of unrolled do-while loop.
+
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return symbol;
+}
+
+void EntropyDecoder::PopulateBits() {
+  constexpr int kMaxCachedBits = kWindowSize - 16;
+#if defined(__aarch64__)
+  // Fast path: read eight bytes and add the first six bytes to window_diff_.
+  // This fast path makes the following assumptions.
+  // 1. We assume that unaligned load of uint64_t is fast.
+  // 2. When there are enough bytes in data_, the for loop below reads 6 or 7
+  //    bytes depending on the value of bits_. This fast path always reads 6
+  //    bytes, which results in more calls to PopulateBits(). We assume that
+  //    making more calls to a faster PopulateBits() is overall a win.
+  // NOTE: Although this fast path could also be used on x86_64, it hurts
+  // performance (measured on Lenovo ThinkStation P920 running Linux). (The
+  // reason is still unknown.) Therefore this fast path is only used on arm64.
+  static_assert(kWindowSize == 64, "");
+  if (data_ < data_memcpy_end_) {
+    uint64_t value;
+    // arm64 supports unaligned loads, so this memcpy call is compiled to a
+    // single ldr instruction.
+    memcpy(&value, data_, sizeof(value));
+    data_ += kMaxCachedBits >> 3;
+    value = HostToBigEndian(value) ^ -1;
+    value >>= kWindowSize - kMaxCachedBits;
+    window_diff_ = value | (window_diff_ << kMaxCachedBits);
+    bits_ += kMaxCachedBits;
+    return;
+  }
+#endif
+
+  const uint8_t* data = data_;
+  int bits = bits_;
+  WindowSize window_diff = window_diff_;
+
+  int count = kWindowSize - 9 - (bits + 15);
+  // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
+  // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7
+  // iterations when WindowSize is 64 bits. So it is not profitable to
+  // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop if
+  // the fast path above is not compiled.
+#ifdef __clang__
+#pragma clang loop vectorize(disable) interleave(disable)
+#endif
+  for (; count >= 0 && data < data_end_; count -= 8) {
+    const uint8_t value = *data++ ^ -1;
+    window_diff = static_cast<WindowSize>(value) | (window_diff << 8);
+    bits += 8;
+  }
+  assert(bits <= kMaxCachedBits);
+  if (data == data_end_) {
+    // Shift in some 1s. This is equivalent to providing fake 0 data bits.
+    window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1;
+    bits = kMaxCachedBits;
+  }
+
+  data_ = data;
+  bits_ = bits;
+  window_diff_ = window_diff;
+}
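+
+// Summarizing the arithmetic above for kWindowSize == 64: kMaxCachedBits is
+// 48, so the fast path consumes six bytes per call. The eight-byte load is
+// byte-swapped to big-endian order, inverted (the decoder tracks the
+// complement of the coded value, which is also why the slow path XORs each
+// byte with -1), and its top 48 bits are appended below the previous
+// contents of |window_diff_|.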
+
+void EntropyDecoder::NormalizeRange() {
+  const int bits_used = 15 ^ FloorLog2(values_in_range_);
+  bits_ -= bits_used;
+  values_in_range_ <<= bits_used;
+  if (bits_ < 0) PopulateBits();
+}
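+
+// A worked example for NormalizeRange: |values_in_range_| fits in 16 bits,
+// so FloorLog2(values_in_range_) <= 15 and 15 ^ FloorLog2(x) is the same as
+// 15 - FloorLog2(x). E.g. values_in_range_ == 300 gives FloorLog2 == 8 and
+// bits_used == 7, and 300 << 7 == 38400, which is back in the normalized
+// interval [32768, 65536).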
+
+// Explicit instantiations.
+template int EntropyDecoder::ReadSymbol<3>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<4>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<5>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<6>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<7>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<8>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<9>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<10>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<11>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<12>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<13>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<14>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<16>(uint16_t* cdf);
+
+}  // namespace libgav1
diff --git a/src/utils/entropy_decoder.h b/src/utils/entropy_decoder.h
new file mode 100644 (file)
index 0000000..8eeaef4
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+#define LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_reader.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+class EntropyDecoder final : public BitReader {
+ public:
+  // WindowSize must be an unsigned integer type with at least 32 bits. Use the
+  // largest type with fast arithmetic. size_t should meet these requirements.
+  using WindowSize = size_t;
+
+  EntropyDecoder(const uint8_t* data, size_t size, bool allow_update_cdf);
+  ~EntropyDecoder() override = default;
+
+  // Move only.
+  EntropyDecoder(EntropyDecoder&& rhs) noexcept;
+  EntropyDecoder& operator=(EntropyDecoder&& rhs) noexcept;
+
+  int ReadBit() override;
+  int64_t ReadLiteral(int num_bits) override;
+  // ReadSymbol() calls for which the |symbol_count| is only known at runtime
+  // will use this variant.
+  int ReadSymbol(uint16_t* cdf, int symbol_count);
+  // ReadSymbol() calls for which the |symbol_count| is equal to 2 (boolean
+  // symbols) will use this variant.
+  bool ReadSymbol(uint16_t* cdf);
+  bool ReadSymbolWithoutCdfUpdate(uint16_t cdf);
+  // Use either linear search or binary search for decoding the symbol
+  // depending on |symbol_count|. ReadSymbol calls for which the
+  // |symbol_count| is known at compile time will use this variant.
+  template <int symbol_count>
+  int ReadSymbol(uint16_t* cdf);
+
+ private:
+  static constexpr int kWindowSize = static_cast<int>(sizeof(WindowSize)) * 8;
+  static_assert(kWindowSize >= 32, "");
+
+  // Reads a symbol using the |cdf| table which contains the probabilities of
+  // each symbol. On a high level, this function does the following:
+  //   1) Scale the |cdf| values.
+  //   2) Find the index in the |cdf| array where the scaled CDF value crosses
+  //   the modified |window_diff_| threshold.
+  //   3) That index is the symbol that has been decoded.
+  //   4) Update |window_diff_| and |values_in_range_| based on the symbol that
+  //   has been decoded.
+  inline int ReadSymbolImpl(const uint16_t* cdf, int symbol_count);
+  // Similar to ReadSymbolImpl but it uses binary search to perform step 2 in
+  // the comment above. As of now, this function is called when |symbol_count|
+  // is greater than or equal to 14.
+  inline int ReadSymbolImplBinarySearch(const uint16_t* cdf, int symbol_count);
+  // Specialized implementation of ReadSymbolImpl based on the fact that
+  // symbol_count == 2.
+  inline int ReadSymbolImpl(uint16_t cdf);
+  // ReadSymbolN is a specialization of ReadSymbol for symbol_count == N.
+  LIBGAV1_ALWAYS_INLINE int ReadSymbol3Or4(uint16_t* cdf, int symbol_count);
+  // ReadSymbolImplN is a specialization of ReadSymbolImpl for
+  // symbol_count == N.
+  LIBGAV1_ALWAYS_INLINE int ReadSymbolImpl8(const uint16_t* cdf);
+  inline void PopulateBits();
+  // Normalizes the range so that 32768 <= |values_in_range_| < 65536. Also
+  // calls PopulateBits() if necessary.
+  inline void NormalizeRange();
+
+  const uint8_t* data_;
+  const uint8_t* const data_end_;
+  // If |data_| < |data_memcpy_end_|, then we can read sizeof(WindowSize) bytes
+  // from |data_|. Note with sizeof(WindowSize) == 4 this is only used in the
+  // constructor, not PopulateBits().
+  const uint8_t* const data_memcpy_end_;
+  const bool allow_update_cdf_;
+  // Number of cached bits of data in the current value.
+  int bits_;
+  // Number of values in the current range. Declared as uint32_t for better
+  // performance but only the lower 16 bits are used.
+  uint32_t values_in_range_;
+  // The difference between the high end of the current range and the coded
+  // value minus 1. The 16 bits above |bits_| of this variable are used to
+  // decode the next symbol. It is filled in whenever |bits_| is less than 0.
+  // Note this implementation differs from the spec as it trades the need to
+  // shift in 1s in NormalizeRange() with an extra shift in PopulateBits(),
+  // which occurs less frequently.
+  WindowSize window_diff_;
+};
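+
+// A minimal usage sketch (the input bytes are hypothetical; the CDF layout
+// follows entropy_decoder_test.cc: inverse cumulative values scaled to
+// 32768, a 0 terminator, then an adaptation counter):
+//
+//   const uint8_t data[] = {0x5c, 0xa9};  // some entropy-coded payload
+//   EntropyDecoder reader(data, sizeof(data), /*allow_update_cdf=*/true);
+//   uint16_t boolean_cdf[3] = {16384, 0, 0};  // P(0) == P(1) == 1/2
+//   const bool bit = reader.ReadSymbol(boolean_cdf);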
+
+extern template int EntropyDecoder::ReadSymbol<3>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<4>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<5>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<6>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<7>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<8>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<9>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<10>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<11>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<12>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<13>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<14>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<16>(uint16_t* cdf);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
diff --git a/src/utils/entropy_decoder_test.cc b/src/utils/entropy_decoder_test.cc
new file mode 100644 (file)
index 0000000..9d23088
--- /dev/null
@@ -0,0 +1,1259 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/entropy_decoder.h"
+
+#include <cstdint>
+#include <cstdio>
+
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+#include "src/utils/entropy_decoder_test_data.inc"
+
+class EntropyDecoderTest : public testing::Test {
+ protected:
+  // If compile_time is true, tests
+  //     bool EntropyDecoder::ReadSymbol(uint16_t* cdf).
+  // Otherwise, tests
+  //     int EntropyDecoder::ReadSymbol(uint16_t* cdf, int symbol_count)
+  // with symbol_count=2.
+  template <bool compile_time>
+  void TestReadSymbolBoolean(int num_runs);
+
+  // For N = 3..16 (except 15):
+  //     template <bool compile_time>
+  //     void TestReadSymbolN(int num_runs);
+  //
+  // If compile_time is true, tests
+  //     int EntropyDecoder::ReadSymbol<N>(uint16_t* const cdf).
+  // Otherwise, tests
+  //     int EntropyDecoder::ReadSymbol(uint16_t* cdf, int symbol_count)
+  // with symbol_count=N.
+  //
+  // NOTE: symbol_count=15 is not tested because AV1 does not use it.
+  template <bool compile_time>
+  void TestReadSymbol3(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol4(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol5(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol6(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol7(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol8(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol9(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol10(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol11(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol12(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol13(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol14(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol16(int num_runs);
+};
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbolBoolean(int num_runs) {
+  static constexpr int kSymbols[4][4] = {{0, 0, 1, 1},  //
+                                         {0, 1, 1, 0},  //
+                                         {1, 0, 1, 0},  //
+                                         {1, 0, 0, 1}};
+  absl::Duration elapsed_time;
+  bool symbols[1024 * 4 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbolBoolean,
+                          kNumBytesTestReadSymbolBoolean,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][3] = {
+        {16384, 0, 0},
+        {32768 - 8386, 0, 0},
+        {32768 - 24312, 0, 0},
+        {16384, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 1024; ++i) {
+      for (int j = 0; j < 4; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 2) != 0;
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbolBooleanCompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbolBoolean(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 1024; ++i) {
+    for (int j = 0; j < 4; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol3(int num_runs) {
+  static constexpr int kSymbols[6][4] = {{0, 2, 1, 2},  //
+                                         {1, 1, 2, 1},  //
+                                         {2, 0, 0, 0},  //
+                                         {0, 2, 0, 2},  //
+                                         {1, 2, 1, 0},  //
+                                         {2, 1, 1, 0}};
+  absl::Duration elapsed_time;
+  int symbols[1024 * 6 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol3, kNumBytesTestReadSymbol3,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][4] = {
+        // pmf: 1/3, 1/3, 1/3
+        {32768 - 10923, 32768 - 21845, 0, 0},
+        // pmf: 1/6, 2/6, 3/6
+        {32768 - 5461, 32768 - 16384, 0, 0},
+        // pmf: 2/6, 3/6, 1/6
+        {32768 - 10923, 32768 - 27307, 0, 0},
+        // pmf: 3/6, 1/6, 2/6
+        {32768 - 16384, 32768 - 21845, 0, 0},
+    };
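+    // The entries above are 32768 minus the scaled cumulative probability;
+    // the trailing 0 terminates each table and the final entry is the
+    // adaptation counter expected by EntropyDecoder.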
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 1024; ++i) {
+      for (int j = 0; j < 6; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<3>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 3);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol3CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol3(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 1024; ++i) {
+    for (int j = 0; j < 6; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol4(int num_runs) {
+  static constexpr int kSymbols[8][4] = {{0, 0, 3, 3},  //
+                                         {0, 0, 2, 2},  //
+                                         {1, 1, 0, 0},  //
+                                         {1, 2, 1, 1},  //
+                                         {2, 2, 3, 2},  //
+                                         {2, 3, 2, 1},  //
+                                         {3, 3, 0, 0},  //
+                                         {3, 3, 1, 1}};
+  absl::Duration elapsed_time;
+  int symbols[1024 * 8 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol4, kNumBytesTestReadSymbol4,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][5] = {
+        // pmf: 1/4, 1/4, 1/4, 1/4
+        {32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0},
+        // pmf: 2/8, 1/8, 2/8, 3/8
+        {32768 - 8192, 32768 - 12288, 32768 - 20480, 0, 0},
+        // pmf: 1/4, 1/4, 1/4, 1/4
+        {32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0},
+        // pmf: 2/8, 3/8, 2/8, 1/8
+        {32768 - 8192, 32768 - 20480, 32768 - 28672, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 1024; ++i) {
+      for (int j = 0; j < 8; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<4>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 4);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol4CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol4(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 1024; ++i) {
+    for (int j = 0; j < 8; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol5(int num_runs) {
+  static constexpr int kSymbols[10][4] = {{0, 0, 4, 4},  //
+                                          {0, 1, 3, 3},  //
+                                          {1, 2, 2, 2},  //
+                                          {1, 3, 1, 1},  //
+                                          {2, 4, 0, 0},  //
+                                          {2, 0, 4, 3},  //
+                                          {3, 1, 3, 2},  //
+                                          {3, 2, 2, 1},  //
+                                          {4, 3, 1, 2},  //
+                                          {4, 0, 4, 2}};
+  absl::Duration elapsed_time;
+  int symbols[320 * 10 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol5, kNumBytesTestReadSymbol5,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][6] = {
+        // pmf: 1/5, 1/5, 1/5, 1/5, 1/5
+        {32768 - 6554, 32768 - 13107, 32768 - 19661, 32768 - 26214, 0, 0},
+        // pmf: 3/10, 2/10, 2/10, 2/10, 1/10
+        {32768 - 9830, 32768 - 16384, 32768 - 22938, 32768 - 29491, 0, 0},
+        // pmf: 1/10, 2/10, 2/10, 2/10, 3/10
+        {32768 - 3277, 32768 - 9830, 32768 - 16384, 32768 - 22938, 0, 0},
+        // pmf: 1/10, 2/10, 4/10, 2/10, 1/10
+        {32768 - 3277, 32768 - 9830, 32768 - 22938, 32768 - 29491, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 320; ++i) {
+      for (int j = 0; j < 10; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<5>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 5);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol5CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol5(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 320; ++i) {
+    for (int j = 0; j < 10; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol6(int num_runs) {
+  static constexpr int kSymbols[12][4] = {{0, 0, 5, 5},  //
+                                          {0, 1, 4, 4},  //
+                                          {1, 2, 3, 3},  //
+                                          {1, 3, 2, 2},  //
+                                          {2, 4, 1, 1},  //
+                                          {2, 5, 0, 0},  //
+                                          {3, 0, 5, 4},  //
+                                          {3, 1, 4, 3},  //
+                                          {4, 2, 3, 2},  //
+                                          {4, 3, 2, 1},  //
+                                          {5, 4, 1, 3},  //
+                                          {5, 0, 5, 2}};
+  absl::Duration elapsed_time;
+  int symbols[256 * 12 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol6, kNumBytesTestReadSymbol6,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][7] = {
+        // pmf: 1/6, 1/6, 1/6, 1/6, 1/6, 1/6
+        {32768 - 5461, 32768 - 10923, 32768 - 16384, 32768 - 21845,
+         32768 - 27307, 0, 0},
+        // pmf: 3/12, 2/12, 2/12, 2/12, 2/12, 1/12
+        {32768 - 8192, 32768 - 13653, 32768 - 19115, 32768 - 24576,
+         32768 - 30037, 0, 0},
+        // pmf: 1/12, 2/12, 2/12, 2/12, 2/12, 3/12
+        {32768 - 2731, 32768 - 8192, 32768 - 13653, 32768 - 19115,
+         32768 - 24576, 0, 0},
+        // pmf: 1/12, 2/12, 3/12, 3/12, 2/12, 1/12
+        {32768 - 2731, 32768 - 8192, 32768 - 16384, 32768 - 24576,
+         32768 - 30037, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 256; ++i) {
+      for (int j = 0; j < 12; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<6>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 6);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol6CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol6(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 256; ++i) {
+    for (int j = 0; j < 12; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol7(int num_runs) {
+  static constexpr int kSymbols[14][4] = {{0, 4, 6, 3},  //
+                                          {1, 5, 5, 2},  //
+                                          {2, 6, 4, 1},  //
+                                          {3, 0, 3, 0},  //
+                                          {4, 1, 2, 6},  //
+                                          {5, 2, 1, 5},  //
+                                          {6, 3, 0, 4},  //
+                                          {0, 0, 6, 5},  //
+                                          {2, 1, 4, 3},  //
+                                          {4, 3, 6, 1},  //
+                                          {6, 5, 2, 4},  //
+                                          {1, 0, 5, 2},  //
+                                          {3, 2, 3, 2},  //
+                                          {5, 4, 5, 3}};
+  absl::Duration elapsed_time;
+  int symbols[1024 * 14 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol7, kNumBytesTestReadSymbol7,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][8] = {
+        // pmf: 1/7, 1/7, 1/7, 1/7, 1/7, 1/7, 1/7
+        {32768 - 4681, 32768 - 9362, 32768 - 14043, 32768 - 18725,
+         32768 - 23406, 32768 - 28087, 0, 0},
+        // pmf: 3/14, 2/14, 2/14, 2/14, 2/14, 2/14, 1/14
+        {32768 - 7022, 32768 - 11703, 32768 - 16384, 32768 - 21065,
+         32768 - 25746, 32768 - 30427, 0, 0},
+        // pmf: 1/14, 1/14, 2/14, 2/14, 2/14, 3/14, 3/14
+        {32768 - 2341, 32768 - 4681, 32768 - 9362, 32768 - 14043, 32768 - 18725,
+         32768 - 25746, 0, 0},
+        // pmf: 1/14, 2/14, 3/14, 3/14, 2/14, 2/14, 1/14
+        {32768 - 2341, 32768 - 7022, 32768 - 14043, 32768 - 21065,
+         32768 - 25746, 32768 - 30427, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 1024; ++i) {
+      for (int j = 0; j < 14; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<7>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 7);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol7CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol7(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 1024; ++i) {
+    for (int j = 0; j < 14; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol8(int num_runs) {
+  static constexpr int kSymbols[16][4] = {{0, 4, 7, 3},  //
+                                          {1, 5, 6, 2},  //
+                                          {2, 6, 5, 1},  //
+                                          {3, 7, 4, 0},  //
+                                          {4, 0, 3, 7},  //
+                                          {5, 1, 2, 6},  //
+                                          {6, 2, 1, 5},  //
+                                          {7, 3, 0, 4},  //
+                                          {0, 0, 6, 5},  //
+                                          {2, 1, 4, 3},  //
+                                          {4, 3, 6, 4},  //
+                                          {6, 5, 2, 2},  //
+                                          {1, 0, 7, 3},  //
+                                          {3, 2, 5, 5},  //
+                                          {5, 4, 7, 2},  //
+                                          {7, 6, 3, 4}};
+  absl::Duration elapsed_time;
+  int symbols[1024 * 16 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol8, kNumBytesTestReadSymbol8,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][9] = {
+        // pmf: 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8
+        {32768 - 4096, 32768 - 8192, 32768 - 12288, 32768 - 16384,
+         32768 - 20480, 32768 - 24576, 32768 - 28672, 0, 0},
+        // pmf: 3/16, 2/16, 2/16, 2/16, 2/16, 2/16, 2/16, 1/16
+        {32768 - 6144, 32768 - 10240, 32768 - 14336, 32768 - 18432,
+         32768 - 22528, 32768 - 26624, 32768 - 30720, 0, 0},
+        // pmf: 1/16, 1/16, 2/16, 2/16, 2/16, 2/16, 3/16, 3/16
+        {32768 - 2048, 32768 - 4096, 32768 - 8192, 32768 - 12288, 32768 - 16384,
+         32768 - 20480, 32768 - 26624, 0, 0},
+        // pmf: 1/16, 1/16, 3/16, 3/16, 3/16, 3/16, 1/16, 1/16
+        {32768 - 2048, 32768 - 4096, 32768 - 10240, 32768 - 16384,
+         32768 - 22528, 32768 - 28672, 32768 - 30720, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 1024; ++i) {
+      for (int j = 0; j < 16; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<8>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 8);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol8CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol8(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 1024; ++i) {
+    for (int j = 0; j < 16; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol9(int num_runs) {
+  static constexpr int kSymbols[18][4] = {{0, 4, 8, 3},  //
+                                          {1, 5, 7, 2},  //
+                                          {2, 6, 6, 1},  //
+                                          {3, 7, 5, 0},  //
+                                          {4, 8, 4, 8},  //
+                                          {5, 0, 3, 7},  //
+                                          {6, 1, 2, 6},  //
+                                          {7, 2, 1, 5},  //
+                                          {8, 3, 0, 4},  //
+                                          {0, 0, 8, 7},  //
+                                          {2, 1, 6, 5},  //
+                                          {4, 3, 4, 3},  //
+                                          {6, 5, 2, 1},  //
+                                          {8, 7, 7, 6},  //
+                                          {1, 0, 5, 4},  //
+                                          {3, 2, 3, 2},  //
+                                          {5, 4, 1, 4},  //
+                                          {7, 6, 8, 4}};
+  absl::Duration elapsed_time;
+  int symbols[128 * 18 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol9, kNumBytesTestReadSymbol9,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][10] = {
+        // pmf: 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9
+        {32768 - 3641, 32768 - 7282, 32768 - 10923, 32768 - 14564,
+         32768 - 18204, 32768 - 21845, 32768 - 25486, 32768 - 29127, 0, 0},
+        // pmf: 3/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 1/18
+        {32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384,
+         32768 - 20025, 32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0},
+        // pmf: 1/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 3/18
+        {32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384,
+         32768 - 20025, 32768 - 23666, 32768 - 27307, 0, 0},
+        // pmf: 1/18, 2/18, 2/18, 2/18, 4/18, 2/18, 2/18, 2/18, 1/18
+        {32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 20025,
+         32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 128; ++i) {
+      for (int j = 0; j < 18; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<9>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 9);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol9CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol9(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 128; ++i) {
+    for (int j = 0; j < 18; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol10(int num_runs) {
+  static constexpr int kSymbols[20][4] = {{0, 5, 9, 4},  //
+                                          {1, 6, 8, 3},  //
+                                          {2, 7, 7, 2},  //
+                                          {3, 8, 6, 1},  //
+                                          {4, 9, 5, 0},  //
+                                          {5, 0, 4, 9},  //
+                                          {6, 1, 3, 8},  //
+                                          {7, 2, 2, 7},  //
+                                          {8, 3, 1, 6},  //
+                                          {9, 4, 0, 5},  //
+                                          {0, 0, 9, 7},  //
+                                          {2, 1, 8, 5},  //
+                                          {4, 3, 6, 3},  //
+                                          {6, 5, 4, 1},  //
+                                          {8, 7, 2, 8},  //
+                                          {1, 0, 9, 6},  //
+                                          {3, 2, 7, 4},  //
+                                          {5, 4, 5, 2},  //
+                                          {7, 6, 3, 5},  //
+                                          {9, 8, 1, 4}};
+  absl::Duration elapsed_time;
+  int symbols[96 * 20 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol10, kNumBytesTestReadSymbol10,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][11] = {
+        // pmf: 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10
+        {32768 - 3277, 32768 - 6554, 32768 - 9830, 32768 - 13107, 32768 - 16384,
+         32768 - 19661, 32768 - 22938, 32768 - 26214, 32768 - 29491, 0, 0},
+        // pmf: 3/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 1/20
+        {32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746,
+         32768 - 18022, 32768 - 21299, 32768 - 24576, 32768 - 27853,
+         32768 - 31130, 0, 0},
+        // pmf: 1/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 3/20
+        {32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746,
+         32768 - 18022, 32768 - 21299, 32768 - 24576, 32768 - 27853, 0, 0},
+        // pmf: 1/20, 2/20, 2/20, 2/20, 3/20, 3/20, 2/20, 2/20, 2/20, 1/20
+        {32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 16384,
+         32768 - 21299, 32768 - 24576, 32768 - 27853, 32768 - 31130, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 96; ++i) {
+      for (int j = 0; j < 20; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<10>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 10);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol10CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol10(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 96; ++i) {
+    for (int j = 0; j < 20; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol11(int num_runs) {
+  static constexpr int kSymbols[22][4] = {{0, 6, 10, 5},   //
+                                          {1, 7, 9, 4},    //
+                                          {2, 8, 8, 3},    //
+                                          {3, 9, 7, 2},    //
+                                          {4, 10, 6, 1},   //
+                                          {5, 0, 5, 0},    //
+                                          {6, 1, 4, 10},   //
+                                          {7, 2, 3, 9},    //
+                                          {8, 3, 2, 8},    //
+                                          {9, 4, 1, 7},    //
+                                          {10, 5, 0, 6},   //
+                                          {0, 0, 10, 9},   //
+                                          {2, 1, 8, 7},    //
+                                          {4, 3, 6, 5},    //
+                                          {6, 5, 4, 3},    //
+                                          {8, 7, 2, 1},    //
+                                          {10, 9, 10, 8},  //
+                                          {1, 0, 9, 6},    //
+                                          {3, 2, 7, 4},    //
+                                          {5, 4, 5, 2},    //
+                                          {7, 6, 3, 5},    //
+                                          {9, 8, 1, 5}};
+  absl::Duration elapsed_time;
+  int symbols[96 * 22 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol11, kNumBytesTestReadSymbol11,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][12] = {
+        // pmf: 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11
+        {32768 - 2979, 32768 - 5958, 32768 - 8937, 32768 - 11916, 32768 - 14895,
+         32768 - 17873, 32768 - 20852, 32768 - 23831, 32768 - 26810,
+         32768 - 29789, 0, 0},
+        // pmf: 3/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 1/22
+        {32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405,
+         32768 - 16384, 32768 - 19363, 32768 - 22342, 32768 - 25321,
+         32768 - 28300, 32768 - 31279, 0, 0},
+        // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 3/22
+        {32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405,
+         32768 - 16384, 32768 - 19363, 32768 - 22342, 32768 - 25321,
+         32768 - 28300, 0, 0},
+        // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 4/22, 2/22, 2/22, 2/22, 2/22, 1/22
+        {32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405,
+         32768 - 19363, 32768 - 22342, 32768 - 25321, 32768 - 28300,
+         32768 - 31279, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 96; ++i) {
+      for (int j = 0; j < 22; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<11>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 11);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol11CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol11(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 96; ++i) {
+    for (int j = 0; j < 22; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol12(int num_runs) {
+  static constexpr int kSymbols[24][4] = {{0, 6, 11, 5},   //
+                                          {1, 7, 10, 4},   //
+                                          {2, 8, 9, 3},    //
+                                          {3, 9, 8, 2},    //
+                                          {4, 10, 7, 1},   //
+                                          {5, 11, 6, 0},   //
+                                          {6, 0, 5, 11},   //
+                                          {7, 1, 4, 10},   //
+                                          {8, 2, 3, 9},    //
+                                          {9, 3, 2, 8},    //
+                                          {10, 4, 1, 7},   //
+                                          {11, 5, 0, 6},   //
+                                          {0, 0, 11, 9},   //
+                                          {2, 1, 10, 7},   //
+                                          {4, 3, 8, 5},    //
+                                          {6, 5, 6, 3},    //
+                                          {8, 7, 4, 1},    //
+                                          {10, 9, 2, 10},  //
+                                          {1, 0, 11, 8},   //
+                                          {3, 2, 9, 6},    //
+                                          {5, 4, 7, 4},    //
+                                          {7, 6, 5, 2},    //
+                                          {9, 8, 3, 6},    //
+                                          {11, 10, 1, 5}};
+  absl::Duration elapsed_time;
+  int symbols[80 * 24 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol12, kNumBytesTestReadSymbol12,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][13] = {
+        // pmf: 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12,
+        // 1/12, 1/12
+        {32768 - 2731, 32768 - 5461, 32768 - 8192, 32768 - 10923, 32768 - 13653,
+         32768 - 16384, 32768 - 19115, 32768 - 21845, 32768 - 24576,
+         32768 - 27307, 32768 - 30037, 0, 0},
+        // pmf: 3/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24,
+        // 2/24, 1/24
+        {32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288, 32768 - 15019,
+         32768 - 17749, 32768 - 20480, 32768 - 23211, 32768 - 25941,
+         32768 - 28672, 32768 - 31403, 0, 0},
+        // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24,
+        // 2/24, 3/24
+        {32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288,
+         32768 - 15019, 32768 - 17749, 32768 - 20480, 32768 - 23211,
+         32768 - 25941, 32768 - 28672, 0, 0},
+        // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 3/24, 3/24, 2/24, 2/24, 2/24,
+        // 2/24, 1/24
+        {32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288,
+         32768 - 16384, 32768 - 20480, 32768 - 23211, 32768 - 25941,
+         32768 - 28672, 32768 - 31403, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 80; ++i) {
+      for (int j = 0; j < 24; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<12>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 12);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol12CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol12(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 80; ++i) {
+    for (int j = 0; j < 24; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol13(int num_runs) {
+  static constexpr int kSymbols[26][4] = {{0, 6, 12, 5},     //
+                                          {1, 7, 11, 4},     //
+                                          {2, 8, 10, 3},     //
+                                          {3, 9, 9, 2},      //
+                                          {4, 10, 8, 1},     //
+                                          {5, 11, 7, 0},     //
+                                          {6, 12, 6, 12},    //
+                                          {7, 0, 5, 11},     //
+                                          {8, 1, 4, 10},     //
+                                          {9, 2, 3, 9},      //
+                                          {10, 3, 2, 8},     //
+                                          {11, 4, 1, 7},     //
+                                          {12, 5, 0, 6},     //
+                                          {0, 0, 12, 11},    //
+                                          {2, 1, 10, 9},     //
+                                          {4, 3, 8, 7},      //
+                                          {6, 5, 6, 5},      //
+                                          {8, 7, 4, 3},      //
+                                          {10, 9, 2, 1},     //
+                                          {12, 11, 12, 10},  //
+                                          {1, 0, 11, 8},     //
+                                          {3, 2, 9, 6},      //
+                                          {5, 4, 7, 4},      //
+                                          {7, 6, 5, 2},      //
+                                          {9, 8, 3, 6},      //
+                                          {11, 10, 1, 6}};
+  absl::Duration elapsed_time;
+  int symbols[64 * 26 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol13, kNumBytesTestReadSymbol13,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][14] = {
+        // pmf: 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13,
+        // 1/13, 1/13, 1/13
+        {32768 - 2521, 32768 - 5041, 32768 - 7562, 32768 - 10082, 32768 - 12603,
+         32768 - 15124, 32768 - 17644, 32768 - 20165, 32768 - 22686,
+         32768 - 25206, 32768 - 27727, 32768 - 30247, 0, 0},
+        // pmf: 3/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26,
+        // 2/26, 2/26, 1/26
+        {32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343, 32768 - 13863,
+         32768 - 16384, 32768 - 18905, 32768 - 21425, 32768 - 23946,
+         32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0},
+        // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26,
+        // 2/26, 2/26, 3/26
+        {32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343,
+         32768 - 13863, 32768 - 16384, 32768 - 18905, 32768 - 21425,
+         32768 - 23946, 32768 - 26466, 32768 - 28987, 0, 0},
+        // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 4/26, 2/26, 2/26, 2/26,
+        // 2/26, 2/26, 1/26
+        {32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343,
+         32768 - 13863, 32768 - 18905, 32768 - 21425, 32768 - 23946,
+         32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0},
+    };
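+    // Layout note (shared by the other cdf tables in this file): each entry
+    // stores 32768 minus the cumulative probability in Q15, i.e. the
+    // inverted-CDF convention libaom also uses. For the uniform 1/13 pmf,
+    // entry j is 32768 - round(32768 * (j + 1) / 13), e.g. 32768 - 2521 for
+    // j = 0. The two trailing zeros are the final inverted-CDF entry
+    // (32768 - 32768) and the counter consulted during CDF adaptation when
+    // allow_update_cdf is true.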
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 64; ++i) {
+      for (int j = 0; j < 26; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
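+          // ReadSymbol<13>() fixes symbol_count as a template argument so
+          // the compiler can specialize the symbol search, while
+          // ReadSymbol(cdf, 13) receives the count at runtime. Both paths
+          // must decode identical symbols.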
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<13>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 13);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol13CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol13(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 64; ++i) {
+    for (int j = 0; j < 26; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol14(int num_runs) {
+  static constexpr int kSymbols[28][4] = {{0, 7, 13, 6},    //
+                                          {1, 8, 12, 5},    //
+                                          {2, 9, 11, 4},    //
+                                          {3, 10, 10, 3},   //
+                                          {4, 11, 9, 2},    //
+                                          {5, 12, 8, 1},    //
+                                          {6, 13, 7, 0},    //
+                                          {7, 0, 6, 13},    //
+                                          {8, 1, 5, 12},    //
+                                          {9, 2, 4, 11},    //
+                                          {10, 3, 3, 10},   //
+                                          {11, 4, 2, 9},    //
+                                          {12, 5, 1, 8},    //
+                                          {13, 6, 0, 7},    //
+                                          {0, 0, 13, 11},   //
+                                          {2, 1, 12, 9},    //
+                                          {4, 3, 10, 7},    //
+                                          {6, 5, 8, 5},     //
+                                          {8, 7, 6, 3},     //
+                                          {10, 9, 4, 1},    //
+                                          {12, 11, 2, 12},  //
+                                          {1, 0, 13, 10},   //
+                                          {3, 2, 11, 8},    //
+                                          {5, 4, 9, 6},     //
+                                          {7, 6, 7, 4},     //
+                                          {9, 8, 5, 2},     //
+                                          {11, 10, 3, 7},   //
+                                          {13, 12, 1, 6}};
+  absl::Duration elapsed_time;
+  int symbols[64 * 28 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol14, kNumBytesTestReadSymbol14,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][15] = {
+        // pmf: 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14,
+        // 1/14, 1/14, 1/14, 1/14
+        {32768 - 2341, 32768 - 4681, 32768 - 7022, 32768 - 9362, 32768 - 11703,
+         32768 - 14043, 32768 - 16384, 32768 - 18725, 32768 - 21065,
+         32768 - 23406, 32768 - 25746, 32768 - 28087, 32768 - 30427, 0, 0},
+        // pmf: 3/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28,
+        // 2/28, 2/28, 2/28, 1/28
+        {32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533, 32768 - 12873,
+         32768 - 15214, 32768 - 17554, 32768 - 19895, 32768 - 22235,
+         32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0},
+        // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28,
+        // 2/28, 2/28, 2/28, 3/28
+        {32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533,
+         32768 - 12873, 32768 - 15214, 32768 - 17554, 32768 - 19895,
+         32768 - 22235, 32768 - 24576, 32768 - 26917, 32768 - 29257, 0, 0},
+        // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 3/28, 3/28, 2/28, 2/28,
+        // 2/28, 2/28, 2/28, 1/28
+        {32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533,
+         32768 - 12873, 32768 - 16384, 32768 - 19895, 32768 - 22235,
+         32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0},
+    };
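+    // With 14 equiprobable symbols the Q15 step is round(32768 / 14) = 2341,
+    // so the uniform row above descends in (approximately) 2341-wide steps.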
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 64; ++i) {
+      for (int j = 0; j < 28; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<14>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 14);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol14CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol14(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 64; ++i) {
+    for (int j = 0; j < 28; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol16(int num_runs) {
+  static constexpr int kSymbols[32][4] = {{0, 8, 15, 7},    //
+                                          {1, 9, 14, 6},    //
+                                          {2, 10, 13, 5},   //
+                                          {3, 11, 12, 4},   //
+                                          {4, 12, 11, 3},   //
+                                          {5, 13, 10, 2},   //
+                                          {6, 14, 9, 1},    //
+                                          {7, 15, 8, 0},    //
+                                          {8, 0, 7, 15},    //
+                                          {9, 1, 6, 14},    //
+                                          {10, 2, 5, 13},   //
+                                          {11, 3, 4, 12},   //
+                                          {12, 4, 3, 11},   //
+                                          {13, 5, 2, 10},   //
+                                          {14, 6, 1, 9},    //
+                                          {15, 7, 0, 8},    //
+                                          {0, 0, 15, 13},   //
+                                          {2, 1, 14, 11},   //
+                                          {4, 3, 12, 9},    //
+                                          {6, 5, 10, 7},    //
+                                          {8, 7, 8, 5},     //
+                                          {10, 9, 6, 3},    //
+                                          {12, 11, 4, 1},   //
+                                          {14, 13, 2, 14},  //
+                                          {1, 0, 15, 12},   //
+                                          {3, 2, 13, 10},   //
+                                          {5, 4, 11, 8},    //
+                                          {7, 6, 9, 6},     //
+                                          {9, 8, 7, 4},     //
+                                          {11, 10, 5, 2},   //
+                                          {13, 12, 3, 8},   //
+                                          {15, 14, 1, 7}};
+  absl::Duration elapsed_time;
+  int symbols[48 * 32 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol16, kNumBytesTestReadSymbol16,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][17] = {
+        // pmf: 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16,
+        // 1/16, 1/16, 1/16, 1/16, 1/16, 1/16
+        {32768 - 2048, 32768 - 4096, 32768 - 6144, 32768 - 8192, 32768 - 10240,
+         32768 - 12288, 32768 - 14336, 32768 - 16384, 32768 - 18432,
+         32768 - 20480, 32768 - 22528, 32768 - 24576, 32768 - 26624,
+         32768 - 28672, 32768 - 30720, 0, 0},
+        // pmf: 3/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32,
+        // 2/32, 2/32, 2/32, 2/32, 2/32, 1/32
+        {32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216, 32768 - 11264,
+         32768 - 13312, 32768 - 15360, 32768 - 17408, 32768 - 19456,
+         32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648,
+         32768 - 29696, 32768 - 31744, 0, 0},
+        // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32,
+        // 2/32, 2/32, 2/32, 2/32, 2/32, 3/32
+        {32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216,
+         32768 - 11264, 32768 - 13312, 32768 - 15360, 32768 - 17408,
+         32768 - 19456, 32768 - 21504, 32768 - 23552, 32768 - 25600,
+         32768 - 27648, 32768 - 29696, 0, 0},
+        // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 3/32, 3/32, 2/32,
+        // 2/32, 2/32, 2/32, 2/32, 2/32, 1/32
+        {32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216,
+         32768 - 11264, 32768 - 13312, 32768 - 16384, 32768 - 19456,
+         32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648,
+         32768 - 29696, 32768 - 31744, 0, 0},
+    };
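+    // 32768 / 16 = 2048 exactly, so the uniform 1/16 row above needs no
+    // rounding: entry j is simply 32768 - 2048 * (j + 1).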
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 48; ++i) {
+      for (int j = 0; j < 32; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<16>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 16);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol16CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol16(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 48; ++i) {
+    for (int j = 0; j < 32; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
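+// The tests below run each TestReadSymbol* helper a single time and check
+// the decoded symbols; the timing-oriented repeated runs are collected in
+// DISABLED_Speed at the end of this file.
+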
+TEST_F(EntropyDecoderTest, ReadSymbolBoolean) {
+  TestReadSymbolBoolean</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbolBooleanCompileTime) {
+  TestReadSymbolBoolean</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol3) {
+  TestReadSymbol3</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol3CompileTime) {
+  TestReadSymbol3</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol4) {
+  TestReadSymbol4</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol4CompileTime) {
+  TestReadSymbol4</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol5) {
+  TestReadSymbol5</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol5CompileTime) {
+  TestReadSymbol5</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol6) {
+  TestReadSymbol6</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol6CompileTime) {
+  TestReadSymbol6</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol7) {
+  TestReadSymbol7</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol7CompileTime) {
+  TestReadSymbol7</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol8) {
+  TestReadSymbol8</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol8CompileTime) {
+  TestReadSymbol8</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol9) {
+  TestReadSymbol9</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol9CompileTime) {
+  TestReadSymbol9</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol10) {
+  TestReadSymbol10</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol10CompileTime) {
+  TestReadSymbol10</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol11) {
+  TestReadSymbol11</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol11CompileTime) {
+  TestReadSymbol11</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol12) {
+  TestReadSymbol12</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol12CompileTime) {
+  TestReadSymbol12</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol13) {
+  TestReadSymbol13</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol13CompileTime) {
+  TestReadSymbol13</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol14) {
+  TestReadSymbol14</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol14CompileTime) {
+  TestReadSymbol14</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol16) {
+  TestReadSymbol16</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol16CompileTime) {
+  TestReadSymbol16</*compile_time=*/true>(1);
+}
+
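+// gtest skips tests whose names begin with DISABLED_ unless the binary is
+// run with --gtest_also_run_disabled_tests, so the repeated-run timings
+// below act as an opt-in benchmark rather than part of the regular suite.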
+TEST_F(EntropyDecoderTest, DISABLED_Speed) {
+  // compile_time=true is only tested for those symbol_count values that have
+  // an instantiation of the EntropyDecoder::ReadSymbol<symbol_count> template
+  // method.
+  TestReadSymbolBoolean</*compile_time=*/false>(10000);
+  TestReadSymbolBoolean</*compile_time=*/true>(10000);
+  TestReadSymbol3</*compile_time=*/false>(5000);
+  TestReadSymbol3</*compile_time=*/true>(5000);
+  TestReadSymbol4</*compile_time=*/false>(2000);
+  TestReadSymbol4</*compile_time=*/true>(2000);
+  TestReadSymbol5</*compile_time=*/false>(5000);
+  TestReadSymbol5</*compile_time=*/true>(5000);
+  TestReadSymbol6</*compile_time=*/false>(5000);
+  TestReadSymbol6</*compile_time=*/true>(5000);
+  TestReadSymbol7</*compile_time=*/false>(1000);
+  TestReadSymbol7</*compile_time=*/true>(1000);
+  TestReadSymbol8</*compile_time=*/false>(1000);
+  TestReadSymbol8</*compile_time=*/true>(1000);
+  TestReadSymbol9</*compile_time=*/false>(5000);
+  TestReadSymbol9</*compile_time=*/true>(5000);
+  TestReadSymbol10</*compile_time=*/false>(5000);
+  TestReadSymbol10</*compile_time=*/true>(5000);
+  TestReadSymbol11</*compile_time=*/false>(5000);
+  TestReadSymbol11</*compile_time=*/true>(5000);
+  TestReadSymbol12</*compile_time=*/false>(5000);
+  TestReadSymbol12</*compile_time=*/true>(5000);
+  TestReadSymbol13</*compile_time=*/false>(5000);
+  TestReadSymbol13</*compile_time=*/true>(5000);
+  TestReadSymbol14</*compile_time=*/false>(5000);
+  TestReadSymbol14</*compile_time=*/true>(5000);
+  TestReadSymbol16</*compile_time=*/false>(5000);
+  TestReadSymbol16</*compile_time=*/true>(5000);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/entropy_decoder_test_data.inc b/src/utils/entropy_decoder_test_data.inc
new file mode 100644
index 0000000..9050d5e
--- /dev/null
+++ b/src/utils/entropy_decoder_test_data.inc
@@ -0,0 +1,8443 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The kBytesTestReadSymbolBoolean[] array was encoded by using the following
+// libaom code:
+//
+// aom_cdf_prob cdf[4][3] = {
+//   { 16384, 0, 0 },
+//   { 32768 - 8386, 0, 0 },
+//   { 32768 - 24312, 0, 0 },
+//   { 16384, 0, 0 },
+// };
+// constexpr int kSymbols[4][4] = { { 0, 0, 1, 1 },  //
+//                                  { 0, 1, 1, 0 },  //
+//                                  { 1, 0, 1, 0 },  //
+//                                  { 1, 0, 0, 1 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 1024; ++i) {
+//   for (int j = 0; j < 4; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 2);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("  constexpr size_t kNumBytesTestReadSymbolBoolean = %u;\n", bw.pos);
+// printf("  constexpr uint8_t kBytesTestReadSymbolBoolean[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n      ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n  };\n");
+
+constexpr size_t kNumBytesTestReadSymbolBoolean = 1880;
+constexpr uint8_t kBytesTestReadSymbolBoolean[] = {
+    0x1e, 0xfe, 0x7c, 0xa2, 0x1e, 0xfc, 0xa1, 0x17, 0xee, 0xbf, 0x07, 0x76,
+    0x2d, 0x11, 0x3a, 0xa5, 0x49, 0x65, 0xbb, 0x83, 0x89, 0x4b, 0xaa, 0x23,
+    0x29, 0x0d, 0x81, 0x9f, 0x6a, 0xf2, 0x9f, 0x7e, 0x14, 0x9a, 0x86, 0x78,
+    0x7f, 0xd5, 0x31, 0x14, 0x45, 0x8e, 0xf5, 0xc3, 0x36, 0x63, 0xcb, 0x4f,
+    0xeb, 0x81, 0x19, 0x75, 0x3c, 0xda, 0x21, 0x71, 0x1d, 0x05, 0x34, 0x7e,
+    0x43, 0xd4, 0x5b, 0xeb, 0x0a, 0x6d, 0xbe, 0xd2, 0x8f, 0xa5, 0x8f, 0xac,
+    0x3b, 0x43, 0xb6, 0x8a, 0xf9, 0x86, 0xf7, 0x1a, 0x3c, 0x4b, 0x2b, 0x4c,
+    0x4c, 0x4a, 0xff, 0xb9, 0x6f, 0x3c, 0xeb, 0xf6, 0x4c, 0xc8, 0x3c, 0x01,
+    0x5f, 0x12, 0x76, 0x4f, 0x88, 0xa0, 0xa5, 0xe7, 0x1d, 0xb3, 0x97, 0xd8,
+    0x31, 0x90, 0x8f, 0xd1, 0x46, 0xfd, 0xf7, 0xb1, 0x02, 0x0d, 0xf3, 0x9e,
+    0xbe, 0xa2, 0xfb, 0xc2, 0x7e, 0xe8, 0x77, 0xff, 0xa8, 0x13, 0x59, 0xcd,
+    0xba, 0xe7, 0xc2, 0x7e, 0xe8, 0x77, 0xff, 0xa8, 0x0e, 0xc3, 0x7b, 0x63,
+    0x80, 0xfe, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e,
+    0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd,
+    0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33,
+    0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8,
+    0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30,
+    0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37,
+    0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb,
+    0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3,
+    0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e,
+    0x85, 0x13, 0x83, 0xe9, 0x58, 0xaf, 0xe8, 0xff, 0x03, 0xb8, 0xf5, 0x08,
+    0x63, 0x03, 0xea, 0xe9, 0x3a, 0x39, 0x6d, 0xb6, 0x32, 0xc5, 0xff, 0xf7,
+    0x19, 0x19, 0x9c, 0x29, 0x3a, 0xc5, 0x87, 0x27, 0x2d, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+    0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+    0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+    0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+    0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+    0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+    0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+    0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+    0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+    0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+    0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+    0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xac,
+};
+static_assert(sizeof(kBytesTestReadSymbolBoolean) ==
+                  kNumBytesTestReadSymbolBoolean,
+              "");
+
+// The kBytesTestReadSymbol3[] array was encoded by using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][4] = {
+//   // pmf: 1/3, 1/3, 1/3
+//   { 32768 - 10923, 32768 - 21845, 0, 0 },
+//   // pmf: 1/6, 2/6, 3/6
+//   { 32768 - 5461, 32768 - 16384, 0, 0 },
+//   // pmf: 2/6, 3/6, 1/6
+//   { 32768 - 10923, 32768 - 27307, 0, 0 },
+//   // pmf: 3/6, 1/6, 2/6
+//   { 32768 - 16384, 32768 - 21845, 0, 0 },
+// };
+// constexpr int kSymbols[6][4] = { { 0, 2, 1, 2 },  //
+//                                  { 1, 1, 2, 1 },  //
+//                                  { 2, 0, 0, 0 },  //
+//                                  { 0, 2, 0, 2 },  //
+//                                  { 1, 2, 1, 0 },  //
+//                                  { 2, 1, 1, 0 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 1024; ++i) {
+//   for (int j = 0; j < 6; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 3);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("  constexpr size_t kNumBytesTestReadSymbol3 = %u;\n", bw.pos);
+// printf("  constexpr uint8_t kBytesTestReadSymbol3[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n      ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n  };\n");
+
+constexpr size_t kNumBytesTestReadSymbol3 = 4646;
+constexpr uint8_t kBytesTestReadSymbol3[] = {
+    0x4a, 0xf9, 0x1a, 0x00, 0xef, 0x80, 0xd4, 0xcd, 0xc2, 0x55, 0x62, 0x76,
+    0x3a, 0x60, 0x4e, 0xc9, 0x17, 0x91, 0x86, 0xb0, 0xa0, 0xcb, 0xf7, 0x7e,
+    0x82, 0x1e, 0x92, 0xd9, 0xe5, 0xff, 0xaa, 0x0b, 0xa4, 0xc1, 0xfa, 0x0d,
+    0xbe, 0x4f, 0x17, 0x4a, 0xfd, 0xee, 0xb6, 0x9b, 0x57, 0x3e, 0xdb, 0x60,
+    0x19, 0xd2, 0xee, 0x35, 0x39, 0x73, 0xc9, 0x7b, 0x80, 0xc0, 0x9c, 0x9a,
+    0xe8, 0x0f, 0x8b, 0xb8, 0x99, 0x02, 0xde, 0x68, 0x97, 0xab, 0xee, 0x2c,
+    0xa0, 0xb1, 0x7b, 0x8e, 0x8a, 0x69, 0xd5, 0xcd, 0x40, 0x43, 0xa9, 0x4c,
+    0xd5, 0xac, 0x33, 0x70, 0x64, 0x35, 0xa1, 0x18, 0xde, 0x31, 0x21, 0x2b,
+    0xa1, 0xd2, 0x87, 0x63, 0x41, 0x4d, 0xd9, 0x0e, 0x17, 0xd8, 0x74, 0x19,
+    0xbc, 0x33, 0xee, 0xd9, 0x21, 0x22, 0x16, 0xbb, 0x1e, 0x14, 0x46, 0xcf,
+    0xfa, 0xee, 0xa2, 0xa0, 0xc0, 0x6b, 0xc5, 0xf0, 0xd8, 0x23, 0x6d, 0x20,
+    0xda, 0x75, 0xff, 0x72, 0x3d, 0x41, 0x51, 0x21, 0x23, 0xa0, 0xce, 0xa0,
+    0x46, 0xb0, 0x1d, 0x3d, 0xaf, 0x64, 0xf8, 0x57, 0xee, 0x81, 0x55, 0x3a,
+    0xea, 0xd3, 0x3f, 0x96, 0x52, 0x31, 0xe5, 0xb5, 0x70, 0x01, 0x5a, 0xaf,
+    0xbc, 0x69, 0x7e, 0x43, 0xdd, 0x2f, 0xe2, 0x40, 0xc7, 0x2d, 0x62, 0x8e,
+    0xf0, 0x2a, 0xc0, 0x06, 0xe7, 0xe0, 0x63, 0x6e, 0x09, 0xa0, 0x57, 0x83,
+    0x43, 0x5a, 0xe8, 0xb5, 0xc7, 0x1b, 0xf5, 0xe6, 0x3d, 0x19, 0xeb, 0xfa,
+    0xda, 0x3d, 0x06, 0x3e, 0xa8, 0x96, 0x09, 0xad, 0x1d, 0xac, 0xf6, 0xef,
+    0xc7, 0x32, 0x2f, 0x45, 0xe0, 0x4f, 0xa6, 0x9c, 0x2f, 0x66, 0x6b, 0xe3,
+    0x36, 0xcf, 0x36, 0x41, 0xcb, 0xd9, 0xb8, 0xc3, 0x48, 0xf4, 0x18, 0xfa,
+    0xa2, 0x58, 0x26, 0xb4, 0x76, 0xb3, 0xdb, 0xbf, 0x1c, 0xc8, 0xbd, 0x19,
+    0xc1, 0x3e, 0x9a, 0x71, 0x85, 0x52, 0x94, 0x82, 0x48, 0x9c, 0x90, 0xcf,
+    0x2f, 0xa0, 0xd1, 0x4b, 0x73, 0xcf, 0x73, 0xea, 0x89, 0x60, 0x93, 0xd1,
+    0xda, 0xcf, 0x74, 0x5b, 0xd3, 0x22, 0xf4, 0x67, 0x04, 0xfa, 0x69, 0xc6,
+    0x15, 0x4a, 0x52, 0x09, 0x22, 0x72, 0x43, 0x3c, 0xbe, 0x83, 0x45, 0x2d,
+    0xcf, 0x3d, 0xcf, 0xaa, 0x25, 0x82, 0x4f, 0x47, 0x6b, 0x3d, 0xd1, 0x6f,
+    0x4c, 0x8b, 0xd1, 0x9c, 0x13, 0xe9, 0xa7, 0x18, 0x55, 0x29, 0x48, 0x24,
+    0x89, 0xc9, 0x0c, 0xf2, 0xfa, 0x0d, 0x14, 0xb7, 0x3c, 0xf7, 0x3e, 0xa8,
+    0x96, 0x09, 0x3d, 0x1d, 0xac, 0xf7, 0x45, 0xbd, 0x32, 0x2f, 0x46, 0x70,
+    0x4f, 0xa6, 0x9c, 0x61, 0x54, 0xa5, 0x20, 0x92, 0x27, 0x24, 0x33, 0xcb,
+    0xe8, 0x34, 0x52, 0xdc, 0xf3, 0xdc, 0xfa, 0xa2, 0x58, 0x24, 0xf4, 0x76,
+    0xb3, 0xdd, 0x16, 0xf4, 0xc8, 0xbd, 0x19, 0xc1, 0x3e, 0x9a, 0x71, 0x85,
+    0x52, 0x94, 0x82, 0x48, 0x9c, 0x90, 0xcf, 0x2f, 0xa0, 0xd1, 0x4b, 0x73,
+    0xcf, 0x73, 0xea, 0x89, 0x60, 0x93, 0xd1, 0xda, 0xcf, 0x74, 0x5b, 0xd3,
+    0x22, 0xf4, 0x67, 0x04, 0xfa, 0x69, 0xc6, 0x15, 0x4a, 0x52, 0x09, 0x22,
+    0x72, 0x43, 0x3c, 0xbe, 0x83, 0x45, 0x2d, 0xcf, 0x3d, 0xcf, 0xaa, 0x25,
+    0x84, 0xaa, 0xde, 0xde, 0xba, 0x7e, 0x90, 0x92, 0xa0, 0xdc, 0xb3, 0x6c,
+    0xaf, 0xe6, 0x2f, 0xeb, 0xc5, 0x33, 0xe7, 0x77, 0xcf, 0xda, 0xe7, 0x31,
+    0x57, 0xb2, 0x8f, 0xde, 0x8f, 0x1d, 0xf4, 0xd3, 0x8c, 0xda, 0x94, 0xa4,
+    0x12, 0xcd, 0xc9, 0x32, 0x6d, 0xf7, 0x2d, 0x0c, 0x2c, 0xf9, 0xd8, 0x0b,
+    0x48, 0xf3, 0xb3, 0x2e, 0x80, 0xd7, 0x0a, 0xc4, 0x4f, 0x09, 0xfe, 0x84,
+    0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4,
+    0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8,
+    0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a,
+    0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67,
+    0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09,
+    0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c,
+    0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef,
+    0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01,
+    0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35,
+    0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01,
+    0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8,
+    0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8,
+    0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54,
+    0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d,
+    0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0,
+    0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a,
+    0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52,
+    0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41,
+    0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b,
+    0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39,
+    0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09,
+    0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58,
+    0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d,
+    0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82,
+    0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad,
+    0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43,
+    0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7,
+    0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d,
+    0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f,
+    0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07,
+    0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65,
+    0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf,
+    0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78,
+    0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca,
+    0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9,
+    0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58,
+    0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8,
+    0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb,
+    0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d,
+    0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1,
+    0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06,
+    0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0,
+    0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56,
+    0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e,
+    0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10,
+    0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3,
+    0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60,
+    0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a,
+    0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f,
+    0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25,
+    0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3,
+    0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf,
+    0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07,
+    0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5,
+    0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04,
+    0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1,
+    0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3,
+    0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50,
+    0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5,
+    0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0,
+    0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8,
+    0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a,
+    0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04,
+    0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f,
+    0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5,
+    0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27,
+    0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60,
+    0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36,
+    0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a,
+    0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7,
+    0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c,
+    0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde,
+    0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75,
+    0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe,
+    0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d,
+    0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97,
+    0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc,
+    0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0,
+    0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b,
+    0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5,
+    0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63,
+    0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1,
+    0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d,
+    0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6,
+    0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06,
+    0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19,
+    0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2,
+    0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b,
+    0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb,
+    0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40,
+    0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d,
+    0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80,
+    0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa,
+    0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e,
+    0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95,
+    0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf,
+    0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc,
+    0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e,
+    0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54,
+    0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10,
+    0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86,
+    0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e,
+    0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42,
+    0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6,
+    0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03,
+    0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0,
+    0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b,
+    0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10,
+    0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd,
+    0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97,
+    0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f,
+    0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81,
+    0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9,
+    0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b,
+    0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde,
+    0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32,
+    0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a,
+    0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6,
+    0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa,
+    0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76,
+    0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f,
+    0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0,
+    0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81,
+    0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac,
+    0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95,
+    0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f,
+    0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84,
+    0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4,
+    0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8,
+    0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a,
+    0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67,
+    0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09,
+    0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c,
+    0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef,
+    0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01,
+    0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35,
+    0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01,
+    0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8,
+    0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8,
+    0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54,
+    0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d,
+    0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0,
+    0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a,
+    0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52,
+    0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41,
+    0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b,
+    0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39,
+    0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09,
+    0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58,
+    0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d,
+    0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82,
+    0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad,
+    0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43,
+    0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7,
+    0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d,
+    0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f,
+    0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07,
+    0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65,
+    0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf,
+    0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78,
+    0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca,
+    0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9,
+    0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58,
+    0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8,
+    0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb,
+    0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d,
+    0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1,
+    0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06,
+    0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0,
+    0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56,
+    0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e,
+    0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10,
+    0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3,
+    0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60,
+    0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a,
+    0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f,
+    0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25,
+    0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3,
+    0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf,
+    0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07,
+    0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5,
+    0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04,
+    0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1,
+    0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3,
+    0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50,
+    0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5,
+    0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0,
+    0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8,
+    0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a,
+    0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04,
+    0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f,
+    0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5,
+    0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27,
+    0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60,
+    0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36,
+    0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a,
+    0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7,
+    0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c,
+    0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde,
+    0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75,
+    0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe,
+    0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d,
+    0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97,
+    0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc,
+    0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0,
+    0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b,
+    0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5,
+    0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63,
+    0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1,
+    0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d,
+    0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6,
+    0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06,
+    0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19,
+    0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2,
+    0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b,
+    0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb,
+    0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40,
+    0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d,
+    0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80,
+    0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa,
+    0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e,
+    0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95,
+    0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf,
+    0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc,
+    0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e,
+    0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54,
+    0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10,
+    0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86,
+    0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e,
+    0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42,
+    0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6,
+    0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03,
+    0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0,
+    0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b,
+    0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10,
+    0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd,
+    0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97,
+    0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f,
+    0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81,
+    0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9,
+    0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b,
+    0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde,
+    0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32,
+    0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a,
+    0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6,
+    0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa,
+    0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76,
+    0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f,
+    0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0,
+    0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81,
+    0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac,
+    0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95,
+    0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f,
+    0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84,
+    0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4,
+    0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8,
+    0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a,
+    0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67,
+    0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09,
+    0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c,
+    0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef,
+    0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01,
+    0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35,
+    0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01,
+    0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8,
+    0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8,
+    0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54,
+    0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d,
+    0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0,
+    0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a,
+    0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52,
+    0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41,
+    0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b,
+    0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39,
+    0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09,
+    0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58,
+    0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d,
+    0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82,
+    0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad,
+    0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43,
+    0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7,
+    0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d,
+    0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f,
+    0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07,
+    0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65,
+    0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf,
+    0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78,
+    0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca,
+    0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9,
+    0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58,
+    0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8,
+    0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb,
+    0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d,
+    0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1,
+    0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06,
+    0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0,
+    0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56,
+    0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e,
+    0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10,
+    0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3,
+    0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60,
+    0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a,
+    0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f,
+    0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25,
+    0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3,
+    0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf,
+    0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07,
+    0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5,
+    0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04,
+    0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1,
+    0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3,
+    0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50,
+    0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5,
+    0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0,
+    0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8,
+    0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a,
+    0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04,
+    0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f,
+    0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5,
+    0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27,
+    0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60,
+    0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36,
+    0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a,
+    0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7,
+    0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c,
+    0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde,
+    0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75,
+    0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe,
+    0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d,
+    0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97,
+    0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc,
+    0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0,
+    0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b,
+    0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5,
+    0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63,
+    0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1,
+    0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d,
+    0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6,
+    0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06,
+    0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19,
+    0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2,
+    0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b,
+    0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb,
+    0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40,
+    0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d,
+    0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80,
+    0x41, 0x08,
+};
+static_assert(sizeof(kBytesTestReadSymbol3) == kNumBytesTestReadSymbol3, "");
+
+// The kBytesTestReadSymbol4[] array was encoded using the following libaom
+// code (an illustrative decode-side sketch follows the array below):
+//
+// aom_cdf_prob cdf[4][5] = {
+//   // pdf: 1/4, 1/4, 1/4, 1/4
+//   { 32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0 },
+//   // pdf: 2/8, 1/8, 2/8, 3/8
+//   { 32768 - 8192, 32768 - 12288, 32768 - 20480, 0, 0 },
+//   // pdf: 1/4, 1/4, 1/4, 1/4
+//   { 32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0 },
+//   // pdf: 2/8, 3/8, 2/8, 1/8
+//   { 32768 - 8192, 32768 - 20480, 32768 - 28672, 0, 0 },
+// };
+// constexpr int kSymbols[8][4] = { { 0, 0, 3, 3 },  //
+//                                  { 0, 0, 2, 2 },  //
+//                                  { 1, 1, 0, 0 },  //
+//                                  { 1, 2, 1, 1 },  //
+//                                  { 2, 2, 3, 2 },  //
+//                                  { 2, 3, 2, 1 },  //
+//                                  { 3, 3, 0, 0 },  //
+//                                  { 3, 3, 1, 1 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 1024; ++i) {
+//   for (int j = 0; j < 8; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 4);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("  constexpr size_t kNumBytesTestReadSymbol4 = %u;\n", bw.pos);
+// printf("  constexpr uint8_t kBytesTestReadSymbol4[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n      ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n  };\n");
+
+constexpr size_t kNumBytesTestReadSymbol4 = 8055;
+constexpr uint8_t kBytesTestReadSymbol4[] = {
+    0x0f, 0x9b, 0x2a, 0xf6, 0x38, 0x26, 0xa1, 0xd1, 0x82, 0x5f, 0x34, 0xb5,
+    0xc7, 0xda, 0x9c, 0xd8, 0x8d, 0x4b, 0xbc, 0x5c, 0x0b, 0x8a, 0x7f, 0x6c,
+    0x46, 0x3f, 0xa2, 0x03, 0xee, 0x1f, 0xea, 0x25, 0xc7, 0xb7, 0xe2, 0xc9,
+    0x51, 0x0f, 0x7c, 0x0c, 0xe3, 0x7d, 0x7b, 0xe4, 0xbe, 0xde, 0x41, 0x5c,
+    0x5a, 0xcf, 0xe6, 0x12, 0x50, 0x7b, 0xcc, 0x83, 0x76, 0x61, 0x03, 0x3a,
+    0x1e, 0x1b, 0xf8, 0x9d, 0x08, 0x96, 0x98, 0x0f, 0x16, 0xac, 0x7c, 0x25,
+    0x6c, 0xd1, 0xe8, 0xd8, 0xd6, 0x1c, 0xbd, 0x48, 0xa5, 0x3f, 0xd3, 0x21,
+    0x4c, 0x4e, 0x94, 0xe3, 0xe3, 0xed, 0x30, 0x70, 0xdb, 0x2e, 0x95, 0xd5,
+    0x7f, 0xfe, 0xed, 0x0e, 0x73, 0xe3, 0x29, 0x09, 0x5f, 0xe3, 0x0e, 0xa6,
+    0xe7, 0xc6, 0x52, 0x12, 0xba, 0xdb, 0xb5, 0x63, 0xd9, 0xd8, 0xa4, 0x25,
+    0x75, 0xb7, 0x6a, 0xc7, 0xb3, 0xad, 0x88, 0x46, 0x64, 0x3a, 0x36, 0xb1,
+    0x2f, 0xb1, 0x03, 0xdb, 0x88, 0x74, 0x6d, 0x62, 0x5f, 0x62, 0x07, 0xb7,
+    0x10, 0xe8, 0xda, 0xc6, 0x1d, 0x6e, 0x8e, 0x12, 0x58, 0x6e, 0x98, 0x4c,
+    0xa1, 0x23, 0xc0, 0x9b, 0xb0, 0xdd, 0x31, 0xef, 0x64, 0xf0, 0x91, 0x37,
+    0x61, 0xba, 0x63, 0xde, 0xc9, 0xe1, 0x22, 0x6e, 0xc3, 0x74, 0xc7, 0xea,
+    0xcb, 0x70, 0xf6, 0xe2, 0x1d, 0x1b, 0x6c, 0xd5, 0x4f, 0x91, 0xc2, 0x4b,
+    0x0a, 0xeb, 0xb3, 0x0d, 0x59, 0x39, 0x13, 0x76, 0x15, 0xd7, 0x66, 0x1a,
+    0xf2, 0x72, 0x26, 0xec, 0x05, 0x3e, 0xcc, 0x31, 0x3e, 0x60, 0x4d, 0xd8,
+    0x0a, 0x7d, 0x98, 0x62, 0x7c, 0xc0, 0xcc, 0x5a, 0x24, 0xc8, 0xa6, 0xda,
+    0xe3, 0x09, 0x35, 0x70, 0x9c, 0x4c, 0x85, 0xac, 0x6f, 0x8b, 0x76, 0x30,
+    0xcc, 0x6f, 0xcb, 0x3e, 0x36, 0xd6, 0xec, 0x61, 0x98, 0xdf, 0x99, 0xa5,
+    0x7e, 0x2d, 0xd8, 0xc3, 0x31, 0xbf, 0x33, 0x4a, 0xfc, 0x5b, 0xb1, 0x86,
+    0x63, 0x7e, 0x66, 0x95, 0xf8, 0xb7, 0x63, 0x0c, 0xc6, 0xfc, 0xcd, 0x2b,
+    0xf1, 0x6e, 0xc6, 0x19, 0x8d, 0xf9, 0x9a, 0x57, 0xe2, 0xdd, 0x8c, 0x33,
+    0x1b, 0xf3, 0x34, 0xaf, 0xc5, 0xbb, 0x18, 0x66, 0x37, 0xe6, 0x69, 0x5f,
+    0x8b, 0x76, 0x30, 0xcc, 0x6f, 0xcc, 0xd2, 0xbf, 0x16, 0xec, 0x61, 0x98,
+    0xdf, 0x99, 0xa5, 0x7e, 0x2d, 0xd1, 0x27, 0xb1, 0xbf, 0x30, 0x0b, 0xfc,
+    0x5b, 0xa2, 0x4f, 0x63, 0xa0, 0x9b, 0x7a, 0xb6, 0xb7, 0x44, 0x9e, 0xc7,
+    0x41, 0x36, 0xf5, 0x6d, 0x6e, 0x89, 0x3d, 0x8e, 0x82, 0x6d, 0xea, 0xda,
+    0xdd, 0x12, 0x7b, 0x1d, 0x04, 0xdb, 0xd5, 0xb5, 0xba, 0x24, 0xf6, 0x3a,
+    0x09, 0xb7, 0xab, 0x6b, 0x74, 0x49, 0xec, 0x74, 0x13, 0x6f, 0x56, 0xd6,
+    0xe8, 0x93, 0xd8, 0xe8, 0x26, 0xde, 0xad, 0xad, 0xd1, 0x27, 0xb1, 0xd0,
+    0x4d, 0xbd, 0x5b, 0x5b, 0xa2, 0x4f, 0x63, 0xa0, 0x9b, 0x7a, 0xb6, 0xb7,
+    0x44, 0x9e, 0xc7, 0x41, 0x36, 0xf5, 0x6d, 0x6e, 0x89, 0x3d, 0x8e, 0x82,
+    0x6d, 0xea, 0xda, 0xdd, 0x12, 0x7b, 0x1d, 0x04, 0xdb, 0xd5, 0xb5, 0xba,
+    0x24, 0xf6, 0x3a, 0x09, 0xb7, 0xab, 0x6b, 0x74, 0x49, 0xec, 0x74, 0x13,
+    0x6f, 0x56, 0xd6, 0xdf, 0x45, 0xaa, 0x16, 0xb7, 0xb7, 0x14, 0x09, 0xdb,
+    0x9f, 0x17, 0x97, 0xae, 0xa1, 0xbe, 0x34, 0x9d, 0x0e, 0x01, 0x9f, 0xdb,
+    0x16, 0xa9, 0x6a, 0x63, 0xf2, 0x9f, 0x5b, 0x3b, 0x0b, 0xae, 0x17, 0xd6,
+    0x4d, 0x75, 0x8f, 0xe3, 0xf0, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c,
+    0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96,
+    0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f,
+    0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80,
+    0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f,
+    0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d,
+    0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a,
+    0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53,
+    0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91,
+    0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9,
+    0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95,
+    0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3,
+    0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7,
+    0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42,
+    0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7,
+    0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3,
+    0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65,
+    0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8,
+    0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34,
+    0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3,
+    0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38,
+    0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5,
+    0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d,
+    0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08,
+    0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6,
+    0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7,
+    0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2,
+    0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59,
+    0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52,
+    0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81,
+    0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa,
+    0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29,
+    0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f,
+    0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0,
+    0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a,
+    0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e,
+    0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65,
+    0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e,
+    0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4,
+    0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8,
+    0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b,
+    0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c,
+    0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e,
+    0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01,
+    0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07,
+    0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd,
+    0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8,
+    0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca,
+    0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe,
+    0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab,
+    0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a,
+    0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d,
+    0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09,
+    0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89,
+    0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61,
+    0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1,
+    0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c,
+    0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4,
+    0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4,
+    0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3,
+    0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a,
+    0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53,
+    0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa,
+    0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94,
+    0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f,
+    0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24,
+    0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7,
+    0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd,
+    0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a,
+    0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f,
+    0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e,
+    0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb,
+    0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe,
+    0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19,
+    0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad,
+    0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49,
+    0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad,
+    0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc,
+    0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50,
+    0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87,
+    0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce,
+    0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5,
+    0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2,
+    0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a,
+    0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14,
+    0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad,
+    0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71,
+    0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1,
+    0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46,
+    0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b,
+    0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb,
+    0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab,
+    0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b,
+    0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5,
+    0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d,
+    0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0,
+    0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3,
+    0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0,
+    0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00,
+    0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40,
+    0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66,
+    0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd,
+    0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e,
+    0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf,
+    0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15,
+    0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98,
+    0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4,
+    0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38,
+    0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84,
+    0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb,
+    0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5,
+    0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32,
+    0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5,
+    0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae,
+    0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe,
+    0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a,
+    0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12,
+    0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f,
+    0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64,
+    0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3,
+    0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79,
+    0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06,
+    0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f,
+    0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b,
+    0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3,
+    0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c,
+    0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f,
+    0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f,
+    0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8,
+    0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d,
+    0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba,
+    0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15,
+    0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d,
+    0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a,
+    0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c,
+    0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6,
+    0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7,
+    0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d,
+    0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3,
+    0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8,
+    0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d,
+    0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43,
+    0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36,
+    0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a,
+    0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12,
+    0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce,
+    0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95,
+    0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09,
+    0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55,
+    0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48,
+    0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd,
+    0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06,
+    0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6,
+    0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0,
+    0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a,
+    0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73,
+    0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6,
+    0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2,
+    0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd,
+    0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0,
+    0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74,
+    0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e,
+    0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39,
+    0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c,
+    0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7,
+    0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55,
+    0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1,
+    0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e,
+    0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55,
+    0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f,
+    0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b,
+    0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48,
+    0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a,
+    0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b,
+    0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65,
+    0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3,
+    0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0,
+    0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b,
+    0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3,
+    0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e,
+    0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4,
+    0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4,
+    0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a,
+    0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25,
+    0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38,
+    0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed,
+    0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0,
+    0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9,
+    0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74,
+    0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59,
+    0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6,
+    0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd,
+    0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c,
+    0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e,
+    0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d,
+    0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3,
+    0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82,
+    0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39,
+    0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71,
+    0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8,
+    0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96,
+    0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54,
+    0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0,
+    0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a,
+    0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a,
+    0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f,
+    0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30,
+    0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e,
+    0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf,
+    0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59,
+    0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb,
+    0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d,
+    0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e,
+    0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86,
+    0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f,
+    0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03,
+    0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00,
+    0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01,
+    0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33,
+    0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e,
+    0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72,
+    0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff,
+    0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa,
+    0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2,
+    0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23,
+    0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2,
+    0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22,
+    0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8,
+    0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28,
+    0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93,
+    0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad,
+    0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75,
+    0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0,
+    0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6,
+    0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94,
+    0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e,
+    0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25,
+    0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b,
+    0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9,
+    0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31,
+    0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff,
+    0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e,
+    0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f,
+    0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3,
+    0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a,
+    0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f,
+    0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46,
+    0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb,
+    0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2,
+    0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab,
+    0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef,
+    0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4,
+    0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61,
+    0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33,
+    0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d,
+    0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec,
+    0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a,
+    0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45,
+    0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b,
+    0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c,
+    0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4,
+    0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1,
+    0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92,
+    0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76,
+    0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa,
+    0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e,
+    0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9,
+    0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43,
+    0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec,
+    0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30,
+    0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0,
+    0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80,
+    0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50,
+    0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99,
+    0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33,
+    0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13,
+    0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef,
+    0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05,
+    0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6,
+    0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71,
+    0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce,
+    0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61,
+    0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e,
+    0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9,
+    0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c,
+    0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5,
+    0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab,
+    0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f,
+    0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e,
+    0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44,
+    0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53,
+    0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59,
+    0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c,
+    0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e,
+    0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01,
+    0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf,
+    0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a,
+    0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4,
+    0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7,
+    0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23,
+    0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3,
+    0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a,
+    0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7,
+    0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e,
+    0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85,
+    0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf,
+    0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6,
+    0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb,
+    0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1,
+    0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69,
+    0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67,
+    0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70,
+    0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a,
+    0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b,
+    0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10,
+    0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd,
+    0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e,
+    0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44,
+    0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3,
+    0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5,
+    0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02,
+    0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55,
+    0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52,
+    0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f,
+    0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81,
+    0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5,
+    0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc,
+    0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca,
+    0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc,
+    0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69,
+    0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70,
+    0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37,
+    0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8,
+    0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d,
+    0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03,
+    0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e,
+    0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b,
+    0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71,
+    0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95,
+    0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc,
+    0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57,
+    0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15,
+    0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b,
+    0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12,
+    0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12,
+    0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2,
+    0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42,
+    0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99,
+    0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68,
+    0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8,
+    0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86,
+    0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4,
+    0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7,
+    0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5,
+    0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29,
+    0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde,
+    0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49,
+    0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e,
+    0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb,
+    0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4,
+    0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe,
+    0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d,
+    0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6,
+    0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd,
+    0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33,
+    0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b,
+    0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93,
+    0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b,
+    0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78,
+    0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0,
+    0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e,
+    0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c,
+    0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea,
+    0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65,
+    0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5,
+    0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28,
+    0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a,
+    0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2,
+    0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3,
+    0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c,
+    0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97,
+    0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7,
+    0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56,
+    0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76,
+    0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b,
+    0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b,
+    0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61,
+    0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87,
+    0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80,
+    0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00,
+    0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80,
+    0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc,
+    0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b,
+    0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c,
+    0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f,
+    0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a,
+    0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30,
+    0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88,
+    0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70,
+    0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08,
+    0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6,
+    0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a,
+    0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64,
+    0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab,
+    0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d,
+    0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc,
+    0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5,
+    0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25,
+    0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f,
+    0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9,
+    0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66,
+    0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2,
+    0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c,
+    0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff,
+    0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7,
+    0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7,
+    0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38,
+    0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e,
+    0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f,
+    0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51,
+    0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a,
+    0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74,
+    0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a,
+    0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b,
+    0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35,
+    0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58,
+    0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c,
+    0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f,
+    0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b,
+    0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86,
+    0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51,
+    0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda,
+    0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87,
+    0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d,
+    0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74,
+    0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24,
+    0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d,
+    0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a,
+    0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13,
+    0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa,
+    0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90,
+    0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb,
+    0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c,
+    0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac,
+    0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0,
+    0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54,
+    0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6,
+    0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c,
+    0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84,
+    0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb,
+    0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1,
+    0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9,
+    0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c,
+    0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73,
+    0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8,
+    0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f,
+    0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa,
+    0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3,
+    0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd,
+    0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa,
+    0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf,
+    0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97,
+    0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91,
+    0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14,
+    0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16,
+    0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb,
+    0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47,
+    0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40,
+    0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37,
+    0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6,
+    0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d,
+    0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9,
+    0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48,
+    0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4,
+    0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a,
+    0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71,
+    0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb,
+    0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1,
+    0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3,
+    0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9,
+    0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2,
+    0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec,
+    0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a,
+    0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9,
+    0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c,
+    0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda,
+    0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6,
+    0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04,
+    0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73,
+    0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3,
+    0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51,
+    0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c,
+    0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9,
+    0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40,
+    0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5,
+    0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14,
+    0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f,
+    0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60,
+    0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd,
+    0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf,
+    0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2,
+    0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7,
+    0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a,
+    0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc,
+    0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d,
+    0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e,
+    0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07,
+    0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00,
+    0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03,
+    0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66,
+    0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc,
+    0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5,
+    0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff,
+    0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55,
+    0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85,
+    0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46,
+    0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84,
+    0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44,
+    0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0,
+    0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50,
+    0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26,
+    0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a,
+    0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea,
+    0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1,
+    0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad,
+    0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29,
+    0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd,
+    0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a,
+    0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37,
+    0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92,
+    0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63,
+    0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe,
+    0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd,
+    0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f,
+    0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7,
+    0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5,
+    0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff,
+    0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c,
+    0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6,
+    0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4,
+    0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56,
+    0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde,
+    0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8,
+    0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3,
+    0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67,
+    0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a,
+    0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9,
+    0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35,
+    0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a,
+    0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6,
+    0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38,
+    0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68,
+    0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3,
+    0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25,
+    0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed,
+    0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55,
+    0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d,
+    0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52,
+    0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86,
+    0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8,
+    0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61,
+    0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60,
+    0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00,
+    0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0,
+    0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33,
+    0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66,
+    0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27,
+    0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf,
+    0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a,
+    0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c,
+    0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2,
+    0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c,
+    0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2,
+    0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d,
+    0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52,
+    0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19,
+    0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea,
+    0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57,
+    0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff,
+    0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd,
+    0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89,
+    0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7,
+    0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2,
+    0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59,
+    0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c,
+    0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03,
+    0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf,
+    0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35,
+    0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9,
+    0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e,
+    0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47,
+    0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7,
+    0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54,
+    0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e,
+    0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd,
+    0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a,
+    0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e,
+    0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d,
+    0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96,
+    0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63,
+    0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3,
+    0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce,
+    0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1,
+    0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4,
+    0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36,
+    0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21,
+    0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b,
+    0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d,
+    0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89,
+    0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67,
+    0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a,
+    0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04,
+    0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa,
+    0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4,
+    0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe,
+    0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03,
+    0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb,
+    0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8,
+    0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95,
+    0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9,
+    0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3,
+    0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1,
+    0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e,
+    0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0,
+    0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a,
+    0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07,
+    0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c,
+    0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36,
+    0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3,
+    0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a,
+    0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8,
+    0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf,
+    0x56, 0x8f, 0x24,
+};
+static_assert(sizeof(kBytesTestReadSymbol4) == kNumBytesTestReadSymbol4, "");
+
+// The kBytesTestReadSymbol5[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][6] = {
+//   // pdf: 1/5, 1/5, 1/5, 1/5, 1/5
+//   { 32768 - 6554, 32768 - 13107, 32768 - 19661, 32768 - 26214, 0, 0 },
+//   // pdf: 3/10, 2/10, 2/10, 2/10, 1/10
+//   { 32768 - 9830, 32768 - 16384, 32768 - 22938, 32768 - 29491, 0, 0 },
+//   // pdf: 1/10, 2/10, 2/10, 2/10, 3/10
+//   { 32768 - 3277, 32768 - 9830, 32768 - 16384, 32768 - 22938, 0, 0 },
+//   // pdf: 1/10, 2/10, 4/10, 2/10, 1/10
+//   { 32768 - 3277, 32768 - 9830, 32768 - 22938, 32768 - 29491, 0, 0 },
+// };
+// constexpr int kSymbols[10][4] = { { 0, 0, 4, 4 },  //
+//                                   { 0, 1, 3, 3 },  //
+//                                   { 1, 2, 2, 2 },  //
+//                                   { 1, 3, 1, 1 },  //
+//                                   { 2, 4, 0, 0 },  //
+//                                   { 2, 0, 4, 3 },  //
+//                                   { 3, 1, 3, 2 },  //
+//                                   { 3, 2, 2, 1 },  //
+//                                   { 4, 3, 1, 2 },  //
+//                                   { 4, 0, 4, 2 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 320; ++i) {
+//   for (int j = 0; j < 10; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 5);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n    ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol5 = 3612;
+constexpr uint8_t kBytesTestReadSymbol5[] = {
+    0x0f, 0x1c, 0x16, 0x78, 0x6f, 0x83, 0xfe, 0x29, 0x95, 0x9a, 0x42, 0xcc,
+    0x70, 0x9a, 0x0d, 0x72, 0xe0, 0x7d, 0x63, 0x9e, 0x05, 0x3c, 0x88, 0x22,
+    0x40, 0x57, 0x83, 0xa8, 0x69, 0x6f, 0xc3, 0xb2, 0x58, 0x6c, 0xa9, 0x41,
+    0x3c, 0x2f, 0x3f, 0xa3, 0xe6, 0x4e, 0x5e, 0xaf, 0x42, 0x56, 0x9d, 0x3f,
+    0x70, 0xeb, 0x00, 0x02, 0x86, 0x23, 0x5f, 0x8e, 0x1b, 0x35, 0x71, 0x7d,
+    0x50, 0xbe, 0xb1, 0x1e, 0xe9, 0x2f, 0x08, 0x5a, 0x04, 0xc0, 0x7b, 0x98,
+    0x20, 0xbd, 0xc5, 0x39, 0xf7, 0x93, 0x5c, 0x6c, 0x4a, 0x0f, 0x50, 0x24,
+    0xe1, 0xf3, 0x2a, 0x8d, 0x53, 0x55, 0x9a, 0xd6, 0x3a, 0xd3, 0xd6, 0x9c,
+    0x41, 0xa2, 0x2c, 0x05, 0x1c, 0x5a, 0x28, 0x8d, 0xc0, 0x4f, 0x8d, 0xc1,
+    0x40, 0xaa, 0x19, 0xbf, 0xa7, 0x93, 0x48, 0xdf, 0x54, 0xcf, 0xb4, 0x47,
+    0xc4, 0x39, 0x90, 0xbb, 0xff, 0xb4, 0x47, 0x65, 0x33, 0x34, 0x45, 0x23,
+    0x5e, 0x79, 0xc5, 0xbd, 0x24, 0x30, 0x58, 0x8a, 0x19, 0x68, 0xbb, 0x08,
+    0xaa, 0xff, 0xce, 0x68, 0x37, 0xb4, 0x62, 0x44, 0x31, 0xe8, 0x3e, 0x4d,
+    0x05, 0x1d, 0xe2, 0x48, 0x56, 0xd5, 0x53, 0x19, 0xcc, 0xfd, 0x82, 0xa7,
+    0x06, 0xc4, 0x66, 0x95, 0x6c, 0x43, 0x3d, 0x43, 0x86, 0xe3, 0x62, 0x51,
+    0x26, 0x1c, 0x57, 0xed, 0x9a, 0x1a, 0x14, 0x4f, 0x41, 0x96, 0xc0, 0x72,
+    0x38, 0x59, 0xff, 0x69, 0xae, 0x2b, 0x59, 0x65, 0x30, 0xfd, 0xa5, 0x6f,
+    0x1b, 0xab, 0x01, 0x72, 0xb4, 0xcd, 0xba, 0x44, 0x73, 0x12, 0x31, 0xee,
+    0x83, 0x08, 0x5c, 0x35, 0x41, 0x17, 0xf1, 0x80, 0x55, 0xdd, 0x67, 0xb2,
+    0xd3, 0xe1, 0x04, 0x51, 0x69, 0x9b, 0x4b, 0x98, 0xcf, 0x17, 0x0a, 0xd4,
+    0xdc, 0x61, 0xf2, 0xb9, 0x4b, 0x23, 0xb6, 0xe8, 0x0c, 0x0d, 0xda, 0x68,
+    0xac, 0xd9, 0xf4, 0x11, 0x63, 0x4a, 0x7f, 0x17, 0x69, 0xdb, 0x91, 0x1b,
+    0x1d, 0xfb, 0x74, 0x58, 0x69, 0xcc, 0xf5, 0xce, 0x0d, 0x1e, 0xdd, 0x6d,
+    0x2e, 0x87, 0xf2, 0x36, 0x39, 0x22, 0x59, 0x78, 0x01, 0x2c, 0xf0, 0xe6,
+    0x8c, 0xd1, 0xdb, 0xa4, 0xf4, 0xc4, 0x09, 0x0e, 0xfe, 0x93, 0x88, 0x90,
+    0x3e, 0x55, 0x60, 0x51, 0x6a, 0xe9, 0x26, 0x41, 0x1f, 0x18, 0xab, 0xc1,
+    0xa4, 0x66, 0x57, 0xdd, 0xe6, 0x88, 0xbd, 0x74, 0xa0, 0xd3, 0x65, 0x0d,
+    0x04, 0xe3, 0x97, 0x1e, 0x9b, 0x59, 0xfc, 0xe2, 0x45, 0x9b, 0x90, 0xe1,
+    0x80, 0x20, 0x85, 0x03, 0x06, 0x1f, 0x46, 0xb1, 0x69, 0xb4, 0xf3, 0x06,
+    0xa8, 0xb5, 0x78, 0x2c, 0x21, 0xd1, 0x67, 0x8d, 0x91, 0xef, 0x6f, 0xec,
+    0xed, 0x2c, 0xd7, 0x40, 0x32, 0x09, 0xed, 0x4e, 0x92, 0xbb, 0x28, 0x67,
+    0xac, 0x09, 0x50, 0x7f, 0x30, 0xed, 0xde, 0x56, 0xeb, 0xc9, 0x23, 0x2f,
+    0x13, 0x07, 0xef, 0x80, 0x9e, 0x83, 0x6a, 0x24, 0xd4, 0xd1, 0x84, 0xbe,
+    0xf8, 0x1f, 0xb0, 0xaa, 0x6a, 0xf0, 0xda, 0x02, 0x0c, 0x94, 0xc9, 0xbc,
+    0x0f, 0xe8, 0x76, 0x95, 0x79, 0x0e, 0x24, 0x1e, 0x4c, 0xdb, 0xe5, 0xd5,
+    0x20, 0xee, 0x13, 0xff, 0xba, 0x1f, 0x7f, 0x67, 0x89, 0x4b, 0x6b, 0x28,
+    0x33, 0x61, 0xfb, 0x53, 0xed, 0xf7, 0x13, 0x3f, 0x64, 0xc9, 0x26, 0x19,
+    0xde, 0xe6, 0xec, 0x74, 0xe0, 0x0e, 0x7b, 0x07, 0xeb, 0xd9, 0xac, 0x7e,
+    0x1d, 0xac, 0xba, 0xa0, 0x50, 0xc4, 0x12, 0xee, 0x58, 0xe5, 0xe9, 0x7c,
+    0xa3, 0x40, 0xbd, 0x92, 0x6d, 0xa8, 0x08, 0x3c, 0x9e, 0xdb, 0xd3, 0x08,
+    0x3d, 0xb3, 0x1c, 0x25, 0x09, 0x51, 0x55, 0xbb, 0x51, 0xc8, 0xe6, 0xd6,
+    0x30, 0x86, 0x25, 0xa9, 0x01, 0xed, 0x55, 0x11, 0xa4, 0x5e, 0x3f, 0x57,
+    0xb7, 0x9b, 0x64, 0xec, 0x3d, 0x93, 0x28, 0x34, 0xea, 0xe9, 0x53, 0xec,
+    0x71, 0x7c, 0x1c, 0xee, 0x03, 0x26, 0x1a, 0x15, 0x9f, 0x6c, 0x74, 0xa5,
+    0xe1, 0x04, 0x76, 0xcb, 0x0b, 0xf9, 0x96, 0x4f, 0x4e, 0xb6, 0x7e, 0xad,
+    0xc5, 0x4b, 0x37, 0x44, 0x91, 0xfd, 0x1d, 0x69, 0x11, 0x17, 0x82, 0xc4,
+    0x17, 0x39, 0x29, 0x99, 0x8f, 0xe1, 0x35, 0x4d, 0x9e, 0x4f, 0xc9, 0x98,
+    0x71, 0x6b, 0xa9, 0x0d, 0x0a, 0xf8, 0xb6, 0x3a, 0x52, 0xf0, 0x82, 0x3b,
+    0x65, 0x79, 0x60, 0x16, 0xa5, 0xa4, 0xf8, 0x0e, 0xc2, 0x3e, 0xf3, 0x23,
+    0x82, 0x4d, 0x1f, 0x9d, 0x7b, 0xe1, 0xb8, 0xd3, 0x79, 0xc4, 0x04, 0x1d,
+    0xfc, 0xbc, 0xdb, 0x37, 0x73, 0x27, 0xe3, 0x8d, 0x65, 0xcb, 0x72, 0xd2,
+    0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x13, 0x0d, 0x80, 0xf6, 0xaa, 0x90,
+    0xd2, 0x30, 0x87, 0x1b, 0xdb, 0xcd, 0xb9, 0xea, 0x28, 0xfa, 0x10, 0xd5,
+    0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4,
+    0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2, 0x3d,
+    0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43,
+    0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9,
+    0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc,
+    0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75,
+    0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2,
+    0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec, 0xc9, 0xad, 0x4a,
+    0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, 0xee, 0x30, 0x68, 0xe5,
+    0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba, 0xe0, 0x2c, 0x34,
+    0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79, 0x25, 0x39, 0xba,
+    0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5, 0x69, 0x6d, 0xa8,
+    0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72, 0xbc, 0xb7, 0x2d,
+    0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a, 0x59, 0x92, 0x11,
+    0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd, 0x2a, 0xa1, 0x0d,
+    0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4, 0x21, 0x5c, 0x2f,
+    0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96, 0x95, 0x7f, 0x23,
+    0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08, 0xec, 0x91, 0x84,
+    0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86, 0xaf, 0x87, 0x9d,
+    0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17, 0xa6, 0x29, 0x3d,
+    0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, 0x91, 0xea, 0x6f, 0x17,
+    0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2, 0x1c, 0x6f, 0x6f,
+    0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce, 0xcc, 0x9a, 0xd4,
+    0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e, 0xe3, 0x06, 0x8e,
+    0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b, 0xae, 0x02, 0xc3,
+    0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7, 0x92, 0x53, 0x9b,
+    0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a, 0x56, 0x96, 0xda,
+    0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47, 0x2b, 0xcb, 0x72,
+    0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61, 0xa5, 0x99, 0x21,
+    0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd, 0xd2, 0xaa, 0x10,
+    0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2,
+    0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2,
+    0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18,
+    0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79,
+    0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93,
+    0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1,
+    0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6,
+    0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec, 0xc9, 0xad,
+    0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, 0xee, 0x30, 0x68,
+    0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba, 0xe0, 0x2c,
+    0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79, 0x25, 0x39,
+    0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5, 0x69, 0x6d,
+    0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72, 0xbc, 0xb7,
+    0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a, 0x59, 0x92,
+    0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd, 0x2a, 0xa1,
+    0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4, 0x21, 0x5c,
+    0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96, 0x95, 0x7f,
+    0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08, 0xec, 0x91,
+    0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86, 0xaf, 0x87,
+    0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17, 0xa6, 0x29,
+    0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, 0x91, 0xea, 0x6f,
+    0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2, 0x1c, 0x6f,
+    0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce, 0xcc, 0x9a,
+    0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e, 0xe3, 0x06,
+    0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b, 0xae, 0x02,
+    0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7, 0x92, 0x53,
+    0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a, 0x56, 0x96,
+    0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47, 0x2b, 0xcb,
+    0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61, 0xa5, 0x99,
+    0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd, 0xd2, 0xaa,
+    0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15,
+    0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57,
+    0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9,
+    0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8,
+    0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62,
+    0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6,
+    0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6,
+    0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec, 0xc9,
+    0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, 0xee, 0x30,
+    0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba, 0xe0,
+    0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79, 0x25,
+    0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5, 0x69,
+    0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72, 0xbc,
+    0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a, 0x59,
+    0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd, 0x2a,
+    0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4, 0x21,
+    0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96, 0x95,
+    0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08, 0xec,
+    0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86, 0xaf,
+    0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17, 0xa6,
+    0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, 0x91, 0xea,
+    0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2, 0x1c,
+    0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce, 0xcc,
+    0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e, 0xe3,
+    0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b, 0xae,
+    0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7, 0x92,
+    0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a, 0x56,
+    0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47, 0x2b,
+    0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61, 0xa5,
+    0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd, 0xd2,
+    0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42,
+    0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69,
+    0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e,
+    0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a,
+    0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a,
+    0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e,
+    0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21,
+    0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec,
+    0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, 0xee,
+    0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba,
+    0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79,
+    0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5,
+    0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72,
+    0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a,
+    0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd,
+    0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4,
+    0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96,
+    0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08,
+    0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86,
+    0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17,
+    0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, 0x91,
+    0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2,
+    0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce,
+    0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e,
+    0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b,
+    0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7,
+    0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a,
+    0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47,
+    0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61,
+    0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd,
+    0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d,
+    0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9,
+    0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90,
+    0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08,
+    0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1,
+    0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9,
+    0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c,
+    0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c,
+    0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49,
+    0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78,
+    0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b,
+    0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6,
+    0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34,
+    0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16,
+    0x1a, 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c,
+    0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6,
+    0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b,
+    0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9,
+    0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50,
+    0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae,
+    0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf,
+    0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48,
+    0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3,
+    0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14,
+    0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37,
+    0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37,
+    0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d,
+    0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83,
+    0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01,
+    0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29,
+    0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b,
+    0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5,
+    0xb9, 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc,
+    0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55,
+    0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a,
+    0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab,
+    0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64,
+    0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c,
+    0x3c, 0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31,
+    0x49, 0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53,
+    0x78, 0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3,
+    0x7b, 0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64,
+    0xd6, 0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18,
+    0x34, 0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70,
+    0x16, 0x1a, 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92,
+    0x9c, 0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4,
+    0xb6, 0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e,
+    0x5b, 0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c,
+    0xc9, 0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95,
+    0x50, 0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10,
+    0xae, 0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a,
+    0xbf, 0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76,
+    0x48, 0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57,
+    0xc3, 0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3,
+    0x14, 0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5,
+    0x37, 0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e,
+    0x37, 0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66,
+    0x4d, 0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71,
+    0x83, 0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7,
+    0x01, 0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9,
+    0x29, 0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b,
+    0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95,
+    0xe5, 0xb9, 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2,
+    0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9,
+    0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1,
+    0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4,
+    0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47,
+    0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35,
+    0x7c, 0x3c, 0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd,
+    0x31, 0x49, 0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f,
+    0x53, 0x78, 0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10,
+    0xe3, 0x7b, 0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76,
+    0x64, 0xd6, 0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7,
+    0x18, 0x34, 0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d,
+    0x70, 0x16, 0x1a, 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc,
+    0x92, 0x9c, 0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52,
+    0xb4, 0xb6, 0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39,
+    0x5e, 0x5b, 0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d,
+    0x2c, 0xc9, 0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e,
+    0x95, 0x50, 0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a,
+    0x10, 0xae, 0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb,
+    0x4a, 0xbf, 0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84,
+    0x76, 0x48, 0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43,
+    0x57, 0xc3, 0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b,
+    0xd3, 0x14, 0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8,
+    0xf5, 0x37, 0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61,
+    0x0e, 0x37, 0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7,
+    0x66, 0x4d, 0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f,
+    0x71, 0x83, 0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5,
+    0xd7, 0x01, 0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb,
+    0xc9, 0x29, 0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5,
+    0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3,
+    0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0,
+    0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6,
+    0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6,
+    0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc,
+    0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48,
+    0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84,
+    0x35, 0x7c, 0x3c, 0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70,
+    0xbd, 0x31, 0x49, 0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc,
+    0x8f, 0x53, 0x78, 0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46,
+    0x10, 0xe3, 0x7b, 0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e,
+    0x76, 0x64, 0xd6, 0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4,
+    0xf7, 0x18, 0x34, 0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc,
+    0x5d, 0x70, 0x16, 0x1a, 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd,
+    0xbc, 0x92, 0x9c, 0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b,
+    0x52, 0xb4, 0xb6, 0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a,
+    0x39, 0x5e, 0x5b, 0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b,
+    0x0d, 0x2c, 0xc9, 0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e,
+    0x6e, 0x95, 0x50, 0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b,
+    0x6a, 0x10, 0xae, 0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d,
+    0xcb, 0x4a, 0xbf, 0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64,
+    0x84, 0x76, 0x48, 0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8,
+    0x43, 0x57, 0xc3, 0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57,
+    0x0b, 0xd3, 0x14, 0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f,
+    0xc8, 0xf5, 0x37, 0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24,
+    0x61, 0x0e, 0x37, 0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1,
+    0xe7, 0x66, 0x4d, 0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a,
+    0x4f, 0x71, 0x83, 0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b,
+    0xc5, 0xd7, 0x01, 0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b,
+    0xdb, 0xc9, 0x29, 0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26,
+    0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1,
+    0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80,
+    0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94,
+    0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5,
+    0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2,
+    0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66,
+    0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x75, 0x80,
+};
+static_assert(sizeof(kBytesTestReadSymbol5) == kNumBytesTestReadSymbol5, "");
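+
+// For reference, the array can be decoded back with the entropy decoder under
+// test. The following is a minimal sketch (added commentary), assuming
+// libgav1's EntropyDecoder interface: a (data, size, allow_update_cdf)
+// constructor and a ReadSymbol(cdf, symbol_count) method, with cdf and
+// kSymbols initialized exactly as in the encoder comment above:
+//
+// EntropyDecoder reader(kBytesTestReadSymbol5, kNumBytesTestReadSymbol5,
+//                       /*allow_update_cdf=*/true);
+// for (int i = 0; i < 320; ++i) {
+//   for (int j = 0; j < 10; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       // Each decoded symbol must reproduce the encoded pattern.
+//       assert(reader.ReadSymbol(cdf[k], 5) == kSymbols[j][k]);
+//     }
+//   }
+// }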
+
+// The kBytesTestReadSymbol6[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][7] = {
+//   // pdf: 1/6, 1/6, 1/6, 1/6, 1/6, 1/6
+//   { 32768 - 5461, 32768 - 10923, 32768 - 16384, 32768 - 21845, 32768 - 27307,
+//     0, 0 },
+//   // pdf: 3/12, 2/12, 2/12, 2/12, 2/12, 1/12
+//   { 32768 - 8192, 32768 - 13653, 32768 - 19115, 32768 - 24576, 32768 - 30037,
+//     0, 0 },
+//   // pdf: 1/12, 2/12, 2/12, 2/12, 2/12, 3/12
+//   { 32768 - 2731, 32768 - 8192, 32768 - 13653, 32768 - 19115, 32768 - 24576,
+//     0, 0 },
+//   // pdf: 1/12, 2/12, 3/12, 3/12, 2/12, 1/12
+//   { 32768 - 2731, 32768 - 8192, 32768 - 16384, 32768 - 24576, 32768 - 30037,
+//     0, 0 },
+// };
+// constexpr int kSymbols[12][4] = { { 0, 0, 5, 5 },  //
+//                                   { 0, 1, 4, 4 },  //
+//                                   { 1, 2, 3, 3 },  //
+//                                   { 1, 3, 2, 2 },  //
+//                                   { 2, 4, 1, 1 },  //
+//                                   { 2, 5, 0, 0 },  //
+//                                   { 3, 0, 5, 4 },  //
+//                                   { 3, 1, 4, 3 },  //
+//                                   { 4, 2, 3, 2 },  //
+//                                   { 4, 3, 2, 1 },  //
+//                                   { 5, 4, 1, 3 },  //
+//                                   { 5, 0, 5, 2 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 256; ++i) {
+//   for (int j = 0; j < 12; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 6);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n    ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol6 = 3917;
+constexpr uint8_t kBytesTestReadSymbol6[] = {
+    0x0a, 0x8e, 0xb8, 0x15, 0xd5, 0x69, 0x63, 0x06, 0x48, 0x75, 0xf4, 0x4c,
+    0xfa, 0x13, 0xba, 0x68, 0x61, 0xa6, 0x9f, 0x39, 0x63, 0xba, 0x63, 0x26,
+    0xa8, 0xaa, 0xd0, 0x10, 0x4a, 0x05, 0xaf, 0x5f, 0x65, 0x57, 0x2f, 0x68,
+    0x48, 0x2c, 0x64, 0xdf, 0x0a, 0x93, 0xcc, 0x84, 0x43, 0x97, 0x34, 0x79,
+    0x10, 0x05, 0x4d, 0x58, 0xe9, 0xc3, 0xb4, 0x4a, 0x70, 0xd4, 0x81, 0x71,
+    0x9f, 0x6b, 0x18, 0xb3, 0x72, 0xdf, 0x37, 0x87, 0x3e, 0x40, 0xd0, 0xff,
+    0x10, 0x32, 0x22, 0xe4, 0x36, 0xef, 0xa2, 0x5e, 0x39, 0x5d, 0x42, 0x59,
+    0x8c, 0x3f, 0x1b, 0x41, 0xdb, 0xc2, 0x8c, 0x64, 0xaf, 0xd2, 0x49, 0x45,
+    0xd8, 0xad, 0x85, 0x3b, 0x70, 0x13, 0x83, 0x63, 0x49, 0x86, 0x35, 0xfe,
+    0x93, 0x6b, 0x51, 0x0e, 0x32, 0x3d, 0xf0, 0x30, 0xe0, 0xf5, 0x42, 0x59,
+    0x33, 0x8e, 0x63, 0x62, 0x46, 0x00, 0x69, 0x06, 0x52, 0x83, 0x37, 0x0b,
+    0x37, 0x12, 0x38, 0x3b, 0x9c, 0xc3, 0x00, 0xed, 0x0a, 0xd4, 0xed, 0x69,
+    0x01, 0xc5, 0x3a, 0x14, 0x29, 0xaf, 0x3e, 0x9c, 0x0a, 0xaf, 0x56, 0x50,
+    0x56, 0xcd, 0xa1, 0xb0, 0x88, 0xef, 0xa7, 0x57, 0xe6, 0xe8, 0x2c, 0x42,
+    0x60, 0x55, 0x22, 0x1f, 0xcc, 0x50, 0xa9, 0xda, 0xc2, 0x73, 0x19, 0x2e,
+    0xfb, 0x74, 0x88, 0x42, 0x0d, 0x49, 0x12, 0x5e, 0x36, 0x43, 0xe7, 0x33,
+    0x00, 0x7d, 0xd5, 0x35, 0xa3, 0xaf, 0x1e, 0x93, 0x5e, 0xe6, 0xae, 0x23,
+    0x41, 0x55, 0x05, 0x19, 0xde, 0xa7, 0xf1, 0x07, 0xbd, 0x58, 0xc1, 0x10,
+    0x0a, 0x4b, 0x5c, 0xee, 0xe3, 0xfb, 0xe5, 0xf5, 0xfc, 0x1a, 0x4e, 0x51,
+    0xda, 0x3e, 0xc5, 0x36, 0xda, 0x3e, 0x83, 0xfd, 0x6b, 0x6f, 0x54, 0xdb,
+    0x68, 0x5a, 0x9c, 0x46, 0xbf, 0x86, 0x23, 0xf1, 0xbd, 0xe1, 0x79, 0x5e,
+    0xf7, 0x1c, 0xe0, 0xf7, 0xa6, 0xd5, 0x9f, 0x0b, 0x74, 0xd8, 0xf2, 0x0a,
+    0x97, 0x71, 0xa2, 0xd2, 0x37, 0x05, 0x7e, 0x3e, 0xa4, 0xec, 0x16, 0x92,
+    0x37, 0xdd, 0x45, 0x0c, 0x17, 0x42, 0xf0, 0x34, 0xf7, 0x38, 0x04, 0xdf,
+    0xb8, 0xb4, 0xd6, 0xa0, 0x2c, 0x56, 0x96, 0x10, 0x30, 0x34, 0x10, 0x39,
+    0x9e, 0x95, 0x3b, 0x13, 0xf3, 0x60, 0xa1, 0x48, 0xca, 0x9f, 0x91, 0xfe,
+    0x42, 0xfb, 0xdf, 0x37, 0xf8, 0x5d, 0x49, 0x82, 0x42, 0x4f, 0x90, 0xdf,
+    0xae, 0x32, 0x20, 0x9e, 0xb6, 0xcc, 0xa0, 0x30, 0x07, 0x15, 0x64, 0xb8,
+    0x56, 0x84, 0x1e, 0x16, 0xa3, 0x35, 0xad, 0x14, 0x9d, 0x62, 0x65, 0x0c,
+    0x77, 0x82, 0x74, 0x41, 0x9c, 0x68, 0x95, 0x03, 0x4f, 0xfc, 0x1c, 0xc7,
+    0xd6, 0xe6, 0xe7, 0xb3, 0x54, 0x66, 0x87, 0xb6, 0x41, 0x03, 0xe2, 0x20,
+    0xf7, 0xdb, 0x2a, 0x0a, 0x25, 0x20, 0x60, 0xdf, 0xfd, 0x9f, 0x5f, 0x2c,
+    0x72, 0x5f, 0x2b, 0xf4, 0x07, 0x9f, 0xf3, 0x8a, 0xde, 0xf0, 0x4f, 0x8a,
+    0xa7, 0x75, 0xe3, 0xe8, 0xc9, 0xa1, 0xa0, 0x01, 0xa1, 0x20, 0xc8, 0xfb,
+    0xf9, 0x91, 0xd2, 0x23, 0x4f, 0x6c, 0x53, 0x3b, 0x12, 0x01, 0xac, 0x1f,
+    0x89, 0x84, 0x98, 0xcd, 0x3c, 0x74, 0x51, 0x92, 0xbe, 0x87, 0x06, 0x62,
+    0x49, 0xd2, 0x1b, 0x27, 0xfa, 0x28, 0xf8, 0xbd, 0xbb, 0x7a, 0x7d, 0xde,
+    0xa2, 0x9c, 0x1b, 0x7c, 0x80, 0xe8, 0xe0, 0x43, 0x64, 0xdd, 0x22, 0x7e,
+    0x2c, 0xe4, 0x79, 0x2e, 0xbd, 0x98, 0x1a, 0x59, 0x7e, 0xbe, 0xfd, 0x9e,
+    0x0c, 0x31, 0x50, 0x10, 0xdd, 0x62, 0x3c, 0x47, 0x9a, 0x11, 0x1b, 0x48,
+    0xf3, 0xd1, 0x2c, 0x1b, 0xc2, 0xb5, 0x57, 0x7c, 0xe5, 0x97, 0x6d, 0x78,
+    0xe7, 0xa2, 0xd6, 0x57, 0x61, 0x95, 0xed, 0x8d, 0xda, 0xc6, 0xdf, 0x2c,
+    0x1d, 0x48, 0xee, 0x53, 0xd8, 0x1e, 0x80, 0x41, 0xce, 0x58, 0x08, 0x96,
+    0x6f, 0x82, 0x6e, 0x28, 0x6a, 0x5a, 0x2b, 0x4f, 0x02, 0x4d, 0x99, 0x32,
+    0xea, 0x60, 0xce, 0x75, 0x57, 0x0c, 0x63, 0xf0, 0xda, 0x51, 0x1d, 0xcc,
+    0xb8, 0x21, 0x35, 0x10, 0x56, 0xaf, 0x80, 0xb3, 0x0f, 0x17, 0x29, 0x0c,
+    0x16, 0x07, 0x66, 0xe9, 0xcb, 0x52, 0xcd, 0xec, 0xb1, 0x79, 0xf8, 0xb9,
+    0x05, 0x08, 0xa1, 0xd7, 0x03, 0x6f, 0x8e, 0x9a, 0x6e, 0xfb, 0x38, 0x3a,
+    0xff, 0xa7, 0xa1, 0xd8, 0xb1, 0x56, 0x06, 0xde, 0xb1, 0xe7, 0x47, 0xc2,
+    0xc2, 0xab, 0xa9, 0x5f, 0x01, 0x65, 0x5d, 0x4c, 0xac, 0xd8, 0x1c, 0xfd,
+    0x2d, 0x55, 0x74, 0x8a, 0x2b, 0x41, 0x2d, 0x50, 0x0c, 0x9c, 0x64, 0xb2,
+    0xed, 0xaf, 0x2a, 0xb4, 0x58, 0x93, 0xd8, 0xc2, 0xab, 0x04, 0x45, 0xfc,
+    0xd7, 0x02, 0x1e, 0x14, 0xd4, 0x38, 0xba, 0x24, 0x07, 0x9a, 0x25, 0x52,
+    0x13, 0xe1, 0xe4, 0x26, 0x66, 0x12, 0xba, 0x13, 0x11, 0x25, 0xea, 0x29,
+    0xc5, 0xff, 0x34, 0xca, 0x18, 0x34, 0x97, 0x4a, 0x92, 0x00, 0xe8, 0x61,
+    0x18, 0x85, 0x0b, 0x56, 0x83, 0x48, 0xf9, 0xdb, 0x26, 0x7b, 0x54, 0xc8,
+    0xd2, 0x63, 0x1e, 0x7b, 0x25, 0x3c, 0x4a, 0xa6, 0xda, 0x10, 0x92, 0xca,
+    0x8a, 0x2c, 0x89, 0x60, 0x8e, 0xda, 0xf2, 0xab, 0x45, 0x89, 0x3d, 0x8c,
+    0x2d, 0x35, 0xda, 0xc1, 0x7c, 0x3d, 0x05, 0x8e, 0xad, 0x5b, 0xff, 0x7d,
+    0x46, 0x7b, 0x74, 0x71, 0xec, 0x05, 0x9a, 0x85, 0xa4, 0x4f, 0xc3, 0x54,
+    0x64, 0x90, 0xe5, 0x97, 0x89, 0x1a, 0xb0, 0x56, 0x30, 0x13, 0xda, 0x44,
+    0x2c, 0xb0, 0x50, 0x0c, 0x64, 0x43, 0x4a, 0xd2, 0x2a, 0xb4, 0x8f, 0x9d,
+    0xa6, 0xe5, 0x3c, 0x0c, 0x7a, 0x44, 0xb3, 0xeb, 0xa7, 0x92, 0xe5, 0x59,
+    0xa6, 0x43, 0xe9, 0x2b, 0x1f, 0x69, 0x4a, 0xc4, 0x89, 0xe7, 0xe0, 0x04,
+    0x9f, 0x1d, 0x33, 0x61, 0xe8, 0xab, 0x75, 0x8d, 0x30, 0xd6, 0x7c, 0xca,
+    0x02, 0xbe, 0xf9, 0x1d, 0x02, 0x4e, 0x0f, 0x88, 0xc9, 0x3f, 0x54, 0x9d,
+    0x93, 0x0d, 0x44, 0xf8, 0xf6, 0xa7, 0x1a, 0xb6, 0x8b, 0xf5, 0x14, 0xca,
+    0xbd, 0x6c, 0x2d, 0x9e, 0xfa, 0x80, 0x36, 0x53, 0x06, 0xac, 0x39, 0x0f,
+    0x6b, 0xdb, 0x2e, 0xe0, 0x4f, 0xf0, 0xa4, 0x44, 0x5a, 0xbb, 0xaa, 0x72,
+    0x59, 0x3f, 0x58, 0x38, 0xe5, 0x5c, 0x76, 0x31, 0xe6, 0xfe, 0x08, 0x20,
+    0xbe, 0x3f, 0xea, 0x00, 0x0d, 0x34, 0xd9, 0x4d, 0x06, 0x0a, 0xb5, 0x04,
+    0x7b, 0x48, 0x22, 0xa9, 0x94, 0x47, 0x44, 0xfd, 0x65, 0x81, 0x45, 0x56,
+    0x91, 0xf3, 0xb4, 0xdc, 0xa7, 0x6e, 0xb1, 0xa4, 0xc5, 0xd6, 0x81, 0x6a,
+    0x78, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d,
+    0xa7, 0xa7, 0xf2, 0x17, 0x92, 0x06, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12,
+    0x7f, 0x99, 0x40, 0x57, 0xdf, 0x23, 0xa0, 0x49, 0xc1, 0xf1, 0x19, 0x27,
+    0xea, 0x93, 0xb2, 0x61, 0xa8, 0x9f, 0x1e, 0xd4, 0xe3, 0x56, 0xd1, 0x7e,
+    0xa2, 0x99, 0x57, 0xad, 0x85, 0xb3, 0xdf, 0x50, 0x06, 0xca, 0x60, 0xd5,
+    0x87, 0x21, 0xed, 0x7b, 0x65, 0xdc, 0x09, 0xfe, 0x14, 0x88, 0x8b, 0x57,
+    0x75, 0x4e, 0x4b, 0x27, 0xeb, 0x07, 0x1c, 0xab, 0x8e, 0xc6, 0x3c, 0xdf,
+    0xc1, 0x04, 0x17, 0xc7, 0xfd, 0x40, 0x01, 0xa6, 0x9b, 0x29, 0xa0, 0xc1,
+    0x56, 0xa0, 0x8f, 0x69, 0x04, 0x55, 0x32, 0x88, 0xe8, 0x9f, 0x8d, 0x2b,
+    0x48, 0xaa, 0xd2, 0x3e, 0x76, 0x9b, 0x94, 0xed, 0xd6, 0x34, 0x98, 0xba,
+    0x16, 0x3c, 0x29, 0xce, 0x3d, 0x14, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa,
+    0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59,
+    0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5,
+    0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46,
+    0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97,
+    0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04,
+    0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5,
+    0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a,
+    0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77,
+    0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc,
+    0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f,
+    0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4,
+    0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0,
+    0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57,
+    0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca,
+    0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a,
+    0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c,
+    0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84,
+    0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61,
+    0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4,
+    0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e,
+    0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f,
+    0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1,
+    0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c,
+    0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43,
+    0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f,
+    0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1,
+    0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda,
+    0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27,
+    0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c,
+    0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91,
+    0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3,
+    0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28,
+    0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7,
+    0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08,
+    0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c,
+    0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec,
+    0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99,
+    0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1,
+    0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23,
+    0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06,
+    0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42,
+    0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25,
+    0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa,
+    0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7,
+    0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42,
+    0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c,
+    0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8,
+    0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff,
+    0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a,
+    0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b,
+    0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe,
+    0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20,
+    0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55,
+    0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde,
+    0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02,
+    0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6,
+    0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12,
+    0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c,
+    0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c,
+    0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53,
+    0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d,
+    0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb,
+    0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0,
+    0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7,
+    0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54,
+    0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2,
+    0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb,
+    0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d,
+    0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e,
+    0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09,
+    0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb,
+    0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94,
+    0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef,
+    0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99,
+    0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f,
+    0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48,
+    0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0,
+    0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf,
+    0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94,
+    0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95,
+    0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19,
+    0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08,
+    0x34, 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2,
+    0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69,
+    0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd,
+    0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f,
+    0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2,
+    0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18,
+    0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87,
+    0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff,
+    0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42,
+    0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4,
+    0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f,
+    0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9,
+    0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22,
+    0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6,
+    0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50,
+    0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf,
+    0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10,
+    0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, 0x8d, 0xe8, 0x39,
+    0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8,
+    0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32,
+    0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63,
+    0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46,
+    0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d,
+    0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85,
+    0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a,
+    0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55,
+    0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce,
+    0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84,
+    0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19,
+    0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0,
+    0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c, 0xbf, 0xfe,
+    0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94,
+    0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37,
+    0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc,
+    0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41,
+    0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa,
+    0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd,
+    0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05,
+    0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc,
+    0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24,
+    0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78,
+    0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18,
+    0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6,
+    0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb,
+    0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96,
+    0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40,
+    0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e,
+    0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8,
+    0xc3, 0x2e, 0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64,
+    0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97,
+    0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a,
+    0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d,
+    0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12,
+    0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97,
+    0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29,
+    0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde,
+    0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32,
+    0x80, 0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e,
+    0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90,
+    0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41,
+    0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e,
+    0xc3, 0x08, 0x34, 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29,
+    0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b,
+    0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32,
+    0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10,
+    0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84,
+    0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2,
+    0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba,
+    0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e,
+    0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84,
+    0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30,
+    0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e,
+    0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff,
+    0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84,
+    0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, 0x69,
+    0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f,
+    0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2,
+    0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45,
+    0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, 0x8d,
+    0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0,
+    0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e,
+    0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21,
+    0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73,
+    0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0,
+    0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65,
+    0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7,
+    0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c,
+    0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, 0x1a,
+    0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a,
+    0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95,
+    0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab,
+    0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c,
+    0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08,
+    0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32,
+    0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0,
+    0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc,
+    0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29,
+    0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e,
+    0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9,
+    0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83,
+    0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54,
+    0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a,
+    0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a,
+    0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99,
+    0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49,
+    0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1,
+    0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30,
+    0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c,
+    0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6,
+    0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d,
+    0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81,
+    0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c,
+    0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51,
+    0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8,
+    0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f,
+    0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34,
+    0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb,
+    0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24,
+    0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f,
+    0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52,
+    0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc,
+    0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65,
+    0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc,
+    0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21,
+    0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83,
+    0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd,
+    0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53,
+    0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56,
+    0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64,
+    0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20,
+    0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08,
+    0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4,
+    0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75,
+    0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c,
+    0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08,
+    0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61,
+    0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d,
+    0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff,
+    0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09,
+    0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3,
+    0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f,
+    0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4,
+    0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a,
+    0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b,
+    0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40,
+    0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c,
+    0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42,
+    0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7,
+    0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61,
+    0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca,
+    0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f,
+    0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19,
+    0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34,
+    0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14,
+    0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a,
+    0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56,
+    0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39,
+    0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11,
+    0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65,
+    0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41,
+    0x27, 0xf9, 0x94, 0x05, 0xa0,
+};
+static_assert(sizeof(kBytesTestReadSymbol6) == kNumBytesTestReadSymbol6, "");
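+
+// Added note: the long verbatim byte repeats visible in these arrays are
+// expected. With allow_update_cdf set, the CDFs adapt toward the (stationary)
+// symbol statistics and settle into a steady state; from then on, each pass
+// over the kSymbols table drives the arithmetic coder through the same state
+// sequence and emits the same bytes.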
+
+// The kBytesTestReadSymbol7[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][8] = {
+//   // pdf: 1/7, 1/7, 1/7, 1/7, 1/7, 1/7, 1/7
+//   { 32768 - 4681, 32768 - 9362, 32768 - 14043, 32768 - 18725,
+//     32768 - 23406, 32768 - 28087, 0, 0 },
+//   // pdf: 3/14, 2/14, 2/14, 2/14, 2/14, 2/14, 1/14
+//   { 32768 - 7022, 32768 - 11703, 32768 - 16384, 32768 - 21065,
+//     32768 - 25746, 32768 - 30427, 0, 0 },
+//   // pdf: 1/14, 1/14, 2/14, 2/14, 2/14, 3/14, 3/14
+//   { 32768 - 2341, 32768 - 4681, 32768 - 9362, 32768 - 14043,
+//     32768 - 18725, 32768 - 25746, 0, 0 },
+//   // pdf: 1/14, 2/14, 3/14, 3/14, 2/14, 2/14, 1/14
+//   { 32768 - 2341, 32768 - 7022, 32768 - 14043, 32768 - 21065,
+//     32768 - 25746, 32768 - 30427, 0, 0 },
+// };
+// constexpr int kSymbols[14][4] = { { 0, 4, 6, 3 },  //
+//                                   { 1, 5, 5, 2 },  //
+//                                   { 2, 6, 4, 1 },  //
+//                                   { 3, 0, 3, 0 },  //
+//                                   { 4, 1, 2, 6 },  //
+//                                   { 5, 2, 1, 5 },  //
+//                                   { 6, 3, 0, 4 },  //
+//                                   { 0, 0, 6, 5 },  //
+//                                   { 2, 1, 4, 3 },  //
+//                                   { 4, 3, 6, 1 },  //
+//                                   { 6, 5, 2, 4 },  //
+//                                   { 1, 0, 5, 2 },  //
+//                                   { 3, 2, 3, 2 },  //
+//                                   { 5, 4, 5, 3 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 1024; ++i) {
+//   for (int j = 0; j < 14; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 7);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("  constexpr size_t kNumBytesTestReadSymbol7 = %u;\n", bw.pos);
+// printf("  constexpr uint8_t kBytesTestReadSymbol7[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n      ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n  };\n");
+
+constexpr size_t kNumBytesTestReadSymbol7 = 19874;
+constexpr uint8_t kBytesTestReadSymbol7[] = {
+    0x1c, 0x6a, 0xfc, 0x4b, 0xd1, 0xb5, 0x8c, 0x20, 0x72, 0x45, 0x48, 0x21,
+    0x9e, 0x71, 0xe8, 0xc4, 0x91, 0x51, 0xab, 0xfd, 0x9c, 0x61, 0xf7, 0x98,
+    0xd4, 0x87, 0x71, 0xe6, 0x23, 0x37, 0x7e, 0xa3, 0xe0, 0x83, 0x48, 0x2e,
+    0xfe, 0xc3, 0xcb, 0x4f, 0x26, 0x9a, 0xd7, 0xe4, 0xca, 0xf4, 0x94, 0xb7,
+    0xbc, 0x03, 0xc9, 0xc3, 0x5e, 0x7f, 0xef, 0x9b, 0x37, 0xff, 0x8f, 0x62,
+    0xec, 0xb6, 0x09, 0x50, 0xa9, 0xc1, 0x4a, 0x97, 0xf4, 0xe7, 0x08, 0x57,
+    0x87, 0x2d, 0x10, 0xca, 0xbc, 0x93, 0x85, 0xfb, 0xc8, 0xc7, 0x8f, 0xc1,
+    0x4e, 0x1f, 0x50, 0xad, 0xba, 0x09, 0x9c, 0xf8, 0x94, 0x75, 0xdd, 0x2c,
+    0x78, 0x5d, 0xa0, 0x4a, 0xf3, 0x7b, 0xc0, 0xa7, 0x71, 0xa5, 0x20, 0xe6,
+    0xb0, 0xca, 0x09, 0xf2, 0x38, 0xfc, 0x61, 0x49, 0xdc, 0x83, 0x35, 0x1e,
+    0xdd, 0x08, 0xd7, 0xaa, 0x50, 0x0e, 0xc5, 0x57, 0x05, 0x44, 0xd7, 0xdb,
+    0x56, 0x2b, 0x1e, 0xe5, 0x33, 0x08, 0x7c, 0x3d, 0x25, 0x29, 0x05, 0x14,
+    0x3a, 0x93, 0xff, 0xe7, 0x40, 0x25, 0x30, 0x17, 0xc3, 0x50, 0xad, 0xec,
+    0xb3, 0x64, 0x87, 0x35, 0xb2, 0x5a, 0x1e, 0xa9, 0x48, 0xc8, 0x53, 0x30,
+    0xf1, 0x43, 0x6f, 0xe1, 0x2a, 0x8b, 0x81, 0x49, 0xbc, 0xa8, 0x8a, 0x8b,
+    0x2d, 0x1a, 0xc5, 0xcb, 0x47, 0xc1, 0xbc, 0xe0, 0x54, 0x98, 0xcc, 0x82,
+    0xe9, 0xa6, 0x3f, 0x70, 0x55, 0xe3, 0xe0, 0x7d, 0x5f, 0xa9, 0xc4, 0xc1,
+    0x62, 0x04, 0x2d, 0x15, 0xce, 0xab, 0x7c, 0xd9, 0x88, 0xc1, 0x67, 0x88,
+    0x3d, 0x6e, 0x96, 0x03, 0x6f, 0xa7, 0x6a, 0xc2, 0x6f, 0x20, 0x8c, 0xf4,
+    0xfb, 0x96, 0x0c, 0xb7, 0x14, 0xef, 0xa6, 0x83, 0xbd, 0x2b, 0x07, 0x8a,
+    0x2a, 0x66, 0xb8, 0x0d, 0xa8, 0x72, 0x2a, 0x78, 0x90, 0x2a, 0xe4, 0x46,
+    0x71, 0x8c, 0xcb, 0xcb, 0xbd, 0xfb, 0xc7, 0xa8, 0x9e, 0x9b, 0x6e, 0x6d,
+    0x2b, 0xc2, 0x1c, 0xea, 0x16, 0x3a, 0x06, 0xc0, 0xbc, 0xd7, 0x30, 0x8d,
+    0x87, 0x03, 0x04, 0x0d, 0x58, 0x58, 0x7b, 0x40, 0xf5, 0xe5, 0x7a, 0x51,
+    0x80, 0x7a, 0x16, 0xc2, 0xaf, 0x83, 0x43, 0x16, 0xb3, 0x3a, 0x1b, 0x24,
+    0x29, 0x80, 0x60, 0xee, 0x00, 0x91, 0x15, 0xdb, 0x28, 0x0d, 0xc2, 0xfb,
+    0x74, 0x48, 0xd9, 0x54, 0x97, 0x66, 0xa4, 0xba, 0xc8, 0x19, 0xff, 0x25,
+    0xca, 0xdf, 0x09, 0x66, 0xe4, 0xfe, 0xbb, 0x2b, 0x3f, 0x4a, 0x81, 0x5a,
+    0xa6, 0x54, 0x5c, 0xf0, 0xe4, 0x49, 0x38, 0x13, 0xfb, 0xa2, 0xee, 0xf9,
+    0x7d, 0x72, 0xa9, 0x37, 0x12, 0xf4, 0x04, 0x4e, 0x50, 0x19, 0x6f, 0x29,
+    0x9d, 0x0d, 0xe7, 0xc3, 0x6d, 0x65, 0x0b, 0x04, 0x53, 0x57, 0x0c, 0xb5,
+    0x71, 0xb4, 0xd6, 0xb0, 0xaa, 0xed, 0x38, 0x9e, 0x58, 0x55, 0x0d, 0xe4,
+    0xe6, 0x43, 0x16, 0x93, 0x46, 0x73, 0x39, 0x87, 0xaa, 0x69, 0x07, 0x9f,
+    0xd7, 0xb6, 0x77, 0x7d, 0xef, 0xc7, 0x19, 0x5d, 0x4f, 0x60, 0x20, 0x7e,
+    0xf0, 0x34, 0xbe, 0xe4, 0x31, 0xf3, 0x72, 0xe0, 0x89, 0xfb, 0xc8, 0x0a,
+    0xa9, 0xe6, 0x2c, 0x6b, 0xa5, 0xaa, 0xd5, 0x42, 0x69, 0xc0, 0x27, 0x3b,
+    0x17, 0x98, 0x73, 0xa3, 0x66, 0x10, 0xd7, 0xac, 0xf9, 0x7f, 0xb2, 0xf3,
+    0x38, 0x45, 0x23, 0xe2, 0xd4, 0xd2, 0x63, 0x1c, 0x84, 0xde, 0x25, 0xd4,
+    0x3c, 0x76, 0x58, 0x1a, 0xb6, 0x07, 0x22, 0x74, 0xc2, 0xf7, 0x2c, 0xe1,
+    0xc0, 0x51, 0x8c, 0xfa, 0xde, 0x6b, 0x35, 0x8c, 0x0f, 0x45, 0xf8, 0x5e,
+    0x61, 0x2d, 0x4e, 0x90, 0x2d, 0xb7, 0x6c, 0xaf, 0x71, 0x72, 0xdf, 0x68,
+    0xa9, 0xa2, 0x36, 0x79, 0xbd, 0xee, 0x88, 0xb0, 0xc8, 0xc9, 0xa6, 0x7e,
+    0x8e, 0xe8, 0x16, 0xbc, 0xd6, 0x82, 0x54, 0xac, 0x81, 0x42, 0x0f, 0xc9,
+    0x38, 0xd2, 0xe1, 0x17, 0x17, 0x4f, 0xc9, 0x0c, 0x39, 0xc0, 0x70, 0xd8,
+    0xd8, 0x17, 0x37, 0x4a, 0x93, 0x40, 0x83, 0xe3, 0x3f, 0x05, 0x25, 0xab,
+    0x6e, 0x58, 0xc1, 0x30, 0x62, 0x4d, 0xad, 0xcd, 0x1b, 0x7a, 0x4b, 0x08,
+    0xf8, 0x69, 0x85, 0xf1, 0x10, 0x84, 0x22, 0x54, 0x3a, 0x0c, 0x2d, 0x1b,
+    0xcd, 0x2d, 0xed, 0x95, 0x63, 0x1a, 0x9e, 0xbc, 0xb8, 0x76, 0x48, 0x65,
+    0xd1, 0xa6, 0x22, 0x98, 0x3e, 0xda, 0x00, 0x56, 0xf4, 0xd3, 0xc5, 0xb0,
+    0xb3, 0xb0, 0xfa, 0x0c, 0x84, 0x43, 0xfb, 0xa1, 0x1a, 0xba, 0x23, 0xc6,
+    0x72, 0xea, 0x83, 0x96, 0xff, 0xfd, 0x0d, 0xba, 0x40, 0x32, 0x3e, 0x1a,
+    0x61, 0x7b, 0xd5, 0x50, 0xfe, 0x41, 0xc8, 0x67, 0x71, 0xb4, 0xff, 0x24,
+    0xf8, 0x7b, 0xa2, 0x6d, 0x97, 0x84, 0x8e, 0x36, 0x30, 0x05, 0xc3, 0x60,
+    0x3b, 0x1c, 0xee, 0x34, 0x57, 0x05, 0x0f, 0x9e, 0xc2, 0xfd, 0xc8, 0x03,
+    0xab, 0x8a, 0x54, 0xde, 0x6a, 0x22, 0xa5, 0xb7, 0x38, 0xf5, 0x91, 0x08,
+    0xd4, 0xce, 0xe3, 0xa7, 0xb4, 0xcb, 0x58, 0x79, 0xe2, 0x34, 0x79, 0xfa,
+    0xc2, 0x85, 0x01, 0xeb, 0x53, 0xf1, 0xca, 0x5c, 0xa1, 0xfc, 0x35, 0xa2,
+    0x7b, 0x8f, 0x29, 0x1c, 0x67, 0xb0, 0x01, 0x1b, 0x5a, 0xa1, 0xc9, 0x3b,
+    0x2c, 0xc6, 0x35, 0xbb, 0x29, 0x46, 0x13, 0xfa, 0xd9, 0x40, 0x63, 0x3e,
+    0x6c, 0xa2, 0x36, 0x70, 0xe7, 0xc8, 0x76, 0x55, 0x70, 0xd2, 0x3f, 0xd1,
+    0xae, 0x83, 0x9d, 0xb9, 0x60, 0x47, 0x3e, 0x38, 0x0d, 0x08, 0x3f, 0xe0,
+    0x6b, 0x16, 0x7f, 0x7d, 0x7d, 0x40, 0x98, 0x99, 0xc1, 0x27, 0xf2, 0xb5,
+    0xfe, 0x33, 0xce, 0x83, 0x8c, 0x7d, 0xa7, 0xe6, 0xeb, 0x06, 0xdb, 0x4f,
+    0xca, 0x10, 0x82, 0x7b, 0x5e, 0xe8, 0xa9, 0x2e, 0xe0, 0x7a, 0xc2, 0x03,
+    0x75, 0x6e, 0x4e, 0x2b, 0xb6, 0xc3, 0x99, 0xf5, 0x41, 0xe9, 0x75, 0xe5,
+    0xc5, 0xae, 0x4f, 0xa8, 0x57, 0xf5, 0xf5, 0x89, 0x60, 0xae, 0x41, 0x13,
+    0x91, 0x77, 0x84, 0xb6, 0x79, 0xea, 0xcb, 0xeb, 0x8d, 0x05, 0xe2, 0x18,
+    0xfd, 0x36, 0x1f, 0x68, 0x34, 0xd1, 0x3c, 0xc3, 0xe1, 0x87, 0xd3, 0x2a,
+    0xb1, 0xc5, 0xac, 0xe2, 0xc3, 0xaf, 0xd1, 0x53, 0x61, 0x5e, 0xba, 0xcb,
+    0x32, 0xde, 0x97, 0xee, 0x4e, 0x58, 0xda, 0xda, 0x9d, 0x12, 0xe2, 0x75,
+    0x20, 0xd5, 0xb4, 0x64, 0x82, 0x75, 0x3e, 0xee, 0xb9, 0x13, 0x54, 0x54,
+    0x95, 0x36, 0x36, 0xa9, 0x85, 0x34, 0xa2, 0x37, 0xa0, 0x55, 0xe7, 0x1e,
+    0x9e, 0xb8, 0xbf, 0x36, 0x96, 0x1b, 0x1c, 0xa9, 0x16, 0xa9, 0x66, 0xb6,
+    0x30, 0x91, 0xc6, 0xfb, 0x51, 0x30, 0xc8, 0x19, 0x91, 0xca, 0x9e, 0x99,
+    0x88, 0x5a, 0x29, 0xbc, 0x10, 0x8e, 0x21, 0x93, 0x4b, 0xd1, 0x10, 0x10,
+    0x10, 0xca, 0x1a, 0x4d, 0x95, 0xd5, 0x0a, 0x08, 0xe4, 0xbc, 0xbc, 0xd4,
+    0xc4, 0x48, 0xaa, 0xb7, 0x55, 0x88, 0x55, 0x59, 0xfa, 0x05, 0x17, 0xae,
+    0x2f, 0xcd, 0xa5, 0x86, 0xc7, 0x2a, 0x45, 0xaa, 0x59, 0xad, 0x8c, 0x24,
+    0x71, 0xbe, 0xd4, 0x4c, 0x32, 0x06, 0x64, 0x72, 0xa7, 0xa6, 0x62, 0x16,
+    0x8a, 0x6f, 0x04, 0x23, 0x88, 0x64, 0xd2, 0xf4, 0x44, 0x04, 0x04, 0x32,
+    0x86, 0x93, 0x65, 0x75, 0x42, 0x82, 0x39, 0x2f, 0x2f, 0x35, 0x31, 0x12,
+    0x2a, 0xad, 0xd5, 0x62, 0x15, 0x56, 0x7e, 0x81, 0x48, 0x8e, 0xd3, 0x5e,
+    0x73, 0x9d, 0xa3, 0xec, 0xca, 0xdd, 0xbe, 0x89, 0xd7, 0xb8, 0xa3, 0x59,
+    0xeb, 0x97, 0xb3, 0xf2, 0xf1, 0xa6, 0x4b, 0x8e, 0x89, 0xe6, 0xe9, 0x0a,
+    0x84, 0x9b, 0xbf, 0xd3, 0x6b, 0xd5, 0xbf, 0x1e, 0x7f, 0x87, 0x55, 0x76,
+    0x5e, 0xa7, 0xe6, 0x3e, 0xcf, 0x6c, 0x16, 0x5f, 0xf1, 0xf6, 0xf0, 0x3e,
+    0xd4, 0x4f, 0x71, 0xe5, 0x23, 0x8c, 0xf6, 0xa6, 0x11, 0xc3, 0xf8, 0x7b,
+    0xc7, 0xea, 0x1a, 0x6a, 0xc7, 0x13, 0x2e, 0x5a, 0xf6, 0x61, 0x9b, 0x71,
+    0x61, 0x3b, 0x66, 0x37, 0xd4, 0x28, 0xa6, 0xbf, 0xd6, 0xc6, 0x2e, 0x29,
+    0xd6, 0x38, 0xb5, 0x9c, 0x58, 0x75, 0xfa, 0x2a, 0x6c, 0x2f, 0xa3, 0x8b,
+    0x02, 0xbe, 0xdd, 0x38, 0xdb, 0x4f, 0xca, 0x25, 0x43, 0x09, 0x44, 0x79,
+    0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x45, 0xaa, 0x53, 0x29, 0x8e, 0xd7, 0x81,
+    0x74, 0xdd, 0xfa, 0x65, 0x18, 0xd5, 0xc5, 0xae, 0x4f, 0xa8, 0x57, 0xf6,
+    0x04, 0xf5, 0xcd, 0xd8, 0xa0, 0x26, 0xb4, 0x41, 0xe3, 0x02, 0xc9, 0x95,
+    0xfe, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0xe6, 0x35, 0xff, 0x03,
+    0x5f, 0x8c, 0xac, 0x56, 0x1e, 0xec, 0x29, 0xfc, 0x45, 0x97, 0x61, 0x74,
+    0xa6, 0xed, 0x7c, 0x67, 0x7a, 0xf5, 0xdd, 0x80, 0xaf, 0x42, 0x04, 0x7f,
+    0x82, 0x46, 0x15, 0x56, 0xea, 0xb1, 0x0a, 0xab, 0x3f, 0x40, 0xa4, 0x47,
+    0x69, 0xaf, 0x39, 0xce, 0xd1, 0xf6, 0x65, 0x6e, 0xf0, 0x45, 0x5e, 0xfc,
+    0x51, 0xac, 0xf5, 0xcb, 0xd9, 0xf9, 0x78, 0xd3, 0x25, 0xc7, 0x44, 0xf3,
+    0x74, 0x85, 0x42, 0x4d, 0xdf, 0xe9, 0xb5, 0xea, 0xdf, 0x8f, 0x3f, 0xc3,
+    0xaa, 0xbb, 0x2f, 0x53, 0xf3, 0x1f, 0x67, 0xb6, 0x0b, 0x2f, 0xf8, 0xfb,
+    0x78, 0x1f, 0x6a, 0x27, 0xb8, 0xf2, 0x91, 0xc6, 0x7b, 0x53, 0x08, 0xe1,
+    0xfc, 0x3d, 0xe3, 0xf5, 0x0d, 0x35, 0x63, 0x89, 0x97, 0x2d, 0x7b, 0x30,
+    0xcd, 0xb8, 0xb0, 0x9d, 0xb3, 0x1b, 0xea, 0x14, 0x53, 0x5f, 0xeb, 0x63,
+    0x17, 0x14, 0xeb, 0x1c, 0x5a, 0xce, 0x2c, 0x3a, 0xfd, 0x15, 0x36, 0x17,
+    0xd1, 0xc5, 0x81, 0x5f, 0x6e, 0x9c, 0x6d, 0xa7, 0xe5, 0x12, 0xa1, 0x84,
+    0xa2, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x22, 0xd5, 0x29, 0x94, 0xc7,
+    0x6b, 0xc0, 0xba, 0x6e, 0xfd, 0x32, 0x8c, 0x6a, 0xe2, 0xd7, 0x27, 0xd4,
+    0x2b, 0xfb, 0x02, 0x7a, 0xe6, 0xec, 0x50, 0x13, 0x5a, 0x20, 0xf1, 0x81,
+    0x64, 0xca, 0xff, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x73, 0x1a,
+    0xff, 0x81, 0xaf, 0xc6, 0x56, 0x2b, 0x0f, 0x76, 0x14, 0xfe, 0x22, 0xcb,
+    0xb0, 0xba, 0x53, 0x76, 0xbe, 0x33, 0xbd, 0x7a, 0xee, 0xc0, 0x57, 0xa1,
+    0x02, 0x3f, 0xc1, 0x23, 0x0a, 0xab, 0x75, 0x58, 0x85, 0x55, 0x9f, 0xa0,
+    0x52, 0x23, 0xb4, 0xd7, 0x9c, 0xe7, 0x68, 0xfb, 0x32, 0xb7, 0x78, 0x22,
+    0xaf, 0x7e, 0x28, 0xd6, 0x7a, 0xe5, 0xec, 0xfc, 0xbc, 0x69, 0x92, 0xe3,
+    0xa2, 0x79, 0xba, 0x42, 0xa1, 0x26, 0xef, 0xf4, 0xda, 0xf5, 0x6f, 0xc7,
+    0x9f, 0xe1, 0xd5, 0x5d, 0x97, 0xa9, 0xf9, 0x8f, 0xb3, 0xdb, 0x05, 0x97,
+    0xfc, 0x7d, 0xbc, 0x0f, 0xb5, 0x13, 0xdc, 0x79, 0x48, 0xe3, 0x3d, 0xa9,
+    0x84, 0x70, 0xfe, 0x1e, 0xf1, 0xfa, 0x86, 0x9a, 0xb1, 0xc4, 0xcb, 0x96,
+    0xbd, 0x98, 0x66, 0xdc, 0x58, 0x4e, 0xd9, 0x8d, 0xf5, 0x0a, 0x29, 0xaf,
+    0xf5, 0xb1, 0x8b, 0x8a, 0x75, 0x8e, 0x2d, 0x67, 0x16, 0x1d, 0x7e, 0x8a,
+    0x9b, 0x0b, 0xe8, 0xe2, 0xc0, 0xaf, 0xb7, 0x4e, 0x36, 0xd3, 0xf2, 0x89,
+    0x50, 0xc2, 0x51, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x91, 0x6a, 0x94,
+    0xca, 0x63, 0xb5, 0xe0, 0x5d, 0x37, 0x7e, 0x99, 0x46, 0x35, 0x71, 0x6b,
+    0x93, 0xea, 0x15, 0xfd, 0x81, 0x3d, 0x73, 0x76, 0x28, 0x09, 0xad, 0x10,
+    0x78, 0xc0, 0xb2, 0x65, 0x7f, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70,
+    0x39, 0x8d, 0x7f, 0xc0, 0xd7, 0xe3, 0x2b, 0x15, 0x87, 0xbb, 0x0a, 0x7f,
+    0x11, 0x65, 0xd8, 0x5d, 0x29, 0xbb, 0x5f, 0x19, 0xde, 0xbd, 0x77, 0x60,
+    0x2b, 0xd0, 0x81, 0x1f, 0xe0, 0x91, 0x85, 0x55, 0xba, 0xac, 0x42, 0xaa,
+    0xcf, 0xd0, 0x29, 0x11, 0xda, 0x6b, 0xce, 0x73, 0xb4, 0x7d, 0x99, 0x5b,
+    0xbc, 0x11, 0x57, 0xbf, 0x14, 0x6b, 0x3d, 0x72, 0xf6, 0x7e, 0x5e, 0x34,
+    0xc9, 0x71, 0xd1, 0x3c, 0xdd, 0x21, 0x50, 0x93, 0x77, 0xfa, 0x6d, 0x7a,
+    0xb7, 0xe3, 0xcf, 0xf0, 0xea, 0xae, 0xe7, 0x1d, 0xfb, 0x2a, 0x2f, 0x0e,
+    0xe3, 0xde, 0xf4, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+    0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+    0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+    0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+    0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+    0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+    0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+    0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+    0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+    0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+    0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+    0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+    0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+    0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+    0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+    0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+    0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+    0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+    0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+    0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+    0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+    0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+    0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+    0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+    0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+    0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+    0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+    0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+    0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+    0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+    0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+    0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+    0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+    0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+    0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+    0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+    0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+    0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+    0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+    0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+    0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+    0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+    0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+    0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+    0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+    0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+    0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+    0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+    0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+    0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+    0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+    0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+    0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+    0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+    0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+    0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+    0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+    0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+    0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+    0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+    0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+    0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+    0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+    0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+    0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+    0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+    0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+    0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+    0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+    0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+    0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+    0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+    0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+    0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+    0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+    0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+    0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+    0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+    0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+    0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+    0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+    0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+    0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+    0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+    0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+    0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+    0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+    0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+    0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+    0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+    0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+    0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+    0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+    0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+    0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+    0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+    0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+    0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+    0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+    0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+    0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+    0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+    0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+    0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+    0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+    0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+    0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+    0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+    0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+    0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+    0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+    0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+    0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+    0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+    0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+    0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+    0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+    0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+    0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+    0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+    0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+    0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+    0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+    0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+    0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+    0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+    0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+    0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+    0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+    0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+    0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+    0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+    0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+    0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+    0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+    0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+    0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+    0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+    0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+    0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+    0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+    0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+    0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+    0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+    0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+    0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+    0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+    0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+    0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+    0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+    0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+    0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+    0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+    0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+    0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+    0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+    0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+    0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+    0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+    0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+    0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+    0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+    0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+    0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+    0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+    0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+    0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+    0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+    0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+    0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+    0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+    0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+    0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+    0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+    0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+    0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+    0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+    0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+    0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+    0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+    0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+    0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+    0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+    0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+    0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+    0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+    0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+    0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+    0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+    0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+    0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+    0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+    0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+    0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+    0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+    0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+    0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+    0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+    0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+    0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+    0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+    0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+    0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+    0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+    0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+    0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+    0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+    0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+    0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+    0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+    0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+    0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+    0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+    0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+    0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+    0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+    0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+    0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+    0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+    0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+    0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+    0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+    0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+    0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+    0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+    0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+    0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+    0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+    0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+    0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+    0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+    0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+    0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+    0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+    0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+    0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+    0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+    0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+    0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+    0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+    0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+    0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+    0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+    0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+    0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+    0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+    0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+    0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+    0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+    0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+    0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+    0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+    0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+    0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+    0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+    0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+    0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+    0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+    0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+    0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+    0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+    0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+    0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+    0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+    0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+    0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+    0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+    0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+    0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+    0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+    0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+    0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+    0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+    0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+    0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+    0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+    0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+    0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+    0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+    0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+    0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+    0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+    0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+    0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+    0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+    0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+    0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+    0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+    0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+    0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+    0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+    0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+    0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+    0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+    0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+    0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+    0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+    0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+    0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+    0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+    0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+    0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+    0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+    0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+    0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+    0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+    0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+    0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+    0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+    0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+    0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+    0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+    0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+    0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+    0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+    0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+    0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+    0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+    0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+    0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+    0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+    0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+    0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+    0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+    0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+    0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+    0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+    0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+    0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+    0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+    0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+    0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+    0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+    0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+    0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+    0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+    0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+    0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+    0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+    0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+    0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+    0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+    0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+    0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+    0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+    0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+    0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+    0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+    0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+    0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+    0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+    0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+    0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+    0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+    0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+    0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+    0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+    0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+    0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+    0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+    0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+    0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+    0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+    0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+    0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+    0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+    0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+    0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+    0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+    0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+    0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+    0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+    0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+    0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+    0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+    0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+    0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+    0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+    0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+    0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+    0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+    0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+    0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+    0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+    0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+    0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+    0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+    0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+    0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+    0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+    0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+    0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+    0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+    0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+    0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+    0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+    0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+    0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+    0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+    0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+    0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+    0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+    0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+    0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+    0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+    0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+    0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+    0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+    0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+    0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+    0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+    0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+    0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+    0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+    0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+    0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+    0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+    0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+    0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+    0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+    0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+    0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+    0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+    0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+    0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+    0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+    0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+    0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+    0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+    0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+    0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+    0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+    0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+    0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+    0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+    0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+    0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+    0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+    0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+    0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+    0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+    0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+    0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+    0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+    0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+    0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+    0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+    0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+    0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+    0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+    0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+    0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+    0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+    0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+    0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+    0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+    0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+    0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+    0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+    0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+    0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+    0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+    0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+    0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+    0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+    0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+    0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+    0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+    0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+    0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+    0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+    0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+    0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+    0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+    0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+    0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+    0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+    0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+    0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+    0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+    0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+    0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+    0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+    0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+    0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+    0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+    0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+    0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+    0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+    0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+    0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+    0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+    0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+    0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+    0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+    0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+    0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+    0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+    0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+    0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+    0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+    0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+    0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+    0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+    0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+    0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+    0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+    0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+    0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+    0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+    0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+    0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+    0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+    0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+    0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+    0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+    0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+    0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+    0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+    0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+    0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+    0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+    0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+    0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+    0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+    0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+    0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+    0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+    0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+    0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+    0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+    0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+    0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+    0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+    0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+    0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+    0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+    0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+    0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+    0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+    0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+    0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+    0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+    0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+    0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+    0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+    0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+    0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+    0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+    0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+    0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+    0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+    0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+    0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+    0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+    0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+    0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+    0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+    0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+    0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+    0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+    0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+    0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+    0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+    0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+    0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+    0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+    0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+    0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+    0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+    0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+    0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+    0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+    0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+    0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+    0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+    0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+    0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+    0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+    0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+    0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+    0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+    0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+    0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+    0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+    0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+    0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+    0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+    0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+    0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+    0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+    0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+    0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+    0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+    0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+    0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+    0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+    0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+    0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+    0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+    0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+    0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+    0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+    0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+    0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+    0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+    0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+    0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+    0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+    0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+    0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+    0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+    0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+    0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+    0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+    0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+    0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+    0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+    0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+    0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+    0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+    0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+    0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+    0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+    0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+    0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+    0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+    0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+    0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+    0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+    0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+    0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+    0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+    0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+    0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+    0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+    0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+    0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+    0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+    0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+    0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+    0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+    0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+    0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+    0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+    0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+    0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+    0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+    0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+    0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+    0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+    0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+    0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+    0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+    0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+    0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+    0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+    0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+    0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+    0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+    0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+    0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+    0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+    0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+    0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+    0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+    0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+    0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+    0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+    0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+    0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+    0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+    0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+    0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+    0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+    0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+    0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+    0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+    0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+    0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+    0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+    0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+    0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+    0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+    0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+    0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+    0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+    0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+    0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+    0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+    0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+    0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+    0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+    0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+    0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+    0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+    0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+    0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+    0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+    0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+    0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+    0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+    0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+    0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+    0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+    0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+    0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+    0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+    0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+    0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+    0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+    0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+    0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+    0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+    0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+    0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+    0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+    0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+    0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+    0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+    0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+    0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+    0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+    0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+    0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+    0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+    0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+    0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+    0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+    0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+    0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+    0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+    0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+    0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+    0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+    0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+    0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+    0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+    0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+    0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+    0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+    0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+    0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+    0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+    0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+    0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+    0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+    0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+    0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+    0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+    0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+    0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+    0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+    0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+    0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+    0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+    0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+    0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+    0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+    0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+    0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+    0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+    0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+    0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+    0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+    0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+    0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+    0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+    0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+    0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+    0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+    0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+    0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+    0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+    0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+    0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+    0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+    0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+    0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+    0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+    0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+    0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+    0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+    0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+    0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+    0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+    0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+    0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+    0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+    0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+    0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+    0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+    0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+    0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+    0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+    0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+    0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+    0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+    0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+    0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+    0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+    0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+    0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+    0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+    0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+    0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+    0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+    0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+    0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+    0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+    0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+    0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+    0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+    0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+    0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+    0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+    0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+    0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+    0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+    0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+    0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+    0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+    0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+    0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+    0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+    0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+    0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+    0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+    0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+    0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+    0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+    0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+    0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+    0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+    0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+    0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+    0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+    0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+    0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+    0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+    0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+    0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+    0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+    0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+    0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+    0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+    0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+    0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+    0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+    0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+    0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+    0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+    0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+    0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+    0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+    0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+    0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+    0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+    0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+    0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+    0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+    0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+    0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+    0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+    0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+    0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+    0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+    0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+    0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+    0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+    0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+    0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+    0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+    0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+    0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+    0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+    0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+    0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+    0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+    0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+    0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+    0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+    0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+    0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+    0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+    0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+    0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+    0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+    0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+    0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+    0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+    0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+    0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+    0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+    0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+    0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+    0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+    0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+    0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+    0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+    0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+    0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+    0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+    0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+    0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+    0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+    0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+    0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+    0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+    0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+    0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+    0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+    0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+    0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+    0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+    0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+    0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+    0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+    0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+    0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+    0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+    0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+    0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+    0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+    0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+    0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+    0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+    0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+    0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+    0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+    0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+    0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+    0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+    0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+    0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+    0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+    0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+    0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+    0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+    0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+    0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+    0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+    0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+    0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+    0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+    0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+    0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+    0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+    0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+    0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+    0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+    0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+    0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+    0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+    0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+    0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+    0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+    0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+    0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+    0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+    0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+    0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+    0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+    0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+    0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+    0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+    0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+    0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+    0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+    0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+    0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+    0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+    0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+    0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+    0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+    0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+    0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+    0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+    0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+    0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+    0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+    0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+    0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+    0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+    0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+    0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+    0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+    0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+    0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+    0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+    0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+    0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+    0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+    0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+    0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+    0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+    0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+    0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+    0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+    0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+    0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+    0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+    0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+    0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+    0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+    0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+    0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+    0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+    0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+    0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+    0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+    0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+    0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+    0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+    0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+    0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+    0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+    0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+    0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+    0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+    0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+    0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+    0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+    0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+    0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+    0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+    0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+    0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+    0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+    0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+    0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+    0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+    0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+    0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+    0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+    0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+    0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+    0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+    0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+    0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+    0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+    0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+    0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+    0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+    0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+    0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+    0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+    0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+    0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+    0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+    0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+    0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+    0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+    0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+    0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+    0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+    0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+    0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+    0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+    0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+    0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+    0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+    0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+    0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+    0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+    0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+    0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+    0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+    0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+    0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+    0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+    0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+    0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+    0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+    0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+    0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+    0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+    0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+    0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+    0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+    0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+    0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+    0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+    0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+    0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+    0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+    0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+    0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+    0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+    0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+    0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+    0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+    0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+    0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+    0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+    0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+    0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+    0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+    0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+    0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+    0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+    0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+    0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+    0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+    0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+    0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+    0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+    0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+    0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+    0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+    0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+    0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+    0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+    0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+    0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+    0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+    0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+    0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+    0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+    0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+    0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+    0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+    0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+    0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+    0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+    0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+    0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+    0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+    0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+    0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+    0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+    0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+    0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+    0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+    0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+    0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+    0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+    0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+    0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+    0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+    0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+    0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+    0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+    0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+    0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+    0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+    0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+    0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+    0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+    0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+    0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+    0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+    0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+    0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+    0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+    0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+    0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+    0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+    0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+    0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+    0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+    0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+    0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+    0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+    0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+    0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+    0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+    0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+    0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+    0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+    0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+    0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+    0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+    0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+    0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+    0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+    0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+    0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+    0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+    0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+    0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+    0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+    0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+    0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+    0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+    0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+    0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+    0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+    0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+    0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+    0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+    0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+    0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+    0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+    0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+    0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+    0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+    0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+    0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+    0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+    0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+    0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+    0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+    0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+    0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+    0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+    0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+    0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+    0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+    0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+    0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+    0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+    0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+    0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+    0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+    0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+    0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+    0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+    0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+    0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+    0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+    0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+    0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+    0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+    0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+    0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+    0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+    0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+    0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+    0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+    0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+    0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+    0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+    0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+    0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+    0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+    0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+    0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+    0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+    0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+    0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+    0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+    0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+    0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+    0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+    0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+    0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+    0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+    0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+    0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+    0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+    0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+    0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+    0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+    0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+    0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+    0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+    0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+    0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+    0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+    0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+    0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+    0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+    0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+    0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+    0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+    0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+    0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+    0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+    0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+    0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+    0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+    0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+    0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+    0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+    0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+    0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+    0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+    0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+    0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+    0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+    0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+    0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+    0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+    0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+    0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+    0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+    0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+    0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+    0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+    0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+    0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+    0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+    0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+    0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+    0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+    0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+    0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+    0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+    0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+    0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+    0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+    0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+    0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+    0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+    0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+    0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+    0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+    0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+    0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+    0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+    0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+    0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+    0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+    0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+    0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+    0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+    0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+    0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+    0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+    0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+    0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+    0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+    0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+    0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+    0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+    0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+    0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+    0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+    0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+    0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+    0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+    0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+    0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+    0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+    0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+    0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+    0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+    0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+    0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+    0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+    0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+    0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+    0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+    0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+    0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+    0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+    0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+    0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+    0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+    0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+    0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+    0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+    0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+    0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+    0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+    0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+    0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+    0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+    0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+    0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+    0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+    0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+    0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+    0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+    0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+    0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+    0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+    0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+    0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+    0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+    0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+    0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+    0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+    0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+    0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+    0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+    0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+    0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+    0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+    0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+    0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+    0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+    0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+    0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+    0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+    0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+    0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+    0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+    0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+    0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+    0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+    0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+    0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+    0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+    0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+    0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+    0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+    0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+    0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+    0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+    0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+    0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+    0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+    0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+    0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+    0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+    0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+    0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+    0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+    0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+    0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+    0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+    0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+    0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+    0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+    0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+    0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+    0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+    0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+    0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+    0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+    0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+    0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+    0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+    0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+    0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+    0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+    0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+    0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+    0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+    0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+    0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+    0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+    0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+    0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+    0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+    0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+    0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+    0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+    0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+    0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+    0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+    0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+    0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+    0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+    0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+    0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+    0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+    0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+    0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+    0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+    0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+    0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+    0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+    0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+    0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+    0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+    0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+    0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+    0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+    0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+    0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+    0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+    0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+    0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+    0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+    0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+    0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+    0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+    0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+    0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+    0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+    0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+    0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+    0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+    0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+    0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+    0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+    0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+    0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+    0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+    0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+    0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+    0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+    0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+    0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+    0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+    0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+    0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+    0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+    0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+    0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+    0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+    0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+    0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+    0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+    0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+    0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+    0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+    0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+    0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+    0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+    0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+    0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+    0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+    0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+    0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+    0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+    0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+    0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+    0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+    0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+    0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+    0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+    0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+    0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+    0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+    0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+    0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+    0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+    0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+    0x23, 0xb3,
+};
+static_assert(sizeof(kBytesTestReadSymbol7) == kNumBytesTestReadSymbol7, "");
+
+// The kBytesTestReadSymbol8[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][9] = {
+//   // pdf: 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8
+//   { 32768 - 4096, 32768 - 8192, 32768 - 12288, 32768 - 16384,
+//     32768 - 20480, 32768 - 24576, 32768 - 28672, 0, 0 },
+//   // pdf: 3/16, 2/16, 2/16, 2/16, 2/16, 2/16, 2/16, 1/16
+//   { 32768 - 6144, 32768 - 10240, 32768 - 14336, 32768 - 18432,
+//     32768 - 22528, 32768 - 26624, 32768 - 30720, 0, 0 },
+//   // pdf: 1/16, 1/16, 2/16, 2/16, 2/16, 2/16, 3/16, 3/16
+//   { 32768 - 2048, 32768 - 4096, 32768 - 8192, 32768 - 12288,
+//     32768 - 16384, 32768 - 20480, 32768 - 26624, 0, 0 },
+//   // pdf: 1/16, 1/16, 3/16, 3/16, 3/16, 3/16, 1/16, 1/16
+//   { 32768 - 2048, 32768 - 4096, 32768 - 10240, 32768 - 16384,
+//     32768 - 22528, 32768 - 28672, 32768 - 30720, 0, 0 },
+// };
+// constexpr int kSymbols[16][4] = { { 0, 4, 7, 3 },  //
+//                                   { 1, 5, 6, 2 },  //
+//                                   { 2, 6, 5, 1 },  //
+//                                   { 3, 7, 4, 0 },  //
+//                                   { 4, 0, 3, 7 },  //
+//                                   { 5, 1, 2, 6 },  //
+//                                   { 6, 2, 1, 5 },  //
+//                                   { 7, 3, 0, 4 },  //
+//                                   { 0, 0, 6, 5 },  //
+//                                   { 2, 1, 4, 3 },  //
+//                                   { 4, 3, 6, 4 },  //
+//                                   { 6, 5, 2, 2 },  //
+//                                   { 1, 0, 7, 3 },  //
+//                                   { 3, 2, 5, 5 },  //
+//                                   { 5, 4, 7, 2 },  //
+//                                   { 7, 6, 3, 4 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 1024; ++i) {
+//   for (int j = 0; j < 16; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 8);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("  constexpr size_t kNumBytesTestReadSymbol8 = %u;\n", bw.pos);
+// printf("  constexpr uint8_t kBytesTestReadSymbol8[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n      ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n  };\n");
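+//
+// For reference, the following decode-side loop is a sketch only; it is not
+// part of the generator above. It assumes libgav1's EntropyDecoder from
+// src/utils/entropy_decoder.h, whose constructor takes the buffer, its size,
+// and an allow_update_cdf flag, and whose templated ReadSymbol<8>() consumes
+// one 8-ary symbol while updating the passed cdf table in place. Decoding
+// with the same initial cdf[] tables and the same symbol order should
+// reproduce kSymbols exactly, because the decoder mirrors the encoder's CDF
+// adaptation when allow_update_cdf is enabled:
+//
+// // Hypothetical verification loop (cdf[][] re-initialized to the encoder's
+// // starting tables before decoding).
+// EntropyDecoder reader(kBytesTestReadSymbol8, kNumBytesTestReadSymbol8,
+//                       /*allow_update_cdf=*/true);
+// for (int i = 0; i < 1024; ++i) {
+//   for (int j = 0; j < 16; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       EXPECT_EQ(reader.ReadSymbol<8>(cdf[k]), kSymbols[j][k]);
+//     }
+//   }
+// }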
+
+constexpr size_t kNumBytesTestReadSymbol8 = 24195;
+constexpr uint8_t kBytesTestReadSymbol8[] = {
+    0x15, 0x60, 0xa8, 0x52, 0xf4, 0x88, 0xdd, 0x23, 0x40, 0xb1, 0xd6, 0xd2,
+    0xc2, 0xa2, 0x4c, 0x0a, 0x5d, 0xba, 0xfe, 0xd2, 0x36, 0xd9, 0xcd, 0x51,
+    0x10, 0x25, 0x13, 0x29, 0xfa, 0x0d, 0x87, 0xf9, 0xd1, 0x6f, 0xf2, 0x0d,
+    0x3a, 0xbe, 0xd9, 0x83, 0x99, 0xd1, 0xdf, 0x24, 0x70, 0x28, 0xdb, 0x63,
+    0xf6, 0x7c, 0x07, 0x2b, 0x68, 0xa3, 0x7a, 0x85, 0xd1, 0x47, 0xba, 0x59,
+    0x18, 0x7e, 0x64, 0x3b, 0xac, 0xaf, 0xe3, 0x3a, 0x99, 0x82, 0x30, 0x92,
+    0x7a, 0x93, 0x67, 0x9f, 0xac, 0x53, 0xf8, 0xdb, 0x03, 0x71, 0xc7, 0x4a,
+    0xa9, 0xec, 0x10, 0xc9, 0xed, 0x5b, 0xa6, 0xd5, 0xc3, 0xdd, 0x81, 0x8d,
+    0x25, 0xbe, 0x57, 0xcd, 0x01, 0x65, 0x33, 0x6c, 0x12, 0xe1, 0x37, 0x8b,
+    0xf1, 0x08, 0x27, 0x3c, 0x5a, 0x30, 0x9f, 0x2d, 0x41, 0x2e, 0x75, 0x49,
+    0xab, 0xa6, 0xb6, 0x4c, 0xbe, 0xe0, 0xd0, 0x20, 0x74, 0xeb, 0x05, 0x79,
+    0x91, 0x60, 0xfd, 0xb2, 0x39, 0x54, 0xd9, 0x0c, 0x11, 0x04, 0x1f, 0x7b,
+    0x5d, 0x2d, 0xe3, 0x3f, 0x48, 0xe4, 0x56, 0x11, 0x3d, 0x48, 0xdb, 0x5c,
+    0x1c, 0x8b, 0x81, 0xbb, 0x8a, 0x53, 0xb7, 0x48, 0x5b, 0x15, 0x9b, 0x35,
+    0xc1, 0x18, 0x0f, 0xc3, 0x1e, 0x1c, 0x16, 0x7e, 0x0a, 0xbf, 0x16, 0x0a,
+    0xf5, 0x3f, 0xbe, 0x19, 0xc0, 0x0f, 0xa4, 0x59, 0xae, 0x0a, 0xcf, 0xf4,
+    0x00, 0xb2, 0xff, 0x3a, 0xd8, 0x7f, 0x6c, 0xcf, 0x4f, 0xca, 0xa1, 0x40,
+    0x47, 0x8e, 0xd0, 0x44, 0x49, 0x5a, 0x48, 0xe6, 0x86, 0x80, 0xbb, 0x57,
+    0x36, 0x6e, 0x80, 0xf1, 0xd1, 0xd8, 0xb8, 0xad, 0xb7, 0x6b, 0x11, 0x79,
+    0x02, 0x95, 0x20, 0xcf, 0x6f, 0x21, 0xe6, 0x5c, 0x65, 0x69, 0x4a, 0xf2,
+    0x6f, 0x87, 0x68, 0xf1, 0xda, 0x3b, 0xe1, 0x64, 0x5c, 0xfc, 0x21, 0x02,
+    0x7b, 0xf6, 0x39, 0x77, 0x36, 0x29, 0x3d, 0xda, 0x16, 0x2e, 0xdb, 0x55,
+    0xac, 0x5a, 0x3a, 0x94, 0x9c, 0x79, 0x2c, 0x92, 0xa4, 0xe3, 0xe2, 0x87,
+    0xd8, 0x14, 0x21, 0x76, 0xae, 0xf1, 0x8d, 0x7d, 0xdc, 0xde, 0x46, 0xd9,
+    0xbd, 0xb6, 0x5f, 0xae, 0x77, 0xd0, 0xd7, 0x01, 0xed, 0xbe, 0x5f, 0xee,
+    0x1a, 0x20, 0x0f, 0x88, 0x5c, 0x8a, 0x44, 0xad, 0x8f, 0x8f, 0x66, 0x9d,
+    0x43, 0xf4, 0x41, 0x0a, 0xa1, 0xc8, 0x5c, 0xbc, 0x37, 0xe2, 0xca, 0xd2,
+    0xd8, 0x27, 0x54, 0xdb, 0xdf, 0x7f, 0x0a, 0xd7, 0x65, 0x19, 0x99, 0x1a,
+    0x92, 0x53, 0xdd, 0x1e, 0x5f, 0xad, 0x24, 0x8a, 0x8d, 0x76, 0xc4, 0xf7,
+    0x7e, 0x74, 0xfe, 0x68, 0x99, 0x42, 0xfa, 0xaa, 0x6e, 0xdd, 0x91, 0xd4,
+    0x71, 0x10, 0xb7, 0x45, 0xa8, 0x5f, 0x84, 0x0d, 0xeb, 0x38, 0x3e, 0xaa,
+    0xf1, 0xad, 0x86, 0x8f, 0x1a, 0x3e, 0x9a, 0x29, 0xc7, 0x7b, 0xa7, 0xdf,
+    0x51, 0x3d, 0x49, 0x08, 0x09, 0x69, 0x40, 0x9d, 0x45, 0xb8, 0x55, 0xce,
+    0x96, 0x6c, 0x8b, 0xc6, 0xc9, 0x25, 0x70, 0xc9, 0xb3, 0xa8, 0xa8, 0x08,
+    0x33, 0x7b, 0xca, 0x21, 0x9e, 0x5b, 0xb5, 0x02, 0x7f, 0xa3, 0x34, 0x7c,
+    0x3d, 0xba, 0x91, 0x2e, 0xae, 0xc3, 0x1f, 0x9e, 0xc2, 0x4f, 0xdf, 0xa9,
+    0x39, 0x9b, 0x9d, 0x6e, 0xc7, 0x90, 0xeb, 0x2b, 0xb0, 0x3f, 0xde, 0x37,
+    0xb7, 0x94, 0x3d, 0x4b, 0x2c, 0x42, 0x3f, 0x47, 0xad, 0xc9, 0x23, 0xcb,
+    0x4d, 0xc4, 0xdd, 0x5e, 0x67, 0x11, 0x9d, 0x45, 0xb8, 0x55, 0xce, 0x98,
+    0x05, 0xce, 0x97, 0x99, 0x57, 0x84, 0x8d, 0x79, 0x97, 0x81, 0x4b, 0x8a,
+    0x9c, 0x76, 0x73, 0x9a, 0xf7, 0x59, 0x54, 0x07, 0x6c, 0x11, 0x41, 0x44,
+    0xf0, 0xa6, 0x2a, 0x5e, 0xb1, 0x48, 0x47, 0x39, 0xbb, 0x1b, 0xf0, 0x25,
+    0x07, 0xe7, 0xd2, 0xbb, 0x9b, 0x9b, 0xd7, 0x7e, 0xc8, 0xdd, 0xae, 0xb6,
+    0x23, 0x5e, 0xe0, 0xa5, 0xb0, 0xc6, 0xb6, 0x81, 0xe9, 0x51, 0x20, 0xe9,
+    0x2f, 0x89, 0xcd, 0x13, 0x96, 0x21, 0x19, 0xc5, 0xd1, 0x65, 0x65, 0x88,
+    0xd9, 0x7b, 0x87, 0xdc, 0xfb, 0x38, 0x54, 0x22, 0x27, 0xc4, 0xc4, 0x16,
+    0x56, 0xff, 0x76, 0x69, 0xa6, 0x3b, 0xa0, 0x6d, 0xab, 0xb8, 0xdf, 0xc1,
+    0xc2, 0xff, 0x65, 0x8f, 0x85, 0xbc, 0x69, 0xc0, 0xa5, 0x9a, 0xef, 0xf1,
+    0x37, 0x57, 0x99, 0xc4, 0x67, 0x51, 0x6e, 0xdf, 0x30, 0xa4, 0x86, 0x47,
+    0x34, 0x5f, 0x5e, 0x3c, 0xde, 0x6e, 0x96, 0x74, 0x5c, 0xbd, 0xca, 0xa3,
+    0x50, 0xe4, 0xe8, 0x63, 0xdf, 0xb0, 0xf1, 0xbe, 0xa2, 0x58, 0x23, 0x7a,
+    0x4a, 0x29, 0x62, 0x1f, 0x03, 0xf1, 0xe9, 0x19, 0xdd, 0x68, 0xe8, 0x1a,
+    0x7a, 0x9b, 0x40, 0x0d, 0xb0, 0x15, 0x8b, 0x14, 0x63, 0x08, 0xa4, 0x21,
+    0xa6, 0x0b, 0x34, 0x8a, 0x3e, 0x76, 0x7a, 0xa8, 0x11, 0x81, 0x16, 0x12,
+    0xa5, 0xc6, 0x7a, 0xf1, 0xa0, 0x20, 0xff, 0x33, 0x3b, 0xa5, 0x43, 0xc7,
+    0x42, 0xd3, 0x22, 0x90, 0x16, 0xa2, 0x28, 0x18, 0xa4, 0xc7, 0x24, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+    0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+    0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+    0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+    0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+    0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+    0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+    0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+    0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+    0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+    0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+    0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+    0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+    0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+    0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+    0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+    0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+    0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+    0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+    0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+    0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+    0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+    0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+    0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+    0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+    0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+    0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+    0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0xc0,
+};
+static_assert(sizeof(kBytesTestReadSymbol8) == kNumBytesTestReadSymbol8, "");
+
+// The kBytesTestReadSymbol9[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][10] = {
+//   // pmf: 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9
+//   { 32768 - 3641, 32768 - 7282, 32768 - 10923, 32768 - 14564, 32768 - 18204,
+//     32768 - 21845, 32768 - 25486, 32768 - 29127, 0, 0 },
+//   // pmf: 3/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 1/18
+//   { 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384, 32768 - 20025,
+//     32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0 },
+//   // pmf: 1/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 3/18
+//   { 32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384,
+//     32768 - 20025, 32768 - 23666, 32768 - 27307, 0, 0 },
+//   // pmf: 1/18, 2/18, 2/18, 2/18, 4/18, 2/18, 2/18, 2/18, 1/18
+//   { 32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 20025,
+//     32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0 },
+// };
+// constexpr int kSymbols[18][4] = { { 0, 4, 8, 3 },  //
+//                                   { 1, 5, 7, 2 },  //
+//                                   { 2, 6, 6, 1 },  //
+//                                   { 3, 7, 5, 0 },  //
+//                                   { 4, 8, 4, 8 },  //
+//                                   { 5, 0, 3, 7 },  //
+//                                   { 6, 1, 2, 6 },  //
+//                                   { 7, 2, 1, 5 },  //
+//                                   { 8, 3, 0, 4 },  //
+//                                   { 0, 0, 8, 7 },  //
+//                                   { 2, 1, 6, 5 },  //
+//                                   { 4, 3, 4, 3 },  //
+//                                   { 6, 5, 2, 1 },  //
+//                                   { 8, 7, 7, 6 },  //
+//                                   { 1, 0, 5, 4 },  //
+//                                   { 3, 2, 3, 2 },  //
+//                                   { 5, 4, 1, 4 },  //
+//                                   { 7, 6, 8, 4 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 128; ++i) {
+//   for (int j = 0; j < 18; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 9);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n    ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol9 = 3650;
+constexpr uint8_t kBytesTestReadSymbol9[] = {
+    0x10, 0xe6, 0x62, 0x17, 0x4c, 0x5e, 0xe0, 0x8c, 0x41, 0x75, 0x38, 0xda,
+    0xb6, 0x33, 0xc7, 0x0e, 0x0f, 0x62, 0x87, 0x29, 0xbe, 0x28, 0x8b, 0x81,
+    0x71, 0xab, 0x0d, 0xfe, 0x61, 0xf9, 0x96, 0x85, 0xfe, 0x78, 0x18, 0xe6,
+    0x57, 0xa7, 0xf0, 0xd3, 0xd5, 0x62, 0x37, 0x9a, 0x3d, 0xc4, 0xad, 0x75,
+    0x35, 0xc1, 0xe9, 0x63, 0xeb, 0x9c, 0xd3, 0xf4, 0xdb, 0xc0, 0xf3, 0x67,
+    0x14, 0xbd, 0xde, 0xf7, 0xd1, 0x51, 0xf1, 0x62, 0x28, 0xd5, 0x39, 0x99,
+    0x82, 0x5b, 0x9c, 0x3a, 0x37, 0x85, 0xe7, 0x48, 0x28, 0x02, 0x2d, 0xf1,
+    0x15, 0x55, 0x77, 0x02, 0x2e, 0x62, 0x53, 0xf6, 0x8a, 0x53, 0x44, 0xfa,
+    0xe0, 0xff, 0x05, 0xae, 0xdc, 0x30, 0xee, 0x36, 0x29, 0x80, 0xd5, 0x0a,
+    0xa6, 0x5f, 0x53, 0xa2, 0x31, 0xc0, 0x5b, 0x2a, 0xa5, 0xa5, 0xd2, 0xc0,
+    0x8d, 0x96, 0x66, 0x25, 0x93, 0x9e, 0xdc, 0x0b, 0x2f, 0xea, 0xe2, 0x51,
+    0x0b, 0x12, 0x87, 0x90, 0x79, 0xe7, 0x8e, 0x6f, 0xc6, 0x99, 0x4b, 0x6a,
+    0x50, 0x06, 0xf3, 0x3d, 0xf5, 0x25, 0x72, 0xc5, 0x9e, 0xab, 0x7b, 0x5b,
+    0x15, 0xf5, 0xeb, 0xae, 0x02, 0xe4, 0x90, 0x2b, 0x15, 0x66, 0xf7, 0x50,
+    0xfa, 0x46, 0x74, 0xae, 0xd4, 0x7f, 0xd4, 0x0b, 0xbf, 0xbc, 0x83, 0x60,
+    0x6f, 0x25, 0x87, 0xde, 0xce, 0xb3, 0x86, 0x5a, 0x13, 0x00, 0x31, 0xf2,
+    0x75, 0xca, 0x08, 0x71, 0xd2, 0xf4, 0xa9, 0xf9, 0x40, 0x23, 0xa7, 0x5e,
+    0x50, 0x63, 0x64, 0x1d, 0xa2, 0x50, 0x2f, 0x01, 0x4c, 0x11, 0x8b, 0xcb,
+    0x92, 0x40, 0x9d, 0x94, 0x50, 0x0a, 0xf5, 0x3b, 0xfc, 0x32, 0x1a, 0xbd,
+    0x48, 0x73, 0xe7, 0x93, 0x0f, 0x53, 0xb2, 0x8e, 0xac, 0xef, 0x22, 0x2f,
+    0x3e, 0xb0, 0x81, 0xc0, 0x06, 0x9b, 0x14, 0x5c, 0xa6, 0x16, 0xca, 0xa5,
+    0x79, 0xd2, 0x6a, 0xd3, 0xfe, 0x93, 0x33, 0x2f, 0xdb, 0xcb, 0xca, 0xb3,
+    0x1d, 0xc5, 0x56, 0x65, 0x53, 0x7f, 0xb9, 0x41, 0xe1, 0x54, 0x31, 0xa2,
+    0x8c, 0x92, 0xc8, 0x04, 0xf7, 0x9d, 0x26, 0xad, 0x35, 0x00, 0x5a, 0xb2,
+    0x78, 0x43, 0x14, 0xc2, 0xeb, 0x3a, 0x26, 0x4d, 0x49, 0x5d, 0x33, 0xe4,
+    0xa9, 0xea, 0xd3, 0x67, 0xbf, 0xbc, 0xb6, 0x2e, 0x1c, 0xf7, 0xd0, 0x98,
+    0x13, 0x0d, 0x7c, 0x94, 0x02, 0x28, 0x3e, 0x8a, 0xe5, 0x0c, 0x75, 0x82,
+    0xe5, 0x81, 0x98, 0x87, 0x88, 0x97, 0x86, 0xd6, 0x46, 0x2c, 0x9c, 0x85,
+    0xc2, 0x99, 0xfd, 0x0a, 0x68, 0xbf, 0x67, 0xfc, 0x17, 0xc7, 0x11, 0x54,
+    0xd1, 0x20, 0x9d, 0x83, 0x52, 0x84, 0x5d, 0x4b, 0x62, 0xbf, 0x16, 0x5d,
+    0x8e, 0x72, 0x46, 0xde, 0xb1, 0x77, 0xfb, 0x39, 0x98, 0xf0, 0x4d, 0xa6,
+    0x7a, 0x7d, 0x1c, 0x16, 0xe9, 0x1e, 0x86, 0x7e, 0xf9, 0x22, 0x58, 0x93,
+    0xea, 0x2e, 0x26, 0xc7, 0xfb, 0xd1, 0xb3, 0xc7, 0x99, 0xb1, 0x91, 0x67,
+    0xf1, 0xa3, 0xe0, 0xd2, 0xe8, 0x17, 0x17, 0xd7, 0x0b, 0x7a, 0xd4, 0xed,
+    0x9e, 0x72, 0x4e, 0xa2, 0x37, 0xc9, 0xd2, 0x16, 0x5d, 0x8b, 0xda, 0xdb,
+    0x5c, 0x46, 0x05, 0x3e, 0xf7, 0xc8, 0x3a, 0xd5, 0xaf, 0xd9, 0x72, 0x82,
+    0xbf, 0x96, 0xea, 0x09, 0xd3, 0xd5, 0xfe, 0x43, 0x24, 0xae, 0x95, 0x3d,
+    0x6c, 0x68, 0x54, 0xad, 0xb5, 0xc4, 0x60, 0x54, 0x08, 0x3c, 0x57, 0x61,
+    0xa1, 0x11, 0x21, 0x7f, 0xca, 0x48, 0x59, 0xb4, 0x1c, 0x39, 0x0d, 0xf2,
+    0xdc, 0x62, 0xf0, 0xbb, 0x95, 0x39, 0x51, 0xe9, 0xdb, 0xf1, 0x5d, 0xd1,
+    0x43, 0x83, 0x8a, 0xb1, 0x8d, 0x36, 0x39, 0x83, 0xc6, 0x94, 0x30, 0xbe,
+    0xb6, 0x2f, 0x39, 0x05, 0xad, 0xcd, 0xf9, 0x4c, 0xc2, 0x34, 0xc7, 0x81,
+    0x68, 0xb1, 0x20, 0x1d, 0xea, 0xd3, 0x8c, 0xca, 0xff, 0x4d, 0x94, 0xe1,
+    0x3e, 0xc2, 0x74, 0x90, 0xed, 0x56, 0x3c, 0x1b, 0x5b, 0xf6, 0x40, 0xf9,
+    0x3b, 0x94, 0x94, 0x23, 0xc6, 0x48, 0x6a, 0x59, 0xef, 0x04, 0xb7, 0x9f,
+    0x55, 0x9c, 0x6f, 0x81, 0x73, 0xec, 0x27, 0x49, 0x0e, 0xd5, 0x63, 0xc1,
+    0xb5, 0xbf, 0x64, 0x0f, 0x93, 0xb9, 0x49, 0x42, 0x3c, 0x64, 0x86, 0xa5,
+    0x9e, 0xf0, 0x4b, 0x79, 0xf5, 0x59, 0xc7, 0xc5, 0x01, 0x6f, 0xbd, 0x6a,
+    0x66, 0x93, 0x99, 0x47, 0xb6, 0xf7, 0xfa, 0x21, 0x72, 0x81, 0x71, 0x40,
+    0x36, 0x81, 0xde, 0x5d, 0xdf, 0xdf, 0x30, 0x53, 0x03, 0x70, 0xfb, 0xb2,
+    0x2d, 0x37, 0xeb, 0x19, 0xbc, 0xd2, 0x90, 0x44, 0x25, 0x42, 0x06, 0x30,
+    0xc8, 0xcf, 0x4b, 0x0a, 0x01, 0x13, 0x5e, 0x17, 0x91, 0xc7, 0xcb, 0x79,
+    0xed, 0x06, 0x39, 0xc1, 0x2e, 0x92, 0x29, 0xf5, 0xff, 0x24, 0xe7, 0x2b,
+    0x3f, 0x19, 0x35, 0x6b, 0x3d, 0x69, 0xa2, 0x19, 0x20, 0x53, 0xd4, 0xca,
+    0x08, 0x35, 0x6e, 0xe0, 0x5a, 0x9a, 0x9d, 0x48, 0xf5, 0x20, 0x24, 0x20,
+    0x33, 0x94, 0x6b, 0x33, 0xdd, 0x78, 0xbf, 0x62, 0xf1, 0x43, 0x08, 0x97,
+    0x53, 0x98, 0xe4, 0x17, 0x27, 0xfc, 0xe8, 0xf1, 0xb8, 0x4c, 0xb3, 0x79,
+    0xc8, 0x05, 0x21, 0x1b, 0xe8, 0x56, 0xd2, 0x5f, 0xb6, 0x90, 0x14, 0x0c,
+    0x96, 0x38, 0xc6, 0xc3, 0x6d, 0x10, 0xbf, 0xc6, 0x28, 0xfe, 0x1f, 0x13,
+    0x81, 0x04, 0xeb, 0x37, 0x9c, 0x80, 0x52, 0x47, 0x0f, 0xa0, 0x6e, 0xcd,
+    0x9c, 0x44, 0xdd, 0x61, 0x9c, 0x8f, 0xb2, 0xf5, 0xe0, 0xa0, 0x2b, 0x2f,
+    0xe7, 0x67, 0xd0, 0xd7, 0x29, 0x08, 0x72, 0xee, 0xd5, 0x60, 0xb9, 0xbb,
+    0x1b, 0x12, 0xce, 0x60, 0x98, 0xb9, 0x40, 0xd3, 0xd9, 0x77, 0x5d, 0x6b,
+    0x78, 0xaa, 0x9a, 0x47, 0x2a, 0xf5, 0x38, 0xbb, 0xbe, 0x3a, 0x82, 0x6a,
+    0xbf, 0x8b, 0x67, 0x7e, 0xa4, 0x78, 0xbf, 0xcf, 0x58, 0xce, 0x86, 0x2e,
+    0x34, 0xb7, 0x76, 0x99, 0xa5, 0xf1, 0x0c, 0xa9, 0x1c, 0x9f, 0xad, 0xcb,
+    0xac, 0xf4, 0x03, 0x60, 0xe0, 0x22, 0xfe, 0x02, 0x34, 0x9a, 0x14, 0xb9,
+    0x11, 0xea, 0x4c, 0x3a, 0x59, 0xaa, 0xec, 0x8f, 0x82, 0x49, 0x23, 0xa2,
+    0xd0, 0xf7, 0xc3, 0xf0, 0xaa, 0x2d, 0xb2, 0xb8, 0xce, 0x02, 0x2f, 0xe0,
+    0x23, 0x49, 0xa1, 0x38, 0x12, 0xba, 0xab, 0x9f, 0x60, 0xe4, 0x0d, 0xfa,
+    0x2b, 0xcc, 0xad, 0x6a, 0x06, 0xca, 0x38, 0x82, 0xc5, 0x88, 0x10, 0xb6,
+    0xf5, 0xf6, 0x06, 0x7b, 0x03, 0x9c, 0xe4, 0x89, 0xaf, 0xdb, 0x66, 0x45,
+    0xeb, 0x2c, 0x28, 0xe2, 0x40, 0x08, 0x44, 0xe2, 0x8a, 0x91, 0x19, 0x04,
+    0x29, 0x46, 0xa7, 0xb5, 0x78, 0xae, 0x05, 0xcc, 0x38, 0x9f, 0xd8, 0x58,
+    0xc9, 0x79, 0xf9, 0xad, 0x77, 0x66, 0x49, 0x62, 0xef, 0x13, 0x72, 0xee,
+    0xda, 0x37, 0xb5, 0xd7, 0xf1, 0x51, 0x5d, 0x16, 0x11, 0xf3, 0x91, 0xf2,
+    0x13, 0x49, 0x09, 0x50, 0x15, 0xc6, 0x48, 0xe6, 0xe9, 0x4c, 0xf0, 0x06,
+    0x14, 0x3f, 0xef, 0x46, 0x15, 0xaf, 0x96, 0x0d, 0x17, 0x51, 0x08, 0xf2,
+    0xe1, 0xc9, 0xb9, 0x1d, 0x8d, 0x8f, 0x74, 0x25, 0x04, 0x1f, 0x2c, 0x62,
+    0x67, 0xe4, 0x4b, 0xdc, 0x67, 0x39, 0x2c, 0x7d, 0x3a, 0x1e, 0x6f, 0x5b,
+    0x0b, 0xab, 0x0b, 0x1f, 0x64, 0x37, 0x19, 0x4f, 0x6b, 0x07, 0x05, 0xff,
+    0x6e, 0x89, 0x8f, 0x22, 0x7d, 0x28, 0xd9, 0x3b, 0x9a, 0xe2, 0x3f, 0xff,
+    0xc2, 0xb1, 0xca, 0x05, 0xbc, 0x05, 0xa5, 0xe7, 0x2d, 0x66, 0xf7, 0x37,
+    0x92, 0xd2, 0xb4, 0x35, 0x26, 0x3f, 0x8c, 0x0c, 0x22, 0xa5, 0x5f, 0x5e,
+    0x9c, 0x01, 0x46, 0x91, 0xe7, 0xa2, 0x92, 0x97, 0x0a, 0x19, 0x85, 0x2f,
+    0x54, 0xe3, 0xa8, 0x26, 0xab, 0xe6, 0xb5, 0xd9, 0x71, 0x19, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+    0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+    0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+    0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+    0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+    0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+    0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+    0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+    0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+    0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+    0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+    0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+    0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+    0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+    0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+    0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+    0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+    0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+    0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+    0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+    0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+    0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+    0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+    0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+    0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+    0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+    0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+    0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+    0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+    0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+    0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+    0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+    0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+    0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+    0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+    0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+    0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+    0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+    0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+    0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+    0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+    0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+    0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+    0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+    0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+    0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+    0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+    0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+    0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+    0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+    0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+    0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+    0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+    0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+    0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+    0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+    0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+    0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+    0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+    0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+    0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+    0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+    0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+    0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+    0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+    0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+    0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+    0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+    0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+    0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+    0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+    0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+    0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+    0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+    0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+    0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+    0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+    0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+    0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+    0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+    0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+    0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+    0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+    0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+    0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+    0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+    0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+    0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+    0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+    0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+    0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+    0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+    0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+    0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+    0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+    0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+    0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+    0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+    0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+    0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+    0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+    0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+    0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+    0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+    0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+    0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+    0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+    0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+    0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+    0xc0, 0x98,
+};
+static_assert(sizeof(kBytesTestReadSymbol9) == kNumBytesTestReadSymbol9, "");
+
+// The kBytesTestReadSymbol10[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][11] = {
+//   // pmf: 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10
+//   { 32768 - 3277, 32768 - 6554, 32768 - 9830, 32768 - 13107, 32768 - 16384,
+//     32768 - 19661, 32768 - 22938, 32768 - 26214, 32768 - 29491, 0, 0 },
+//   // pmf: 3/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 1/20
+//   { 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746, 32768 - 18022,
+//     32768 - 21299, 32768 - 24576, 32768 - 27853, 32768 - 31130, 0, 0 },
+//   // pmf: 1/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 3/20
+//   { 32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746,
+//     32768 - 18022, 32768 - 21299, 32768 - 24576, 32768 - 27853, 0, 0 },
+//   // pmf: 1/20, 2/20, 2/20, 2/20, 3/20, 3/20, 2/20, 2/20, 2/20, 1/20
+//   { 32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 16384,
+//     32768 - 21299, 32768 - 24576, 32768 - 27853, 32768 - 31130, 0, 0 },
+// };
+// constexpr int kSymbols[20][4] = { { 0, 5, 9, 4 },  //
+//                                   { 1, 6, 8, 3 },  //
+//                                   { 2, 7, 7, 2 },  //
+//                                   { 3, 8, 6, 1 },  //
+//                                   { 4, 9, 5, 0 },  //
+//                                   { 5, 0, 4, 9 },  //
+//                                   { 6, 1, 3, 8 },  //
+//                                   { 7, 2, 2, 7 },  //
+//                                   { 8, 3, 1, 6 },  //
+//                                   { 9, 4, 0, 5 },  //
+//                                   { 0, 0, 9, 7 },  //
+//                                   { 2, 1, 8, 5 },  //
+//                                   { 4, 3, 6, 3 },  //
+//                                   { 6, 5, 4, 1 },  //
+//                                   { 8, 7, 2, 8 },  //
+//                                   { 1, 0, 9, 6 },  //
+//                                   { 3, 2, 7, 4 },  //
+//                                   { 5, 4, 5, 2 },  //
+//                                   { 7, 6, 3, 5 },  //
+//                                   { 9, 8, 1, 4 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 96; ++i) {
+//   for (int j = 0; j < 20; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 10);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n    ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol10 = 3204;
+constexpr uint8_t kBytesTestReadSymbol10[] = {
+    0x10, 0x84, 0xe2, 0xe0, 0x0f, 0x08, 0xd6, 0x01, 0xd0, 0xaa, 0xd8, 0xb5,
+    0x60, 0x4f, 0xb9, 0xb3, 0x73, 0x01, 0x8c, 0x92, 0xe6, 0xa0, 0xab, 0xe8,
+    0xe4, 0x95, 0x85, 0x03, 0x5f, 0xbb, 0x3b, 0x1f, 0x27, 0xb1, 0x44, 0x95,
+    0x50, 0x1f, 0xad, 0xc8, 0x35, 0xde, 0x44, 0xf3, 0xb6, 0x8d, 0xa2, 0x39,
+    0xc3, 0xb6, 0xee, 0x3c, 0x10, 0x33, 0x27, 0x7a, 0x29, 0xcc, 0x7c, 0x08,
+    0xcb, 0x94, 0xbe, 0xef, 0x96, 0x47, 0x30, 0x49, 0x47, 0x9c, 0xb7, 0x7e,
+    0x23, 0x0c, 0x27, 0x8e, 0x1b, 0xdc, 0x6c, 0x92, 0x40, 0x98, 0xbf, 0x20,
+    0xd4, 0x01, 0x72, 0x55, 0x8c, 0x3f, 0x3c, 0x76, 0x24, 0xd2, 0x2d, 0xba,
+    0xa4, 0x54, 0x29, 0x80, 0xe9, 0x06, 0x2c, 0x68, 0xbd, 0xa7, 0xc5, 0xf7,
+    0x44, 0xdf, 0x7e, 0x94, 0x90, 0x3f, 0x94, 0x7d, 0x9e, 0x36, 0xb8, 0x82,
+    0x1d, 0x4a, 0x47, 0x1f, 0x6c, 0x29, 0x51, 0xd2, 0x84, 0xa8, 0xcd, 0x98,
+    0xc0, 0xd2, 0xea, 0x4a, 0x25, 0x3c, 0xd7, 0x34, 0x64, 0x96, 0xd4, 0x06,
+    0xed, 0x00, 0x98, 0xc3, 0x65, 0x10, 0xd4, 0xac, 0x6b, 0xab, 0xd7, 0x35,
+    0x04, 0x89, 0xbf, 0x24, 0xcc, 0xfc, 0xc9, 0xe8, 0x87, 0x3d, 0xdb, 0x55,
+    0xf0, 0xc9, 0x97, 0x71, 0x99, 0x00, 0x54, 0x50, 0x24, 0x66, 0xca, 0x24,
+    0xfd, 0x1c, 0xb1, 0x71, 0x0e, 0xb5, 0x9c, 0x27, 0xfc, 0x7f, 0x95, 0x98,
+    0xc8, 0x99, 0x9f, 0x9b, 0xc7, 0xf6, 0x69, 0xfa, 0xb2, 0x11, 0x77, 0x8d,
+    0x02, 0x53, 0x32, 0x4e, 0x20, 0x2c, 0x21, 0x2b, 0x99, 0x9a, 0xec, 0x63,
+    0x0b, 0xe2, 0x8f, 0x30, 0xf8, 0x3c, 0xd1, 0xb1, 0xbc, 0x52, 0x73, 0xce,
+    0x85, 0x54, 0xdd, 0xe6, 0xf6, 0x9c, 0x2d, 0xca, 0x3d, 0xa8, 0x09, 0x34,
+    0xa8, 0x41, 0x9c, 0x03, 0x78, 0xbc, 0x67, 0x11, 0x9f, 0xbe, 0xde, 0x9a,
+    0x98, 0x8a, 0x8d, 0x0b, 0x88, 0x7f, 0xea, 0x82, 0x77, 0x61, 0x7a, 0xde,
+    0xb0, 0xb1, 0x46, 0x8d, 0x23, 0x69, 0x2f, 0x17, 0x05, 0xff, 0x4a, 0x9e,
+    0xf9, 0xb3, 0x9a, 0xd0, 0xc4, 0x81, 0xcf, 0xbc, 0xe6, 0x26, 0x2c, 0x37,
+    0x55, 0xec, 0xdc, 0x23, 0x05, 0xdf, 0x30, 0xcf, 0x5a, 0x4a, 0x0c, 0x08,
+    0xc0, 0xd7, 0x9d, 0x80, 0xc0, 0xa3, 0x56, 0x49, 0x41, 0xc4, 0xdd, 0xc5,
+    0x69, 0x5c, 0xe5, 0x6c, 0xc5, 0xae, 0x4c, 0x95, 0x45, 0xf2, 0xf6, 0xd6,
+    0x12, 0x25, 0xcc, 0x24, 0x56, 0x8c, 0x2b, 0x32, 0x51, 0x18, 0x1a, 0xec,
+    0xb0, 0x62, 0x40, 0x82, 0x59, 0xb8, 0x38, 0x9f, 0x9f, 0x73, 0xf5, 0xb3,
+    0xc3, 0x93, 0xa5, 0x4e, 0xab, 0x7f, 0x97, 0x56, 0x51, 0xb0, 0xff, 0x69,
+    0x73, 0xc2, 0xd0, 0x60, 0x93, 0x59, 0x2f, 0xc7, 0x84, 0x14, 0x7e, 0x68,
+    0xa7, 0x2b, 0x37, 0xb4, 0x2e, 0x69, 0x58, 0x55, 0x3c, 0xd2, 0xf1, 0xa8,
+    0x2b, 0x6e, 0xd5, 0x11, 0x1c, 0x1d, 0x17, 0xd5, 0xf1, 0xfa, 0x8b, 0xd1,
+    0x6c, 0xc2, 0x32, 0x9e, 0x66, 0x3e, 0x6a, 0x4a, 0x0e, 0xb8, 0xf9, 0xa8,
+    0x1c, 0x23, 0xb1, 0x7e, 0xe7, 0xa0, 0x27, 0x5b, 0x1e, 0x8f, 0x8a, 0xb1,
+    0x1e, 0x50, 0x99, 0x9c, 0x39, 0x5b, 0xa0, 0x76, 0xa2, 0x90, 0x20, 0xd5,
+    0x61, 0xf8, 0x96, 0x5a, 0xbc, 0x91, 0x5d, 0xfc, 0x1e, 0xed, 0xea, 0xd8,
+    0x10, 0x5d, 0x15, 0xfa, 0x2b, 0xa7, 0x77, 0xaf, 0xae, 0x64, 0xef, 0x06,
+    0xa4, 0xf7, 0x65, 0x58, 0xb8, 0x64, 0x47, 0xcd, 0xfa, 0x12, 0x8e, 0x7d,
+    0x5b, 0x96, 0x27, 0xda, 0xb9, 0x2a, 0x14, 0xfe, 0x3e, 0x57, 0xd7, 0x4e,
+    0x86, 0xb3, 0x36, 0xd7, 0x77, 0x2d, 0xf6, 0x1e, 0xf3, 0xfd, 0xdb, 0x9a,
+    0x92, 0x78, 0x0a, 0xa4, 0x17, 0xf1, 0x78, 0xfc, 0xc3, 0x6d, 0xa0, 0xf8,
+    0x07, 0x6a, 0x68, 0xb1, 0x1b, 0x00, 0x27, 0x65, 0x68, 0x76, 0x10, 0x39,
+    0x4b, 0x8a, 0x51, 0x7a, 0x53, 0x69, 0x79, 0xfc, 0xbc, 0xe6, 0xf4, 0x26,
+    0xc3, 0xbf, 0x3a, 0x64, 0x56, 0x7d, 0x5f, 0x76, 0xa2, 0x42, 0xd1, 0xad,
+    0x3f, 0xb8, 0xce, 0xfb, 0x79, 0x38, 0xf3, 0x85, 0x2a, 0x67, 0xf4, 0x71,
+    0xfe, 0x0b, 0x79, 0xee, 0x85, 0xe0, 0x61, 0x9c, 0x9d, 0xd5, 0xe0, 0x0a,
+    0xd7, 0xa6, 0x21, 0xc3, 0x60, 0xbf, 0xbd, 0x16, 0xca, 0xa0, 0x16, 0x9d,
+    0xc4, 0x14, 0x99, 0x03, 0x7e, 0xe6, 0x62, 0x6e, 0xbe, 0x18, 0x45, 0x5e,
+    0x15, 0x42, 0xac, 0x5b, 0x60, 0x9f, 0xbd, 0x1e, 0x8a, 0x58, 0x55, 0x75,
+    0xcf, 0xbb, 0x12, 0xcb, 0xc2, 0xf4, 0x01, 0xfc, 0x96, 0x8d, 0x97, 0x67,
+    0x94, 0x65, 0x6b, 0xd0, 0xeb, 0xff, 0x26, 0x30, 0x3a, 0xa0, 0xe9, 0x9b,
+    0xa7, 0x5e, 0x81, 0x2b, 0x8e, 0xf7, 0xd6, 0xbf, 0x6f, 0xe4, 0x33, 0xd5,
+    0xaa, 0x5a, 0x27, 0x18, 0x24, 0x76, 0x72, 0x72, 0x50, 0x72, 0x92, 0x88,
+    0x9f, 0x88, 0x81, 0x0f, 0x33, 0xa7, 0x99, 0x83, 0x53, 0x03, 0x8c, 0x2d,
+    0x36, 0x43, 0x52, 0x27, 0x27, 0x74, 0xcd, 0xf1, 0x1b, 0x76, 0x95, 0x11,
+    0xdf, 0x4e, 0xb3, 0xa5, 0x2e, 0xe4, 0xac, 0x3a, 0xfd, 0x9f, 0xab, 0x96,
+    0x7e, 0xb1, 0xf0, 0x19, 0x22, 0xc4, 0x06, 0x9b, 0xe7, 0xe2, 0xf8, 0xb4,
+    0x17, 0xbd, 0x9d, 0x14, 0xac, 0x11, 0xc9, 0x79, 0x8e, 0x01, 0x23, 0xc9,
+    0x6e, 0x5f, 0x96, 0x1e, 0x99, 0xe1, 0x19, 0x2c, 0xb1, 0x1b, 0x54, 0x30,
+    0x3a, 0xb1, 0xe7, 0xbf, 0xbf, 0x17, 0x3d, 0x9b, 0x86, 0xd7, 0x4b, 0x68,
+    0x46, 0xa6, 0xb0, 0x05, 0x66, 0x4b, 0x8a, 0xdc, 0x60, 0x60, 0x29, 0x95,
+    0x35, 0x4b, 0x6f, 0xf5, 0x73, 0x51, 0x52, 0xb6, 0xec, 0xef, 0x74, 0xcb,
+    0x0b, 0x00, 0x04, 0x15, 0xff, 0xb3, 0x13, 0xdd, 0x70, 0x5e, 0x65, 0xfc,
+    0xa6, 0xb1, 0x13, 0x59, 0x29, 0xd0, 0x2e, 0xc4, 0x55, 0xcb, 0x99, 0xac,
+    0xca, 0x48, 0x67, 0x3e, 0xfb, 0xfb, 0x54, 0xb7, 0x53, 0x32, 0xb4, 0x17,
+    0xf6, 0x78, 0xd1, 0x64, 0x67, 0x76, 0x33, 0x3a, 0xe9, 0x13, 0x8c, 0x9c,
+    0xf1, 0x74, 0xb7, 0xd1, 0x35, 0x41, 0xf2, 0x4d, 0x68, 0x53, 0x25, 0x57,
+    0x97, 0x33, 0x18, 0xea, 0x96, 0xea, 0x66, 0x56, 0x82, 0xfe, 0xcf, 0x1a,
+    0x2c, 0x8c, 0xee, 0xc6, 0x67, 0x5d, 0x22, 0x71, 0x93, 0x9e, 0x2e, 0x96,
+    0xfa, 0x26, 0xa8, 0x3e, 0x49, 0xad, 0x0a, 0x64, 0xaa, 0xf2, 0xe6, 0x63,
+    0x1d, 0x52, 0xfb, 0x67, 0x7e, 0x17, 0x91, 0x70, 0xef, 0x48, 0xe1, 0x2e,
+    0x48, 0xe4, 0x8a, 0xc2, 0x4c, 0x5f, 0x77, 0x7f, 0x03, 0x45, 0xf0, 0x8d,
+    0x44, 0xad, 0x1e, 0xef, 0xb5, 0x1f, 0x3c, 0x3c, 0x4e, 0x43, 0x87, 0xdd,
+    0xec, 0xd9, 0x6e, 0xd0, 0xe8, 0x47, 0x75, 0x5b, 0xe5, 0xc0, 0x76, 0xb1,
+    0x9c, 0x5b, 0x72, 0xeb, 0x15, 0x9c, 0x5a, 0xa1, 0x31, 0xc2, 0x46, 0xb4,
+    0xe7, 0x9b, 0x5d, 0x86, 0x23, 0x3f, 0x47, 0xd9, 0x9b, 0x31, 0x4e, 0xa6,
+    0x65, 0xe9, 0x2f, 0xa3, 0xf8, 0x34, 0x68, 0xf7, 0x61, 0xf5, 0x08, 0xc4,
+    0x8a, 0x10, 0xa1, 0x9b, 0xa9, 0x30, 0x25, 0x8d, 0xaf, 0x67, 0x07, 0x8e,
+    0x84, 0x62, 0xa5, 0xc3, 0x2f, 0x5d, 0x06, 0xaa, 0xd4, 0x02, 0x04, 0x77,
+    0xed, 0xf4, 0xe0, 0xa9, 0xca, 0x95, 0xa2, 0x91, 0xe0, 0x56, 0x64, 0xb6,
+    0xb8, 0x39, 0xda, 0x83, 0xc5, 0x10, 0x7e, 0xa6, 0x08, 0x10, 0x01, 0x15,
+    0x2b, 0x6e, 0xce, 0xfe, 0x43, 0x01, 0xa9, 0xcb, 0xfd, 0xd9, 0x1b, 0x7e,
+    0x11, 0x74, 0x96, 0x4a, 0x89, 0x3f, 0x07, 0xac, 0x74, 0xf9, 0x93, 0xb2,
+    0xf6, 0xed, 0xb3, 0x29, 0xab, 0xc5, 0x0a, 0x90, 0xb3, 0x71, 0x51, 0xa5,
+    0xba, 0x16, 0x01, 0xd4, 0x35, 0x11, 0xdc, 0xba, 0x27, 0xc3, 0x01, 0x05,
+    0x65, 0x91, 0x6b, 0xff, 0x33, 0xb9, 0x9d, 0x84, 0xf7, 0xc0, 0x2d, 0x4b,
+    0xf4, 0xb2, 0x39, 0xe4, 0x7d, 0x0f, 0xf6, 0x8d, 0xa4, 0x2c, 0xa2, 0x4d,
+    0x4e, 0x8a, 0x2e, 0xff, 0x84, 0x5f, 0x43, 0x93, 0xa3, 0x43, 0xa2, 0xe3,
+    0x23, 0x92, 0xf3, 0x57, 0xd2, 0x2e, 0x8e, 0xea, 0xff, 0x2c, 0x3d, 0x1f,
+    0xc6, 0x94, 0x77, 0x19, 0xf6, 0xdb, 0x16, 0x4e, 0xd0, 0x3f, 0x32, 0xf3,
+    0x7b, 0x89, 0x50, 0xc5, 0x5c, 0xfe, 0x86, 0xcf, 0xf6, 0x89, 0x88, 0xa3,
+    0xa8, 0xd9, 0x52, 0x23, 0x68, 0x31, 0x90, 0xe2, 0xd4, 0x3a, 0x62, 0xb4,
+    0xe6, 0x4e, 0xfa, 0x20, 0x21, 0xbf, 0xe5, 0x4e, 0x86, 0x6d, 0xbe, 0xbe,
+    0xc6, 0x25, 0x4b, 0xf2, 0x20, 0x6c, 0x4e, 0xfc, 0x93, 0x41, 0x3f, 0x8b,
+    0x29, 0x34, 0xb9, 0xd1, 0x61, 0xe0, 0x34, 0x83, 0x8e, 0x1f, 0x8c, 0x44,
+    0xe2, 0x95, 0x2e, 0x73, 0x48, 0x8f, 0xeb, 0xd0, 0x6c, 0xec, 0xc4, 0xf6,
+    0x48, 0x5e, 0xf7, 0x53, 0x3e, 0xa6, 0x77, 0x33, 0xb0, 0x9e, 0xf8, 0x05,
+    0xa9, 0x7e, 0x96, 0x47, 0x3c, 0x8f, 0xa1, 0xfe, 0xd1, 0xb4, 0x85, 0x94,
+    0x49, 0xa9, 0xd1, 0x45, 0xdf, 0xf0, 0x8b, 0xe8, 0x72, 0x74, 0x68, 0x74,
+    0x5c, 0x67, 0xc2, 0xbb, 0xcd, 0x7b, 0x6a, 0x2f, 0x6b, 0x0a, 0x1d, 0xec,
+    0x03, 0x48, 0xd2, 0x8e, 0xe3, 0x3e, 0xdb, 0x62, 0xc9, 0xda, 0x07, 0xe6,
+    0x5e, 0x6f, 0x71, 0x2a, 0x18, 0xab, 0x9f, 0xd0, 0xd9, 0xfe, 0xd1, 0xac,
+    0xf0, 0x21, 0xab, 0xd9, 0x70, 0x1e, 0xb9, 0x99, 0xa0, 0xcc, 0xeb, 0xe7,
+    0x87, 0xee, 0xd9, 0x8e, 0xd0, 0xe5, 0xc0, 0x58, 0x75, 0x37, 0x3d, 0x03,
+    0x4e, 0x18, 0x08, 0x27, 0xdd, 0x18, 0x38, 0x1b, 0xad, 0xf1, 0xd3, 0xcc,
+    0xa1, 0x65, 0x26, 0x97, 0x3a, 0x2c, 0x3c, 0x06, 0x90, 0x71, 0xc3, 0xf1,
+    0x88, 0x9c, 0x52, 0xa5, 0xce, 0x69, 0x11, 0xfd, 0x7a, 0x0d, 0x9d, 0x98,
+    0x9e, 0xc9, 0x0b, 0xde, 0xea, 0x67, 0xd4, 0xce, 0xe6, 0x76, 0x13, 0xdf,
+    0x00, 0xb5, 0x2f, 0xd2, 0xc8, 0xe7, 0x91, 0xf4, 0x3f, 0xda, 0x36, 0x90,
+    0xb2, 0x89, 0x35, 0x3a, 0x28, 0xbb, 0xfe, 0x11, 0x7d, 0x0e, 0x4e, 0x8d,
+    0x0e, 0x8b, 0x8c, 0xf8, 0x57, 0x79, 0xaf, 0x6d, 0x45, 0xed, 0x61, 0x43,
+    0xbd, 0x80, 0x69, 0x1a, 0x51, 0xdc, 0x67, 0xdb, 0x6c, 0x59, 0x3b, 0x40,
+    0xfc, 0xcb, 0xcd, 0xee, 0x25, 0x43, 0x15, 0x73, 0xfa, 0x1b, 0x3f, 0xda,
+    0x35, 0x9e, 0x04, 0x35, 0x7b, 0x2e, 0x03, 0xd7, 0x33, 0x34, 0x19, 0x9d,
+    0x7c, 0xf0, 0xfd, 0xdb, 0x31, 0xda, 0x1c, 0xb8, 0x0b, 0x0e, 0xa6, 0xe7,
+    0xa0, 0x69, 0xc3, 0x01, 0x04, 0xfb, 0xa3, 0x07, 0x03, 0x75, 0xbe, 0x3a,
+    0x79, 0x94, 0x2c, 0xa4, 0xd2, 0xe7, 0x45, 0x87, 0x80, 0xd2, 0x0e, 0x38,
+    0x7e, 0x31, 0x13, 0x8a, 0x54, 0xb9, 0xcd, 0x22, 0x3f, 0xaf, 0x41, 0xb3,
+    0xb3, 0x13, 0xd9, 0x21, 0x7b, 0xdd, 0x4c, 0xfa, 0x99, 0xdc, 0xce, 0xc2,
+    0x7b, 0xe0, 0x16, 0xa5, 0xfa, 0x59, 0x1c, 0xf2, 0x3e, 0x87, 0xfb, 0x46,
+    0xd2, 0x16, 0x51, 0x26, 0xa7, 0x45, 0x17, 0x7f, 0xc2, 0x2f, 0xa1, 0xc9,
+    0xd1, 0xa1, 0xd1, 0x71, 0x9f, 0x0a, 0xef, 0x35, 0xed, 0xa8, 0xbd, 0xac,
+    0x28, 0x77, 0xb0, 0x0d, 0x23, 0x4a, 0x3b, 0x8c, 0xfb, 0x6d, 0x8b, 0x27,
+    0x68, 0x1f, 0x99, 0x79, 0xbd, 0xc4, 0xa8, 0x62, 0xae, 0x7f, 0x43, 0x67,
+    0xfb, 0x46, 0xb3, 0xc0, 0x86, 0xaf, 0x65, 0xc0, 0x7a, 0xe6, 0x66, 0x83,
+    0x33, 0xaf, 0x9e, 0x1f, 0xbb, 0x66, 0x3b, 0x43, 0x97, 0x01, 0x61, 0xd4,
+    0xdc, 0xf4, 0x0d, 0x38, 0x60, 0x20, 0x9f, 0x74, 0x60, 0xe0, 0x6e, 0xb7,
+    0xc7, 0x4f, 0x32, 0x85, 0x94, 0x9a, 0x5c, 0xe8, 0xb0, 0xf0, 0x1a, 0x41,
+    0xc7, 0x0f, 0xc6, 0x22, 0x71, 0x4a, 0x97, 0x39, 0xa4, 0x47, 0xf5, 0xe8,
+    0x36, 0x76, 0x62, 0x7b, 0x24, 0x2f, 0x7b, 0xa9, 0x9f, 0x53, 0x3b, 0x99,
+    0xd8, 0x4f, 0x7c, 0x02, 0xd4, 0xbf, 0x4b, 0x23, 0x9e, 0x47, 0xd0, 0xff,
+    0x68, 0xda, 0x42, 0xca, 0x24, 0xd4, 0xe8, 0xa2, 0xef, 0xf8, 0x45, 0xf4,
+    0x39, 0x3a, 0x34, 0x3a, 0x2e, 0x33, 0xe1, 0x5d, 0xe6, 0xbd, 0xb5, 0x17,
+    0xb5, 0x85, 0x0e, 0xf6, 0x01, 0xa4, 0x69, 0x47, 0x71, 0x9f, 0x6d, 0xb1,
+    0x64, 0xed, 0x03, 0xf3, 0x2f, 0x37, 0xb8, 0x95, 0x0c, 0x55, 0xcf, 0xe8,
+    0x6c, 0xff, 0x68, 0xd6, 0x78, 0x10, 0xd5, 0xec, 0xb8, 0x0f, 0x5c, 0xcc,
+    0xd0, 0x66, 0x75, 0xf3, 0xc3, 0xf7, 0x6c, 0xc7, 0x68, 0x72, 0xe0, 0x2c,
+    0x3a, 0x9b, 0x9e, 0x81, 0xa7, 0x0c, 0x04, 0x13, 0xee, 0x8c, 0x1c, 0x0d,
+    0xd6, 0xf8, 0xe9, 0xe6, 0x50, 0xb2, 0x93, 0x4b, 0x9d, 0x16, 0x1e, 0x03,
+    0x48, 0x38, 0xe1, 0xf8, 0xc4, 0x4e, 0x29, 0x52, 0xe7, 0x34, 0x88, 0xfe,
+    0xbd, 0x06, 0xce, 0xcc, 0x4f, 0x64, 0x85, 0xef, 0x75, 0x33, 0xea, 0x67,
+    0x73, 0x3b, 0x09, 0xef, 0x80, 0x5a, 0x97, 0xe9, 0x64, 0x73, 0xc8, 0xfa,
+    0x1f, 0xed, 0x1b, 0x48, 0x59, 0x44, 0x9a, 0x9d, 0x14, 0x5d, 0xff, 0x08,
+    0xbe, 0x87, 0x27, 0x46, 0x87, 0x45, 0xc6, 0x7c, 0x2b, 0xbc, 0xd7, 0xb6,
+    0xa2, 0xf6, 0xb0, 0xa1, 0xde, 0xc0, 0x34, 0x8d, 0x28, 0xee, 0x33, 0xed,
+    0xb6, 0x2c, 0x9d, 0xa0, 0x7e, 0x65, 0xe6, 0xf7, 0x12, 0xa1, 0x8a, 0xb9,
+    0xfd, 0x0d, 0x9f, 0xed, 0x1a, 0xcf, 0x02, 0x1a, 0xbd, 0x97, 0x01, 0xeb,
+    0x99, 0x9a, 0x0c, 0xce, 0xbe, 0x78, 0x7e, 0xed, 0x98, 0xed, 0x0e, 0x5c,
+    0x05, 0x87, 0x53, 0x73, 0xd0, 0x34, 0xe1, 0x80, 0x82, 0x7d, 0xd1, 0x83,
+    0x81, 0xba, 0xdf, 0x1d, 0x3c, 0xca, 0x16, 0x52, 0x69, 0x73, 0xa2, 0xc3,
+    0xc0, 0x69, 0x07, 0x1c, 0x3f, 0x18, 0x89, 0xc5, 0x2a, 0x5c, 0xe6, 0x91,
+    0x1f, 0xd7, 0xa0, 0xd9, 0xd9, 0x89, 0xec, 0x90, 0xbd, 0xee, 0xa6, 0x7d,
+    0x4c, 0xee, 0x67, 0x61, 0x3d, 0xf0, 0x0b, 0x52, 0xfd, 0x2c, 0x8e, 0x79,
+    0x1f, 0x43, 0xfd, 0xa3, 0x69, 0x0b, 0x28, 0x93, 0x53, 0xa2, 0x8b, 0xbf,
+    0xe1, 0x17, 0xd0, 0xe4, 0xe8, 0xd0, 0xe8, 0xb8, 0xcf, 0x85, 0x77, 0x9a,
+    0xf6, 0xd4, 0x5e, 0xd6, 0x14, 0x3b, 0xd8, 0x06, 0x91, 0xa5, 0x1d, 0xc6,
+    0x7d, 0xb6, 0xc5, 0x93, 0xb4, 0x0f, 0xcc, 0xbc, 0xde, 0xe2, 0x54, 0x31,
+    0x57, 0x3f, 0xa1, 0xb3, 0xfd, 0xa3, 0x59, 0xe0, 0x43, 0x57, 0xb2, 0xe0,
+    0x3d, 0x73, 0x33, 0x41, 0x99, 0xd7, 0xcf, 0x0f, 0xdd, 0xb3, 0x1d, 0xa1,
+    0xcb, 0x80, 0xb0, 0xea, 0x6e, 0x7a, 0x06, 0x9c, 0x30, 0x10, 0x4f, 0xba,
+    0x30, 0x70, 0x37, 0x5b, 0xe3, 0xa7, 0x99, 0x42, 0xca, 0x4d, 0x2e, 0x74,
+    0x58, 0x78, 0x0d, 0x20, 0xe3, 0x87, 0xe3, 0x11, 0x38, 0xa5, 0x4b, 0x9c,
+    0xd2, 0x23, 0xfa, 0xf4, 0x1b, 0x3b, 0x31, 0x3d, 0x92, 0x17, 0xbd, 0xd4,
+    0xcf, 0xa9, 0x9d, 0xcc, 0xec, 0x27, 0xbe, 0x01, 0x6a, 0x5f, 0xa5, 0x91,
+    0xcf, 0x23, 0xe8, 0x7f, 0xb4, 0x6d, 0x21, 0x65, 0x12, 0x6a, 0x74, 0x51,
+    0x77, 0xfc, 0x22, 0xfa, 0x1c, 0x9d, 0x1a, 0x1d, 0x17, 0x19, 0xf0, 0xae,
+    0xf3, 0x5e, 0xda, 0x8b, 0xda, 0xc2, 0x87, 0x7b, 0x00, 0xd2, 0x34, 0xa3,
+    0xb8, 0xcf, 0xb6, 0xd8, 0xb2, 0x76, 0x81, 0xf9, 0x97, 0x9b, 0xdc, 0x4a,
+    0x86, 0x2a, 0xe7, 0xf4, 0x36, 0x7f, 0xb4, 0x6b, 0x3c, 0x08, 0x6a, 0xf6,
+    0x5c, 0x07, 0xae, 0x66, 0x68, 0x33, 0x3a, 0xf9, 0xe1, 0xfb, 0xb6, 0x63,
+    0xb4, 0x39, 0x70, 0x16, 0x1d, 0x4d, 0xcf, 0x40, 0xd3, 0x86, 0x02, 0x09,
+    0xf7, 0x46, 0x0e, 0x06, 0xda, 0x64, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5,
+    0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9,
+    0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7,
+    0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54,
+    0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66,
+    0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75,
+    0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca,
+    0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49,
+    0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09,
+    0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc,
+    0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66,
+    0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8,
+    0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e,
+    0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b,
+    0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b,
+    0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20,
+    0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8,
+    0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01,
+    0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f,
+    0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed,
+    0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0,
+    0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e,
+    0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65,
+    0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe,
+    0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36,
+    0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5,
+    0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9,
+    0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7,
+    0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54,
+    0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66,
+    0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75,
+    0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca,
+    0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49,
+    0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09,
+    0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc,
+    0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66,
+    0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8,
+    0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e,
+    0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b,
+    0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b,
+    0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20,
+    0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8,
+    0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01,
+    0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f,
+    0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed,
+    0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0,
+    0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e,
+    0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65,
+    0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe,
+    0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36,
+    0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5,
+    0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9,
+    0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7,
+    0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54,
+    0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66,
+    0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75,
+    0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca,
+    0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49,
+    0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09,
+    0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc,
+    0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66,
+    0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8,
+    0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e,
+    0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b,
+    0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b,
+    0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20,
+    0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8,
+    0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01,
+    0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f,
+    0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed,
+    0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0,
+    0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e,
+    0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65,
+    0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe,
+    0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36,
+    0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5,
+    0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9,
+    0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7,
+    0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54,
+    0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66,
+    0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75,
+    0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca,
+    0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49,
+    0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09,
+    0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc,
+    0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66,
+    0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8,
+    0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e,
+    0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b,
+    0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b,
+    0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20,
+    0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8,
+    0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01,
+    0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f,
+    0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1d, 0x80,
+};
+static_assert(sizeof(kBytesTestReadSymbol10) == kNumBytesTestReadSymbol10, "");
+
+// The kBytesTestReadSymbol11[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][12] = {
+//   // pmf: 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11
+//   { 32768 - 2979, 32768 - 5958, 32768 - 8937, 32768 - 11916, 32768 - 14895,
+//     32768 - 17873, 32768 - 20852, 32768 - 23831, 32768 - 26810,
+//     32768 - 29789, 0, 0 },
+//   // pmf: 3/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 1/22
+//   { 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405, 32768 - 16384,
+//     32768 - 19363, 32768 - 22342, 32768 - 25321, 32768 - 28300,
+//     32768 - 31279, 0, 0 },
+//   // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 3/22
+//   { 32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405,
+//     32768 - 16384, 32768 - 19363, 32768 - 22342, 32768 - 25321,
+//     32768 - 28300, 0, 0 },
+//   // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 4/22, 2/22, 2/22, 2/22, 2/22, 1/22
+//   { 32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405,
+//     32768 - 19363, 32768 - 22342, 32768 - 25321, 32768 - 28300,
+//     32768 - 31279, 0, 0 },
+// };
+// constexpr int kSymbols[22][4] = { { 0, 6, 10, 5 },   //
+//                                   { 1, 7, 9, 4 },    //
+//                                   { 2, 8, 8, 3 },    //
+//                                   { 3, 9, 7, 2 },    //
+//                                   { 4, 10, 6, 1 },   //
+//                                   { 5, 0, 5, 0 },    //
+//                                   { 6, 1, 4, 10 },   //
+//                                   { 7, 2, 3, 9 },    //
+//                                   { 8, 3, 2, 8 },    //
+//                                   { 9, 4, 1, 7 },    //
+//                                   { 10, 5, 0, 6 },   //
+//                                   { 0, 0, 10, 9 },   //
+//                                   { 2, 1, 8, 7 },    //
+//                                   { 4, 3, 6, 5 },    //
+//                                   { 6, 5, 4, 3 },    //
+//                                   { 8, 7, 2, 1 },    //
+//                                   { 10, 9, 10, 8 },  //
+//                                   { 1, 0, 9, 6 },    //
+//                                   { 3, 2, 7, 4 },    //
+//                                   { 5, 4, 5, 2 },    //
+//                                   { 7, 6, 3, 5 },    //
+//                                   { 9, 8, 1, 5 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 96; ++i) {
+//   for (int j = 0; j < 22; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 11);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n    ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol11 = 3673;
+constexpr uint8_t kBytesTestReadSymbol11[] = {
+    0x0f, 0xb4, 0x93, 0xdb, 0xbe, 0x10, 0xa5, 0x0b, 0xa6, 0x53, 0x86, 0x25,
+    0xaf, 0x5e, 0xf9, 0xd6, 0x10, 0xd8, 0x5e, 0x2b, 0x6d, 0xf2, 0xf8, 0x35,
+    0x97, 0xf6, 0x95, 0xeb, 0x67, 0x20, 0x49, 0x0e, 0x21, 0xb4, 0x73, 0x5e,
+    0x72, 0x06, 0xdd, 0x76, 0x99, 0x3d, 0x67, 0x37, 0x27, 0xea, 0x21, 0x80,
+    0xc6, 0xb8, 0xf7, 0x48, 0x5e, 0x11, 0xe2, 0xe7, 0x10, 0xad, 0x0b, 0x12,
+    0x52, 0xd4, 0xe3, 0x63, 0x2a, 0x1d, 0x41, 0xf4, 0xce, 0x5d, 0x58, 0x5f,
+    0x79, 0x6d, 0xdd, 0x4b, 0x3d, 0x99, 0xd9, 0x64, 0xdc, 0x08, 0x16, 0x1a,
+    0xf3, 0x8f, 0x1e, 0x33, 0xfe, 0x7a, 0x49, 0xaa, 0x98, 0xb9, 0xe2, 0xc6,
+    0x14, 0xb8, 0x51, 0x1f, 0x45, 0xce, 0xea, 0x97, 0xcd, 0xd0, 0x0b, 0x5d,
+    0x12, 0x31, 0xbe, 0x78, 0x98, 0xa3, 0x77, 0x6a, 0xa0, 0xef, 0x57, 0x3a,
+    0xc6, 0xe7, 0x52, 0x22, 0x06, 0x44, 0x35, 0x8e, 0xc9, 0xe8, 0x4f, 0x76,
+    0xd9, 0x77, 0x8c, 0x80, 0xc9, 0xfc, 0x20, 0x0d, 0xc0, 0x67, 0x95, 0x21,
+    0x93, 0x74, 0x4f, 0xf1, 0xf5, 0xdf, 0x5a, 0x10, 0xde, 0x57, 0xc8, 0x6e,
+    0x33, 0x40, 0xae, 0x36, 0x4a, 0xc8, 0x49, 0xbf, 0x0d, 0x6d, 0x74, 0x34,
+    0xff, 0xdc, 0x1b, 0xe3, 0xcf, 0xcf, 0xe6, 0xd1, 0xfb, 0x4d, 0xd5, 0x0e,
+    0x86, 0x83, 0x21, 0x12, 0xf8, 0x51, 0x2a, 0xc4, 0x87, 0xd8, 0x1b, 0x1d,
+    0xe7, 0x36, 0xb5, 0xc3, 0xf9, 0xf9, 0x8f, 0x0f, 0xc2, 0x21, 0x83, 0x75,
+    0x14, 0x81, 0x17, 0xb1, 0x9b, 0x51, 0x56, 0x1d, 0xa1, 0xaa, 0xff, 0xd4,
+    0x1f, 0xf3, 0x8d, 0xd1, 0x30, 0x53, 0x92, 0x69, 0xce, 0xf0, 0xc5, 0x75,
+    0xcf, 0xd2, 0x6e, 0x37, 0x74, 0x79, 0xc3, 0x50, 0x52, 0x01, 0xc4, 0x0f,
+    0x67, 0xe2, 0xb7, 0xe2, 0xf1, 0xcc, 0xd9, 0x49, 0xc4, 0x58, 0xbd, 0x8d,
+    0x91, 0xb8, 0x35, 0xbd, 0x64, 0x12, 0x24, 0x20, 0x20, 0x29, 0x23, 0x94,
+    0x85, 0xb6, 0xa8, 0x4e, 0xd4, 0x49, 0x09, 0x25, 0xc4, 0xc5, 0xa5, 0x0c,
+    0x76, 0xa9, 0x4a, 0x75, 0x0f, 0xb9, 0x57, 0x33, 0xcd, 0xfd, 0xf8, 0x8f,
+    0xae, 0x43, 0x48, 0xb8, 0xea, 0x87, 0x17, 0x0d, 0x3d, 0x8b, 0x9a, 0x21,
+    0xe8, 0xbf, 0xc8, 0x5e, 0x18, 0x48, 0xa3, 0xcd, 0x08, 0x59, 0x9b, 0xdb,
+    0x79, 0x5c, 0xe9, 0xa3, 0xe6, 0xba, 0x58, 0x53, 0x10, 0x9a, 0x2c, 0x2b,
+    0x10, 0x5b, 0x96, 0x9a, 0x1f, 0x8f, 0xc2, 0x7d, 0xee, 0xe9, 0xc2, 0xbc,
+    0x8f, 0x8b, 0xa7, 0x41, 0xb1, 0x33, 0x58, 0x6e, 0x25, 0x13, 0x3a, 0xd0,
+    0x78, 0x53, 0xda, 0xa2, 0x35, 0x23, 0x89, 0x39, 0xa7, 0xef, 0x94, 0xda,
+    0x2f, 0xc3, 0x17, 0x80, 0x27, 0xc7, 0x0f, 0xda, 0xfb, 0xda, 0x64, 0x3c,
+    0x94, 0x8c, 0x39, 0xd0, 0x06, 0x62, 0x6c, 0x0d, 0x26, 0xba, 0x4f, 0xcb,
+    0x8a, 0xa0, 0xbc, 0xeb, 0x3f, 0x65, 0x51, 0x8e, 0x1d, 0x2e, 0x9e, 0x5f,
+    0xe3, 0x15, 0x0e, 0x58, 0x4f, 0xb7, 0xb6, 0x64, 0x95, 0xe8, 0x0e, 0x00,
+    0x7c, 0x1e, 0xd9, 0xde, 0x35, 0x5a, 0xff, 0xd5, 0xe5, 0xb3, 0x64, 0xcc,
+    0x8b, 0x93, 0xbc, 0x2a, 0x25, 0x7d, 0x50, 0x92, 0x3e, 0x23, 0x4c, 0x07,
+    0x5e, 0xcf, 0xbb, 0x52, 0xd0, 0xc4, 0xd9, 0x77, 0x66, 0x01, 0x57, 0x1f,
+    0xa0, 0x9d, 0xb2, 0x6d, 0x4e, 0x36, 0xc1, 0x9a, 0x70, 0x4e, 0xa3, 0x5f,
+    0xf6, 0xf9, 0x50, 0x08, 0xcd, 0xf9, 0xe5, 0x76, 0x81, 0xea, 0x88, 0x2e,
+    0xf5, 0x2a, 0xd4, 0x31, 0x39, 0x8d, 0xfe, 0x1c, 0x15, 0x1d, 0x41, 0x2b,
+    0x55, 0xc7, 0xe8, 0x27, 0x6f, 0xc3, 0xf0, 0x23, 0x76, 0x9a, 0xb2, 0x87,
+    0x0c, 0x71, 0x3c, 0x73, 0xea, 0x20, 0x93, 0xf4, 0x21, 0x56, 0xfb, 0x8e,
+    0xd7, 0xaf, 0xc3, 0xd4, 0xf4, 0x31, 0x6f, 0xe8, 0x1f, 0x5b, 0x83, 0xa9,
+    0x2b, 0x83, 0x08, 0x2e, 0xa2, 0xf3, 0x6c, 0x06, 0xe5, 0x89, 0x73, 0x73,
+    0x98, 0x0e, 0x57, 0x07, 0x49, 0x68, 0xa4, 0xb2, 0x4a, 0x26, 0xd1, 0x91,
+    0x49, 0x87, 0x05, 0x55, 0xa4, 0x88, 0x7d, 0x3d, 0x57, 0x7c, 0x20, 0x8c,
+    0x2c, 0xea, 0x30, 0x63, 0x3a, 0xe4, 0xab, 0x27, 0x80, 0xab, 0xfb, 0x22,
+    0x8a, 0x0f, 0xe0, 0xe9, 0xc5, 0xd5, 0x4f, 0x8a, 0x2c, 0x28, 0x36, 0x63,
+    0xbd, 0xa3, 0xc4, 0x90, 0xe4, 0x9e, 0x98, 0xca, 0xce, 0xfc, 0x96, 0xb8,
+    0x22, 0x0d, 0x17, 0xc8, 0xad, 0xc7, 0x01, 0x38, 0x6e, 0x95, 0x30, 0x74,
+    0xda, 0xb8, 0xa9, 0xa8, 0xe6, 0xf2, 0x03, 0x41, 0xb2, 0x05, 0x37, 0x04,
+    0x8b, 0x51, 0xf9, 0xeb, 0x97, 0xdf, 0xe9, 0xa8, 0x5f, 0x11, 0x2f, 0x9f,
+    0x4f, 0xbe, 0xc1, 0x53, 0x2c, 0x75, 0x90, 0xca, 0xa3, 0x9b, 0xc1, 0x36,
+    0xa3, 0x03, 0x65, 0xab, 0x57, 0xc4, 0x0e, 0x8a, 0x41, 0xfc, 0x60, 0x65,
+    0x13, 0x87, 0x6d, 0xda, 0x00, 0xad, 0x56, 0x1c, 0x28, 0x7c, 0x4c, 0xa2,
+    0x92, 0xda, 0x23, 0x00, 0xe8, 0x60, 0x20, 0x59, 0x45, 0x4a, 0x26, 0xae,
+    0x22, 0x37, 0x7c, 0x14, 0xce, 0xff, 0x0d, 0xa9, 0xef, 0xfc, 0x93, 0xbd,
+    0xde, 0x2b, 0x0f, 0xc7, 0xc0, 0x8a, 0x90, 0x06, 0xec, 0x53, 0x9f, 0xc8,
+    0x5b, 0x7b, 0xe8, 0x38, 0x22, 0x75, 0xe9, 0x40, 0xbc, 0x62, 0xe9, 0x9d,
+    0x49, 0xab, 0x88, 0x8d, 0xdf, 0x05, 0x33, 0xbf, 0xc3, 0x69, 0x6c, 0x36,
+    0x71, 0x17, 0x70, 0xc1, 0xe0, 0xd1, 0x71, 0xcf, 0xd5, 0x48, 0x83, 0x50,
+    0x74, 0x07, 0xc4, 0xca, 0x29, 0x2d, 0xa2, 0x30, 0x0e, 0x86, 0x02, 0x05,
+    0x94, 0x54, 0xa2, 0x6a, 0xe2, 0x23, 0x77, 0xc1, 0x4c, 0xef, 0xa4, 0x8c,
+    0xbe, 0x6b, 0x0f, 0x7c, 0x05, 0x30, 0x78, 0x34, 0x5c, 0x73, 0xf5, 0x52,
+    0x20, 0xd4, 0x1d, 0x01, 0xca, 0x9f, 0x89, 0x3b, 0x91, 0x1d, 0x1f, 0x27,
+    0xe1, 0xf9, 0xe8, 0xd0, 0xb2, 0x56, 0x32, 0x15, 0x37, 0xa3, 0x08, 0x38,
+    0xb7, 0x57, 0xb4, 0x09, 0xfe, 0xf4, 0x72, 0xe1, 0x8f, 0x4b, 0x6b, 0x00,
+    0x8c, 0xc5, 0x39, 0xd5, 0x45, 0x45, 0xbb, 0xf6, 0xb7, 0x01, 0xde, 0xef,
+    0x8b, 0xaf, 0x85, 0x73, 0xc4, 0x93, 0x3f, 0xbe, 0xf8, 0x69, 0xbd, 0x71,
+    0xa9, 0x65, 0x6f, 0x22, 0xa6, 0xca, 0x36, 0xf0, 0x34, 0x1b, 0x20, 0x24,
+    0x6c, 0xd2, 0xe3, 0xbb, 0xb5, 0x80, 0xfc, 0xc4, 0x90, 0x54, 0x70, 0xab,
+    0xb7, 0xb9, 0xdb, 0xeb, 0x3b, 0x1d, 0x75, 0xc8, 0x82, 0x9a, 0x15, 0x8a,
+    0x88, 0xb0, 0x7a, 0x77, 0xcf, 0xdc, 0x96, 0x22, 0x4d, 0x08, 0x47, 0x9a,
+    0x06, 0x3e, 0x47, 0xb1, 0x54, 0xdf, 0x22, 0x9d, 0x75, 0x8f, 0xdb, 0xc4,
+    0x5a, 0xd0, 0xfe, 0x44, 0xc4, 0xce, 0x9a, 0x57, 0x0b, 0x20, 0x36, 0x07,
+    0xb1, 0xcf, 0xfe, 0xb4, 0x3e, 0x03, 0x1b, 0x5d, 0xac, 0x40, 0x54, 0x88,
+    0x52, 0x2e, 0x81, 0x8f, 0x3c, 0x52, 0x87, 0x68, 0x00, 0xa5, 0x95, 0xbc,
+    0xd9, 0x67, 0x87, 0xa0, 0x75, 0x78, 0xb6, 0xa9, 0xda, 0x76, 0x9d, 0xe4,
+    0x5a, 0x6d, 0xd5, 0x78, 0xcd, 0x7b, 0x26, 0x5f, 0xc0, 0x09, 0xab, 0x25,
+    0x16, 0x38, 0xa1, 0x86, 0xa7, 0x5e, 0x5e, 0x2d, 0x3e, 0x2f, 0x09, 0xdc,
+    0x31, 0x4d, 0x71, 0x2e, 0xec, 0x5f, 0xa0, 0xe0, 0x8f, 0x9c, 0xcd, 0x72,
+    0xc8, 0x05, 0xa3, 0xb0, 0xfc, 0x4c, 0xdb, 0x6b, 0x24, 0xf2, 0x92, 0x6b,
+    0x13, 0x79, 0x1c, 0x36, 0x90, 0x20, 0x71, 0xaa, 0x8c, 0x1c, 0xe4, 0xbf,
+    0x54, 0xf8, 0x48, 0x51, 0xd2, 0x9a, 0x23, 0xa0, 0x55, 0x38, 0x24, 0x17,
+    0x39, 0x89, 0x4f, 0xc9, 0x01, 0x77, 0x05, 0x16, 0x97, 0x3e, 0xac, 0x9f,
+    0xba, 0x4a, 0xb1, 0x7e, 0x47, 0x0d, 0xa4, 0x08, 0x1c, 0x6a, 0xa3, 0x07,
+    0x39, 0x2f, 0xd5, 0x3e, 0x12, 0x14, 0x74, 0xa6, 0x88, 0xe8, 0x15, 0x4e,
+    0x09, 0x05, 0xce, 0x62, 0x53, 0xf2, 0x40, 0x7b, 0x49, 0x58, 0xc8, 0x5d,
+    0x29, 0x54, 0xb1, 0xfd, 0xb0, 0xb2, 0x75, 0x2c, 0x55, 0x9f, 0xf9, 0x57,
+    0x58, 0xec, 0xfb, 0xff, 0xa3, 0xa0, 0x27, 0x02, 0x0e, 0xa7, 0x52, 0xe7,
+    0x9e, 0xbd, 0xb6, 0x1d, 0xe6, 0x7e, 0xa2, 0xc0, 0x95, 0xe1, 0x4d, 0xd5,
+    0x78, 0xce, 0x08, 0x2d, 0xff, 0x0b, 0xe8, 0x34, 0xa7, 0x53, 0x15, 0x67,
+    0xfe, 0x55, 0xd6, 0x3b, 0x3e, 0xff, 0xe8, 0xe8, 0x09, 0xc0, 0x83, 0xa9,
+    0xd4, 0xb9, 0xe7, 0xaf, 0x6d, 0x87, 0x79, 0x9f, 0xa8, 0xb0, 0x25, 0x78,
+    0x92, 0x0e, 0x9d, 0xf7, 0x55, 0xd9, 0x1a, 0xc5, 0x48, 0x6c, 0xbe, 0x66,
+    0xb0, 0xf7, 0xbf, 0x95, 0x75, 0x8e, 0xcf, 0xbf, 0xfa, 0x3a, 0x02, 0x70,
+    0x20, 0xde, 0xb0, 0xe4, 0xe4, 0x0e, 0x59, 0x44, 0x11, 0x28, 0xe1, 0x22,
+    0xe8, 0x0e, 0x5b, 0x62, 0x69, 0x46, 0xb2, 0x1a, 0x9b, 0x63, 0x75, 0x31,
+    0xb9, 0x4a, 0x90, 0x8d, 0x2e, 0xf8, 0xa8, 0xdb, 0x5a, 0x31, 0xcf, 0x9c,
+    0x99, 0xd5, 0x85, 0x99, 0x5e, 0x0a, 0x51, 0x8d, 0x0d, 0x77, 0x3c, 0x51,
+    0xe1, 0x98, 0x1c, 0x5a, 0xc1, 0xea, 0x38, 0x93, 0x44, 0xd7, 0xb6, 0xbb,
+    0xa1, 0x0f, 0x38, 0x75, 0x5e, 0xff, 0x2d, 0x93, 0xfa, 0x7d, 0xca, 0xf6,
+    0xb7, 0x4f, 0x5e, 0xbd, 0x3f, 0xbc, 0xb6, 0xc6, 0x7b, 0xae, 0x23, 0x97,
+    0xc7, 0xcb, 0xa7, 0x98, 0x37, 0xf4, 0xd6, 0x0c, 0x12, 0xd6, 0xad, 0xc7,
+    0x51, 0xb3, 0x0e, 0x88, 0x40, 0xfd, 0xf7, 0x1b, 0x29, 0xcf, 0xb8, 0x7c,
+    0x29, 0xa1, 0xa2, 0x72, 0x05, 0xa1, 0x0f, 0x43, 0xa8, 0xc4, 0x24, 0x49,
+    0x96, 0xbf, 0x56, 0xe4, 0xbf, 0xc7, 0x71, 0x5a, 0x18, 0x85, 0x65, 0xdd,
+    0x17, 0x95, 0x30, 0x18, 0x8b, 0x18, 0xd2, 0xb2, 0x3f, 0x2e, 0xe9, 0x69,
+    0x89, 0x90, 0xe0, 0x24, 0x08, 0x13, 0x23, 0x0a, 0x78, 0x59, 0x1e, 0xe6,
+    0x33, 0x0f, 0x12, 0x73, 0xba, 0xb3, 0x3c, 0x1d, 0x05, 0x71, 0x7a, 0xd7,
+    0x87, 0xd3, 0xaa, 0x7c, 0xb9, 0x3f, 0x74, 0x95, 0x62, 0xfc, 0x85, 0xac,
+    0xe0, 0xe9, 0xaa, 0x6f, 0x48, 0x4b, 0xdf, 0xb6, 0x9a, 0x7c, 0x24, 0x28,
+    0xe3, 0x6e, 0x40, 0xbd, 0x03, 0xab, 0xc5, 0xb5, 0x4e, 0xd3, 0xb4, 0xef,
+    0x23, 0x1e, 0x6e, 0xab, 0xc6, 0x70, 0x41, 0x6f, 0xf8, 0x5f, 0x41, 0xa5,
+    0x3a, 0x98, 0xab, 0x3f, 0xf2, 0xae, 0xb1, 0xd9, 0xf7, 0xff, 0xf0, 0x29,
+    0xdf, 0x01, 0xed, 0xe9, 0xa3, 0x49, 0xc6, 0x1a, 0xec, 0xa3, 0x4e, 0x59,
+    0x4b, 0xcd, 0x01, 0xcb, 0x6c, 0x4d, 0x28, 0xd6, 0x43, 0x53, 0x6c, 0x6e,
+    0xa6, 0x37, 0x29, 0x52, 0x11, 0xa5, 0xdf, 0x15, 0x1b, 0x6b, 0x46, 0x3a,
+    0x25, 0x93, 0x5c, 0x76, 0xdc, 0x12, 0xb8, 0x3e, 0xe0, 0xc4, 0xb8, 0xf8,
+    0x96, 0x8e, 0xde, 0x49, 0xff, 0x58, 0x3d, 0x47, 0x12, 0x68, 0x9a, 0xf6,
+    0xd7, 0x74, 0x21, 0xe7, 0x0e, 0xab, 0xdf, 0xe5, 0xb2, 0x7f, 0x4f, 0xb9,
+    0x5e, 0xd6, 0xf7, 0x7a, 0xc8, 0x7e, 0xd7, 0xc0, 0x81, 0x63, 0xff, 0x84,
+    0x30, 0x67, 0x40, 0x95, 0xcb, 0x03, 0x6b, 0xfb, 0x08, 0xd3, 0x09, 0xa8,
+    0x93, 0x11, 0xf7, 0xf3, 0x68, 0x89, 0x79, 0x0d, 0x74, 0xce, 0xe9, 0xc6,
+    0x83, 0xcd, 0xe0, 0x54, 0x51, 0xff, 0xe2, 0x3d, 0x76, 0x94, 0x72, 0xed,
+    0xb3, 0x66, 0x98, 0x97, 0xd9, 0x0b, 0x3b, 0x1d, 0x75, 0xc8, 0xfd, 0x9a,
+    0x15, 0x8a, 0x7c, 0xe9, 0xb6, 0x8e, 0x59, 0xf1, 0xbe, 0x8f, 0xe4, 0x3d,
+    0xdd, 0x72, 0x98, 0x71, 0xe5, 0xef, 0xdc, 0x86, 0x2f, 0x9d, 0x75, 0x8c,
+    0xe9, 0xbf, 0xd1, 0x89, 0xae, 0x44, 0xda, 0xa7, 0x69, 0xda, 0x77, 0x91,
+    0x8f, 0x37, 0x55, 0xe3, 0x38, 0x20, 0xb7, 0xfc, 0x2f, 0xa0, 0xd2, 0x9d,
+    0x4c, 0x55, 0x9f, 0xf9, 0x57, 0x58, 0xec, 0xfb, 0xff, 0xf8, 0x14, 0xef,
+    0x80, 0xf6, 0xf4, 0xd1, 0xa4, 0xe3, 0x0d, 0x76, 0x51, 0xa7, 0x2c, 0xa5,
+    0xe6, 0x80, 0xe5, 0xb6, 0x26, 0x94, 0x6b, 0x21, 0xa9, 0xb6, 0x37, 0x53,
+    0x1b, 0x94, 0xa9, 0x08, 0xd2, 0xef, 0x8a, 0x8d, 0xb5, 0xa3, 0x1d, 0x12,
+    0xc9, 0xae, 0x3b, 0x6e, 0x09, 0x5c, 0x1f, 0x70, 0x62, 0x5c, 0x7c, 0x4b,
+    0x47, 0x6f, 0x24, 0xff, 0xac, 0x1e, 0xa3, 0x89, 0x34, 0x4d, 0x7b, 0x6b,
+    0xba, 0x10, 0xf3, 0x87, 0x55, 0xef, 0xf2, 0xd9, 0x3f, 0xa7, 0xdc, 0xaf,
+    0x6b, 0x7b, 0xbd, 0x64, 0x3f, 0x6b, 0xe0, 0x40, 0xb1, 0xff, 0xc2, 0x18,
+    0x33, 0xa0, 0x4a, 0xe5, 0x81, 0xb5, 0xfd, 0x84, 0x69, 0x84, 0xd4, 0x49,
+    0x88, 0xfb, 0xf9, 0xb4, 0x44, 0xbc, 0x86, 0xba, 0x67, 0x74, 0xe3, 0x41,
+    0xe6, 0xf0, 0x2a, 0x28, 0xff, 0xf1, 0x1e, 0xbb, 0x4a, 0x39, 0x76, 0xd9,
+    0xb3, 0x4c, 0x4b, 0xec, 0x85, 0x9d, 0x8e, 0xba, 0xe4, 0x7e, 0xcd, 0x0a,
+    0xc5, 0x3e, 0x74, 0xdb, 0x47, 0x2c, 0xf8, 0xdf, 0x47, 0xf2, 0x1e, 0xee,
+    0xb9, 0x4c, 0x38, 0xf2, 0xf7, 0xee, 0x43, 0x17, 0xce, 0xba, 0xc6, 0x74,
+    0xdf, 0xe8, 0xc4, 0xd7, 0x22, 0x6d, 0x53, 0xb4, 0xed, 0x3b, 0xc8, 0xc7,
+    0x9b, 0xaa, 0xf1, 0x9c, 0x10, 0x5b, 0xfe, 0x17, 0xd0, 0x69, 0x4e, 0xa6,
+    0x2a, 0xcf, 0xfc, 0xab, 0xac, 0x76, 0x7d, 0xff, 0xfc, 0x0a, 0x77, 0xc0,
+    0x7b, 0x7a, 0x68, 0xd2, 0x71, 0x86, 0xbb, 0x28, 0xd3, 0x96, 0x52, 0xf3,
+    0x40, 0x72, 0xdb, 0x13, 0x4a, 0x35, 0x90, 0xd4, 0xdb, 0x1b, 0xa9, 0x8d,
+    0xca, 0x54, 0x84, 0x69, 0x77, 0xc5, 0x46, 0xda, 0xd1, 0x8e, 0x89, 0x64,
+    0xd7, 0x1d, 0xb7, 0x04, 0xae, 0x0f, 0xb8, 0x31, 0x2e, 0x3e, 0x25, 0xa3,
+    0xb7, 0x92, 0x7f, 0xd6, 0x0f, 0x51, 0xc4, 0x9a, 0x26, 0xbd, 0xb5, 0xdd,
+    0x08, 0x79, 0xc3, 0xaa, 0xf7, 0xf9, 0x6c, 0x9f, 0xd3, 0xee, 0x57, 0xb5,
+    0xbd, 0xde, 0xb2, 0x1f, 0xb5, 0xf0, 0x20, 0x58, 0xff, 0xe1, 0x0c, 0x19,
+    0xd0, 0x25, 0x72, 0xc0, 0xda, 0xfe, 0xc2, 0x34, 0xc2, 0x6a, 0x24, 0xc4,
+    0x7d, 0xfc, 0xda, 0x22, 0x5e, 0x43, 0x5d, 0x33, 0xba, 0x71, 0xa0, 0xf3,
+    0x78, 0x15, 0x14, 0x7f, 0xf8, 0x8f, 0x5d, 0xa5, 0x1c, 0xbb, 0x6c, 0xd9,
+    0xa6, 0x25, 0xf6, 0x42, 0xce, 0xc7, 0x5d, 0x72, 0x3f, 0x66, 0x85, 0x62,
+    0x9f, 0x3a, 0x6d, 0xa3, 0x96, 0x7c, 0x6f, 0xa3, 0xf9, 0x0f, 0x77, 0x5c,
+    0xa6, 0x1c, 0x79, 0x7b, 0xf7, 0x21, 0x8b, 0xe7, 0x5d, 0x63, 0x3a, 0x6f,
+    0xf4, 0x62, 0x6b, 0x91, 0x36, 0xa9, 0xda, 0x76, 0x9d, 0xe4, 0x63, 0xcd,
+    0xd5, 0x78, 0xce, 0x08, 0x2d, 0xff, 0x0b, 0xe8, 0x34, 0xa7, 0x53, 0x15,
+    0x67, 0xfe, 0x55, 0xd6, 0x3b, 0x3e, 0xff, 0xfe, 0x05, 0x3b, 0xe0, 0x3d,
+    0xbd, 0x34, 0x69, 0x38, 0xc3, 0x5d, 0x94, 0x69, 0xcb, 0x29, 0x79, 0xa0,
+    0x39, 0x6d, 0x89, 0xa5, 0x1a, 0xc8, 0x6a, 0x6d, 0x8d, 0xd4, 0xc6, 0xe5,
+    0x2a, 0x42, 0x34, 0xbb, 0xe2, 0xa3, 0x6d, 0x68, 0xc7, 0x44, 0xb2, 0x6b,
+    0x8e, 0xdb, 0x82, 0x57, 0x07, 0xdc, 0x18, 0x97, 0x1f, 0x12, 0xd1, 0xdb,
+    0xc9, 0x3f, 0xeb, 0x07, 0xa8, 0xe2, 0x4d, 0x13, 0x5e, 0xda, 0xee, 0x84,
+    0x3c, 0xe1, 0xd5, 0x7b, 0xfc, 0xb6, 0x4f, 0xe9, 0xf7, 0x2b, 0xda, 0xde,
+    0xef, 0x59, 0x0f, 0xda, 0xf8, 0x10, 0x2c, 0x7f, 0xf0, 0x86, 0x0c, 0xe8,
+    0x12, 0xb9, 0x60, 0x6d, 0x7f, 0x61, 0x1a, 0x61, 0x35, 0x12, 0x62, 0x3e,
+    0xfe, 0x6d, 0x11, 0x2f, 0x21, 0xae, 0x99, 0xdd, 0x38, 0xd0, 0x79, 0xbc,
+    0x0a, 0x8a, 0x3f, 0xfc, 0x47, 0xae, 0xd2, 0x8e, 0x5d, 0xb6, 0x6c, 0xd3,
+    0x12, 0xfb, 0x21, 0x67, 0x63, 0xae, 0xb9, 0x1f, 0xb3, 0x42, 0xb1, 0x4f,
+    0x9d, 0x36, 0xd1, 0xcb, 0x3e, 0x37, 0xd1, 0xfc, 0x87, 0xbb, 0xae, 0x53,
+    0x0e, 0x3c, 0xbd, 0xfb, 0x90, 0xc5, 0xf3, 0xae, 0xb1, 0x9d, 0x37, 0xfa,
+    0x31, 0x35, 0xc8, 0x9b, 0x54, 0xed, 0x3b, 0x4e, 0xf2, 0x31, 0xe6, 0xea,
+    0xbc, 0x67, 0x04, 0x16, 0xff, 0x85, 0xf4, 0x1a, 0x53, 0xa9, 0x8a, 0xb3,
+    0xff, 0x2a, 0xeb, 0x1d, 0x9f, 0x7f, 0xff, 0x02, 0x9d, 0xf0, 0x1e, 0xde,
+    0x9a, 0x34, 0x9c, 0x61, 0xae, 0xca, 0x34, 0xe5, 0x94, 0xbc, 0xd0, 0x1c,
+    0xb6, 0xc4, 0xd2, 0x8d, 0x64, 0x35, 0x36, 0xc6, 0xea, 0x63, 0x72, 0x95,
+    0x21, 0x1a, 0x5d, 0xf1, 0x51, 0xb6, 0xb4, 0x63, 0xa2, 0x59, 0x35, 0xc7,
+    0x6d, 0xc1, 0x2b, 0x83, 0xee, 0x0c, 0x4b, 0x8f, 0x89, 0x68, 0xed, 0xe4,
+    0x9f, 0xf5, 0x83, 0xd4, 0x71, 0x26, 0x89, 0xaf, 0x6d, 0x77, 0x42, 0x1e,
+    0x70, 0xea, 0xbd, 0xfe, 0x5b, 0x27, 0xf4, 0xfb, 0x95, 0xed, 0x6f, 0x77,
+    0xac, 0x87, 0xed, 0x7c, 0x08, 0x16, 0x3f, 0xf8, 0x43, 0x06, 0x74, 0x09,
+    0x5c, 0xb0, 0x36, 0xbf, 0xb0, 0x8d, 0x30, 0x9a, 0x89, 0x31, 0x1f, 0x7f,
+    0x36, 0x88, 0x97, 0x90, 0xd7, 0x4c, 0xee, 0x9c, 0x68, 0x3c, 0xde, 0x05,
+    0x45, 0x1f, 0xfe, 0x23, 0xd7, 0x69, 0x47, 0x2e, 0xdb, 0x36, 0x69, 0x89,
+    0x7d, 0x90, 0xb3, 0xb1, 0xd7, 0x5c, 0x8f, 0xd9, 0xa1, 0x58, 0xa7, 0xce,
+    0x9b, 0x68, 0xe5, 0x9f, 0x1b, 0xe8, 0xfe, 0x43, 0xdd, 0xd7, 0x29, 0x87,
+    0x1e, 0x5e, 0xfd, 0xc8, 0x62, 0xf9, 0xd7, 0x58, 0xce, 0x9b, 0xfd, 0x18,
+    0x9a, 0xe4, 0x4d, 0xaa, 0x76, 0x9d, 0xa7, 0x79, 0x18, 0xf3, 0x75, 0x5e,
+    0x33, 0x82, 0x0b, 0x7f, 0xc2, 0xfa, 0x0d, 0x29, 0xd4, 0xc5, 0x59, 0xff,
+    0x95, 0x75, 0x8e, 0xcf, 0xbf, 0xff, 0x81, 0x4e, 0xf8, 0x0f, 0x6f, 0x4d,
+    0x1a, 0x4e, 0x30, 0xd7, 0x65, 0x1a, 0x72, 0xca, 0x5e, 0x68, 0x0e, 0x5b,
+    0x62, 0x69, 0x46, 0xb2, 0x1a, 0x9b, 0x63, 0x75, 0x31, 0xb9, 0x4a, 0x90,
+    0x8d, 0x2e, 0xf8, 0xa8, 0xdb, 0x5a, 0x31, 0xd1, 0x2c, 0x9a, 0xe3, 0xb6,
+    0xe0, 0x95, 0xc1, 0xf7, 0x06, 0x25, 0xc7, 0xc4, 0xb4, 0x76, 0xf2, 0x4f,
+    0xfa, 0xc1, 0xea, 0x38, 0x93, 0x44, 0xd7, 0xb6, 0xbb, 0xa1, 0x0f, 0x38,
+    0x75, 0x5e, 0xff, 0x2d, 0x93, 0xfa, 0x7d, 0xca, 0xf6, 0xb7, 0xbb, 0xd6,
+    0x43, 0xf6, 0xbe, 0x04, 0x0b, 0x1f, 0xfc, 0x21, 0x83, 0x3a, 0x04, 0xae,
+    0x58, 0x1b, 0x5f, 0xd8, 0x46, 0x98, 0x4d, 0x44, 0x98, 0x8f, 0xbf, 0x9b,
+    0x44, 0x4b, 0xc8, 0x6b, 0xa6, 0x77, 0x4e, 0x34, 0x1e, 0x6f, 0x02, 0xa2,
+    0x8f, 0xff, 0x11, 0xeb, 0xb4, 0xa3, 0x97, 0x6d, 0x9b, 0x34, 0xc4, 0xbe,
+    0xc8, 0x59, 0xd8, 0xeb, 0xae, 0x47, 0xec, 0xd0, 0xac, 0x53, 0xe7, 0x4d,
+    0xb4, 0x72, 0xcf, 0x8d, 0xf4, 0x7f, 0x21, 0xee, 0xeb, 0x94, 0xc3, 0x8f,
+    0x2f, 0x7e, 0xe4, 0x31, 0x7c, 0xeb, 0xac, 0x67, 0x4d, 0xfe, 0x8c, 0x4d,
+    0x72, 0x26, 0xd5, 0x3b, 0x4e, 0xd3, 0xbc, 0x8c, 0x79, 0xba, 0xaf, 0x19,
+    0xc1, 0x05, 0xbf, 0xe1, 0x7d, 0x06, 0x94, 0xea, 0x62, 0xac, 0xff, 0xca,
+    0xba, 0xc7, 0x67, 0xdf, 0xff, 0xc0, 0xa7, 0x7c, 0x07, 0xb7, 0xa6, 0x8d,
+    0x27, 0x18, 0x6b, 0xb2, 0x8d, 0x39, 0x65, 0x2f, 0x34, 0x07, 0x2d, 0xb1,
+    0x34, 0xa3, 0x59, 0x0d, 0x4d, 0xb1, 0xba, 0x98, 0xdc, 0xa5, 0x48, 0x46,
+    0x97, 0x7c, 0x54, 0x6d, 0xad, 0x18, 0xe8, 0x96, 0x4d, 0x71, 0xdb, 0x70,
+    0x4a, 0xe0, 0xfb, 0x83, 0x12, 0xe3, 0xe2, 0x5a, 0x3b, 0x79, 0x27, 0xfd,
+    0x60, 0xf5, 0x1c, 0x49, 0xa2, 0x6b, 0xdb, 0x5d, 0xd0, 0x87, 0x9c, 0x3a,
+    0xaf, 0x7f, 0x96, 0xc9, 0xfd, 0x3e, 0xe5, 0x7b, 0x5b, 0xdd, 0xeb, 0x21,
+    0xfb, 0x5f, 0x02, 0x05, 0x8f, 0xfe, 0x10, 0xc1, 0x9d, 0x02, 0x57, 0x2c,
+    0x0d, 0xaf, 0xec, 0x23, 0x4c, 0x26, 0xa2, 0x4c, 0x47, 0xdf, 0xcd, 0xa2,
+    0x25, 0xe4, 0x35, 0xd3, 0x3b, 0xa7, 0x1a, 0x0f, 0x37, 0x81, 0x51, 0x47,
+    0xff, 0x88, 0xf5, 0xda, 0x51, 0xcb, 0xb6, 0xcd, 0x9a, 0x62, 0x5f, 0x64,
+    0x2c, 0xec, 0x75, 0xd7, 0x23, 0xf6, 0x68, 0x56, 0x29, 0xf3, 0xa6, 0xda,
+    0x39, 0x67, 0xc6, 0xfa, 0x3f, 0x90, 0xf7, 0x75, 0xca, 0x61, 0xc7, 0x97,
+    0xbf, 0x72, 0x18, 0xbe, 0x75, 0xd6, 0x33, 0xa6, 0xff, 0x46, 0x26, 0xb9,
+    0x13, 0x6a, 0x9d, 0xa7, 0x69, 0xde, 0x46, 0x3c, 0xdd, 0x57, 0x8c, 0xe0,
+    0x82, 0xdf, 0xf0, 0xbe, 0x83, 0x4a, 0x75, 0x31, 0x56, 0x7f, 0xe5, 0x5d,
+    0x63, 0xb3, 0xef, 0xff, 0xe0, 0x53, 0xbe, 0x03, 0xdb, 0xd3, 0x46, 0x93,
+    0x8c, 0x35, 0xd9, 0x46, 0x9c, 0xb2, 0x97, 0x9a, 0x03, 0x96, 0xd8, 0x9a,
+    0x51, 0xac, 0x86, 0xa6, 0xd8, 0xdd, 0x4c, 0x6e, 0x52, 0xa4, 0x23, 0x4b,
+    0xbe, 0x2a, 0x36, 0xd6, 0x8c, 0x74, 0x4b, 0x26, 0xb8, 0xed, 0xb8, 0x25,
+    0x70, 0x7d, 0xc1, 0x89, 0x71, 0xf1, 0x2d, 0x1d, 0xbc, 0x93, 0xfe, 0xb0,
+    0x7a, 0x8e, 0x24, 0xd1, 0x35, 0xed, 0xae, 0xe8, 0x43, 0xce, 0x1d, 0x57,
+    0xbf, 0xcb, 0x64, 0xfe, 0x9f, 0x72, 0xbd, 0xad, 0xee, 0xf5, 0x90, 0xfd,
+    0xaf, 0x81, 0x02, 0xc7, 0xff, 0x08, 0x60, 0xce, 0x81, 0x2b, 0x96, 0x06,
+    0xd7, 0xf6, 0x11, 0xa6, 0x13, 0x51, 0x26, 0x23, 0xef, 0xe6, 0xd1, 0x12,
+    0xf2, 0x1a, 0xe9, 0x9d, 0xd3, 0x8d, 0x07, 0x9b, 0xc0, 0xa8, 0xa3, 0xff,
+    0xc4, 0x7a, 0xed, 0x28, 0xe5, 0xdb, 0x66, 0xcd, 0x31, 0x2f, 0xb2, 0x16,
+    0x76, 0x3a, 0xeb, 0x91, 0xfb, 0x34, 0x2b, 0x14, 0xf9, 0xd3, 0x6d, 0x1c,
+    0xb3, 0xe3, 0x7d, 0x1f, 0xc8, 0x7b, 0xba, 0xe5, 0x30, 0xe3, 0xcb, 0xdf,
+    0xb9, 0x0c, 0x5f, 0x3a, 0xeb, 0x19, 0xd3, 0x7f, 0xa3, 0x13, 0x5c, 0x89,
+    0xb5, 0x4e, 0xd3, 0xb4, 0xef, 0x23, 0x1e, 0x6e, 0xab, 0xc6, 0x70, 0x41,
+    0x6f, 0xf8, 0x5f, 0x41, 0xa5, 0x3a, 0x98, 0xab, 0x3f, 0xf2, 0xae, 0xb1,
+    0xd9, 0xf7, 0xff, 0xf0, 0x29, 0xdf, 0x01, 0xed, 0xe9, 0xa3, 0x49, 0xc6,
+    0x1a, 0xec, 0xa3, 0x4e, 0x59, 0x4b, 0xcd, 0x01, 0xcb, 0x6c, 0x4d, 0x28,
+    0xd6, 0x43, 0x53, 0x6c, 0x6e, 0xa6, 0x37, 0x29, 0x52, 0x11, 0xa5, 0xdf,
+    0x15, 0x1b, 0x6b, 0x46, 0x3a, 0x25, 0x93, 0x5c, 0x76, 0xdc, 0x12, 0xb8,
+    0x3e, 0xe0, 0xc4, 0xb8, 0xf8, 0x96, 0x8e, 0xde, 0x49, 0xff, 0x58, 0x3d,
+    0x47, 0x12, 0x68, 0x9a, 0xf6, 0xd7, 0x74, 0x21, 0xe7, 0x0e, 0xab, 0xdf,
+    0xe5, 0xb2, 0x7f, 0x4f, 0xb9, 0x5e, 0xd6, 0xf7, 0x7a, 0xc8, 0x7e, 0xd7,
+    0xc0, 0x81, 0x63, 0xff, 0x84, 0x30, 0x67, 0x40, 0x95, 0xcb, 0x03, 0x6b,
+    0xfb, 0x08, 0xd3, 0x09, 0xa8, 0x93, 0x11, 0xf7, 0xf3, 0x68, 0x89, 0x79,
+    0x0d, 0x74, 0xce, 0xe9, 0xc6, 0x83, 0xcd, 0xe0, 0x54, 0x51, 0xff, 0xe2,
+    0x3d, 0x76, 0x94, 0x72, 0xed, 0xb3, 0x66, 0x98, 0x97, 0xd9, 0x0b, 0x3b,
+    0x1d, 0x75, 0xc8, 0xfd, 0x9a, 0x15, 0x8a, 0x7c, 0xe9, 0xb6, 0x8e, 0x59,
+    0xf1, 0xbe, 0x8f, 0xe4, 0x3d, 0xdd, 0x72, 0x98, 0x71, 0xe5, 0xef, 0xdc,
+    0x86, 0x2f, 0x9d, 0x75, 0x8c, 0xe9, 0xbf, 0xd1, 0x89, 0xae, 0x44, 0xda,
+    0xa7, 0x69, 0xda, 0x77, 0x91, 0x8f, 0x37, 0x55, 0xe3, 0x38, 0x20, 0xb7,
+    0xfc, 0x2f, 0xa0, 0xd2, 0x9d, 0x4c, 0x55, 0x9f, 0xf9, 0x57, 0x58, 0xec,
+    0xfb, 0xff, 0xf8, 0x14, 0xef, 0x80, 0xf6, 0xf4, 0xd1, 0xa4, 0xe3, 0x0d,
+    0x76, 0x51, 0xa7, 0x2c, 0xa5, 0xe6, 0x80, 0xe5, 0xb6, 0x26, 0x94, 0x6b,
+    0x21, 0xa9, 0xb6, 0x37, 0x53, 0x1b, 0x94, 0xa9, 0x08, 0xd2, 0xef, 0x8a,
+    0x8d, 0xb5, 0xa3, 0x1d, 0x12, 0xc9, 0xae, 0x3b, 0x6e, 0x09, 0x5c, 0x1f,
+    0x70, 0x62, 0x5c, 0x7c, 0x4b, 0x47, 0x6f, 0x24, 0xff, 0xac, 0x1e, 0xa3,
+    0x89, 0x34, 0x4d, 0x7b, 0x6b, 0xba, 0x10, 0xf3, 0x87, 0x55, 0xef, 0xf2,
+    0xd9, 0x3f, 0xa7, 0xdc, 0xaf, 0x6b, 0x7b, 0xbd, 0x64, 0x3f, 0x6b, 0xe0,
+    0x40, 0xb1, 0xff, 0xc2, 0x18, 0x33, 0xa0, 0x4a, 0xe5, 0x81, 0xb5, 0xfd,
+    0x84, 0x69, 0x84, 0xd4, 0x49, 0x88, 0xfb, 0xf9, 0xb4, 0x44, 0xbc, 0x86,
+    0xba, 0x67, 0x74, 0xe3, 0x41, 0xe6, 0xf0, 0x2a, 0x28, 0xff, 0xf1, 0x1e,
+    0xbb, 0x4a, 0x39, 0x76, 0xd9, 0xb3, 0x4c, 0x4b, 0xec, 0x85, 0x9d, 0x8e,
+    0xba, 0xe4, 0x7e, 0xcd, 0x0a, 0xc5, 0x3e, 0x74, 0xdb, 0x47, 0x2c, 0xf8,
+    0xdf, 0x47, 0xf2, 0x1e, 0xee, 0xb9, 0x4c, 0x38, 0xf2, 0xf7, 0xee, 0x43,
+    0x17, 0xce, 0xba, 0xc6, 0x74, 0xdf, 0xe8, 0xc4, 0xd7, 0x22, 0x6d, 0x53,
+    0xb4, 0xed, 0x3b, 0xc8, 0xc7, 0x9b, 0xaa, 0xf1, 0x9c, 0x10, 0x5b, 0xfe,
+    0x17, 0xd0, 0x69, 0x4e, 0xa6, 0x2a, 0xcf, 0xfc, 0xab, 0xac, 0x76, 0x7d,
+    0xff, 0xfc, 0x0a, 0x77, 0xc0, 0x7b, 0x7a, 0x68, 0xd2, 0x71, 0x86, 0xbb,
+    0x28, 0xd3, 0x96, 0x52, 0xf3, 0x40, 0x72, 0xdb, 0x13, 0x4a, 0x35, 0x90,
+    0xd4, 0xdb, 0x1b, 0xa9, 0x8d, 0xca, 0x54, 0x84, 0x69, 0x77, 0xc5, 0x46,
+    0xda, 0xd1, 0x8e, 0x89, 0x64, 0xd7, 0x1d, 0xb7, 0x04, 0xae, 0x0f, 0xb8,
+    0x31, 0x2e, 0x3e, 0x25, 0xa3, 0xb7, 0x92, 0x7f, 0xd6, 0x0f, 0x51, 0xc4,
+    0x9a, 0x26, 0xbd, 0xb5, 0xdd, 0x08, 0x79, 0xc3, 0xaa, 0xf7, 0xf9, 0x6c,
+    0x9f, 0xd3, 0xee, 0x57, 0xb5, 0xbd, 0xde, 0xb2, 0x1f, 0xb5, 0xf0, 0x20,
+    0x58, 0xff, 0xe1, 0x0c, 0x19, 0xd0, 0x25, 0x72, 0xc0, 0xda, 0xfe, 0xc2,
+    0x34, 0xc2, 0x6a, 0x24, 0xc4, 0x7d, 0xfc, 0xda, 0x22, 0x5e, 0x43, 0x5d,
+    0x33, 0xba, 0x71, 0xa0, 0xf3, 0x78, 0x15, 0x14, 0x7f, 0xf8, 0x8f, 0x5d,
+    0xa5, 0x1c, 0xbb, 0x6c, 0xd9, 0xa6, 0x25, 0xf6, 0x42, 0xce, 0xc7, 0x5d,
+    0x72, 0x3f, 0x66, 0x85, 0x62, 0x9f, 0x3a, 0x6d, 0xa3, 0x96, 0x7c, 0x6f,
+    0xa3, 0xf9, 0x0f, 0x77, 0x5c, 0xa6, 0x1c, 0x79, 0x7b, 0xf7, 0x21, 0x8b,
+    0xe7, 0x5d, 0x63, 0x3a, 0x6f, 0xf4, 0x62, 0x6b, 0x91, 0x36, 0xa9, 0xda,
+    0x76, 0x9d, 0xe4, 0x63, 0xcd, 0xd5, 0x78, 0xce, 0x08, 0x2d, 0xff, 0x0b,
+    0xe8, 0x34, 0xa7, 0x53, 0x15, 0x67, 0xfe, 0x55, 0xd6, 0x3b, 0x3e, 0xff,
+    0xfe, 0x05, 0x3b, 0xe0, 0x3d, 0xbd, 0x34, 0x69, 0x38, 0xc3, 0x5d, 0x94,
+    0x69, 0xcb, 0x29, 0x79, 0xa0, 0x39, 0x6d, 0x89, 0xa5, 0x1a, 0xc8, 0x6a,
+    0x6d, 0x8d, 0xd4, 0xc6, 0xe5, 0x2a, 0x42, 0x34, 0xbb, 0xe2, 0xa3, 0x6d,
+    0x68, 0xc7, 0x44, 0xb2, 0x6b, 0x8e, 0xdb, 0x82, 0x57, 0x07, 0xdc, 0x18,
+    0x97, 0x1f, 0x12, 0xd1, 0xdb, 0xc9, 0x3f, 0xeb, 0x07, 0xa8, 0xe2, 0x4d,
+    0x13, 0x5e, 0xda, 0xee, 0x84, 0x3c, 0xe1, 0xd5, 0x7b, 0xfc, 0xb6, 0x4f,
+    0xe9, 0xf7, 0x2b, 0xda, 0xde, 0xef, 0x59, 0x0f, 0xda, 0xf8, 0x10, 0x2c,
+    0x7f, 0xf0, 0x86, 0x0c, 0xe8, 0x12, 0xb9, 0x60, 0x6d, 0x7f, 0x61, 0x1a,
+    0x61, 0x35, 0x12, 0x62, 0x3e, 0xfe, 0x6d, 0x11, 0x2f, 0x21, 0xae, 0x99,
+    0xdd, 0x38, 0xd0, 0x79, 0xbc, 0x0a, 0x8a, 0x3f, 0xfc, 0x47, 0xae, 0xd2,
+    0x8e, 0x5d, 0xb6, 0x6c, 0xd3, 0x12, 0xfb, 0x21, 0x67, 0x63, 0xae, 0xb9,
+    0x1f, 0xb3, 0x42, 0xb1, 0x4f, 0x9d, 0x36, 0xd1, 0xcb, 0x3e, 0x37, 0xd1,
+    0xfc, 0x87, 0xbb, 0xae, 0x53, 0x0e, 0x3c, 0xbd, 0xfb, 0x90, 0xc5, 0xf3,
+    0xae, 0xb1, 0x9d, 0x37, 0xfa, 0x31, 0x35, 0xc8, 0x9b, 0x54, 0xed, 0x3b,
+    0x4e, 0xf2, 0x31, 0xe6, 0xea, 0xbc, 0x67, 0x04, 0x16, 0xff, 0x85, 0xf4,
+    0x1a, 0x53, 0xa9, 0x8a, 0xb3, 0xff, 0x2a, 0xeb, 0x1d, 0x9f, 0x7f, 0xff,
+    0x08,
+};
+static_assert(sizeof(kBytesTestReadSymbol11) == kNumBytesTestReadSymbol11, "");
+
+// The kBytesTestReadSymbol12[] array was encoded by using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][13] = {
+//   // pmf: 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12,
+//   // 1/12
+//   { 32768 - 2731, 32768 - 5461, 32768 - 8192, 32768 - 10923, 32768 - 13653,
+//     32768 - 16384, 32768 - 19115, 32768 - 21845, 32768 - 24576,
+//     32768 - 27307, 32768 - 30037, 0, 0 },
+//   // pmf: 3/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24,
+//   // 1/24
+//   { 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288, 32768 - 15019,
+//     32768 - 17749, 32768 - 20480, 32768 - 23211, 32768 - 25941,
+//     32768 - 28672, 32768 - 31403, 0, 0 },
+//   // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24,
+//   // 3/24
+//   { 32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288,
+//     32768 - 15019, 32768 - 17749, 32768 - 20480, 32768 - 23211,
+//     32768 - 25941, 32768 - 28672, 0, 0 },
+//   // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 3/24, 3/24, 2/24, 2/24, 2/24, 2/24,
+//   // 1/24
+//   { 32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288,
+//     32768 - 16384, 32768 - 20480, 32768 - 23211, 32768 - 25941,
+//     32768 - 28672, 32768 - 31403, 0, 0 },
+// };
+// constexpr int kSymbols[24][4] = { { 0, 6, 11, 5 },   //
+//                                   { 1, 7, 10, 4 },   //
+//                                   { 2, 8, 9, 3 },    //
+//                                   { 3, 9, 8, 2 },    //
+//                                   { 4, 10, 7, 1 },   //
+//                                   { 5, 11, 6, 0 },   //
+//                                   { 6, 0, 5, 11 },   //
+//                                   { 7, 1, 4, 10 },   //
+//                                   { 8, 2, 3, 9 },    //
+//                                   { 9, 3, 2, 8 },    //
+//                                   { 10, 4, 1, 7 },   //
+//                                   { 11, 5, 0, 6 },   //
+//                                   { 0, 0, 11, 9 },   //
+//                                   { 2, 1, 10, 7 },   //
+//                                   { 4, 3, 8, 5 },    //
+//                                   { 6, 5, 6, 3 },    //
+//                                   { 8, 7, 4, 1 },    //
+//                                   { 10, 9, 2, 10 },  //
+//                                   { 1, 0, 11, 8 },   //
+//                                   { 3, 2, 9, 6 },    //
+//                                   { 5, 4, 7, 4 },    //
+//                                   { 7, 6, 5, 2 },    //
+//                                   { 9, 8, 3, 6 },    //
+//                                   { 11, 10, 1, 5 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 80; ++i) {
+//   for (int j = 0; j < 24; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 12);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n    ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
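+// A decode-side check is expected to mirror the encode loop above: construct
+// an entropy decoder over kBytesTestReadSymbol12[] and read the symbols back
+// in the same (i, j, k) order. A minimal sketch, assuming the EntropyDecoder
+// interface declared in src/utils/entropy_decoder.h in this tree (a
+// (data, size, allow_update_cdf) constructor and ReadSymbol(cdf,
+// symbol_count)), and a uint16_t cdf[4][13] initialized to the same values as
+// above; this is not the exact test body:
+//
+// libgav1::EntropyDecoder reader(kBytesTestReadSymbol12,
+//                                kNumBytesTestReadSymbol12,
+//                                /*allow_update_cdf=*/true);
+// for (int i = 0; i < 80; ++i) {
+//   for (int j = 0; j < 24; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       // Both sides adapt the CDFs identically, so each read must return
+//       // exactly the symbol that was written.
+//       EXPECT_EQ(reader.ReadSymbol(cdf[k], 12), kSymbols[j][k]);
+//     }
+//   }
+// }
+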
+constexpr size_t kNumBytesTestReadSymbol12 = 3473;
+constexpr uint8_t kBytesTestReadSymbol12[] = {
+    0x0d, 0x17, 0xf5, 0xbd, 0x05, 0xd0, 0x9c, 0x5d, 0x10, 0xc5, 0x9e, 0xc4,
+    0x9f, 0xc6, 0xf4, 0x7d, 0xce, 0x67, 0x97, 0x49, 0xd1, 0x05, 0x54, 0xab,
+    0xda, 0x22, 0x5b, 0xbc, 0x9c, 0x11, 0xc8, 0x0b, 0xe9, 0x6d, 0xb1, 0x8a,
+    0x17, 0x06, 0x92, 0xed, 0xd4, 0x61, 0x48, 0x01, 0x64, 0x43, 0x65, 0x65,
+    0xfc, 0x35, 0x9d, 0xbb, 0x68, 0x3f, 0x77, 0xbc, 0x8d, 0xd9, 0x3b, 0x48,
+    0x77, 0x58, 0x2f, 0x19, 0xfa, 0x73, 0xa6, 0xc3, 0x65, 0x96, 0x6c, 0x9d,
+    0x99, 0xb8, 0x65, 0x2b, 0x94, 0x11, 0x21, 0xf4, 0x95, 0xa4, 0xcd, 0xf2,
+    0xbf, 0x65, 0x79, 0x34, 0x4b, 0xf6, 0x5c, 0xeb, 0xca, 0x07, 0x65, 0x4f,
+    0xae, 0x67, 0xd8, 0xdf, 0xec, 0xc9, 0xd2, 0x26, 0x2e, 0xac, 0xea, 0xa2,
+    0xbd, 0x0d, 0x79, 0x27, 0x91, 0xf5, 0x84, 0x89, 0xf9, 0x2a, 0xb3, 0x5e,
+    0x48, 0x4b, 0x2b, 0x89, 0xc0, 0xa5, 0x9f, 0x94, 0x07, 0x82, 0x36, 0x11,
+    0x65, 0x4d, 0xb0, 0xde, 0xac, 0xde, 0xac, 0xc0, 0x35, 0x7f, 0xf3, 0x9b,
+    0x01, 0x0c, 0x35, 0x8b, 0xb5, 0x22, 0xb8, 0xea, 0x1c, 0xab, 0xbe, 0x08,
+    0xd9, 0x23, 0x0a, 0x37, 0x95, 0x36, 0x3d, 0x28, 0xb3, 0x19, 0x34, 0x3a,
+    0x47, 0xf8, 0x45, 0x33, 0x7a, 0x65, 0xae, 0x80, 0x48, 0x01, 0x20, 0xe8,
+    0xcd, 0xb7, 0xce, 0xf7, 0xee, 0xd1, 0x50, 0x39, 0xec, 0xa6, 0x8b, 0xa0,
+    0xb5, 0x56, 0x76, 0x1a, 0xb4, 0x6b, 0x31, 0xcf, 0x32, 0x0f, 0xb1, 0xba,
+    0xb3, 0xa4, 0xb7, 0x34, 0xfe, 0x86, 0x87, 0xa7, 0x44, 0x70, 0x3b, 0x9e,
+    0x94, 0xc5, 0x43, 0x82, 0xf1, 0x1a, 0xa1, 0x10, 0x05, 0x7c, 0x04, 0x63,
+    0x5a, 0xfe, 0xc2, 0xb6, 0x15, 0x07, 0x3f, 0xb0, 0x3c, 0x43, 0x74, 0x33,
+    0xec, 0xb8, 0xe0, 0xf5, 0x79, 0x48, 0x7c, 0x50, 0x4f, 0x4b, 0xb9, 0x08,
+    0x33, 0xfd, 0x54, 0xd5, 0x6f, 0xdf, 0xca, 0xfe, 0x38, 0xa1, 0xeb, 0xa9,
+    0xaf, 0xa5, 0x8f, 0xcf, 0xb3, 0xda, 0x77, 0x3f, 0x63, 0xcb, 0x98, 0x2b,
+    0x71, 0x56, 0x60, 0xb4, 0x5c, 0x7d, 0x81, 0x85, 0xf3, 0x64, 0x9f, 0xf3,
+    0xc2, 0xec, 0x2a, 0x27, 0x9b, 0x5e, 0x39, 0x30, 0x10, 0x0d, 0x43, 0xdb,
+    0x9f, 0x7b, 0x8f, 0xb8, 0x09, 0xe2, 0x55, 0xb3, 0xc4, 0xb1, 0xeb, 0x23,
+    0xcd, 0x32, 0xde, 0x58, 0xc2, 0x35, 0xda, 0x5c, 0x9a, 0xf8, 0x2d, 0xc6,
+    0x19, 0x46, 0x64, 0x66, 0x5a, 0xdb, 0x53, 0xc8, 0x14, 0x41, 0xcc, 0x0c,
+    0x3f, 0xff, 0x3e, 0xbe, 0x29, 0xba, 0x5f, 0x68, 0xa9, 0x31, 0x39, 0x79,
+    0x2a, 0xfe, 0x14, 0x92, 0x8f, 0x2b, 0x31, 0xf1, 0x0a, 0x25, 0xd8, 0x22,
+    0xe1, 0xc7, 0xcd, 0xda, 0xea, 0x88, 0xfa, 0x6a, 0xb0, 0x69, 0x77, 0xf6,
+    0xd6, 0x46, 0xb9, 0xe6, 0x53, 0x09, 0x48, 0x65, 0xbd, 0xe6, 0xf8, 0xc0,
+    0x04, 0x71, 0x26, 0x21, 0xe8, 0xf9, 0xc1, 0x71, 0x73, 0x6b, 0x3d, 0x73,
+    0x16, 0x66, 0x38, 0xae, 0x59, 0xb9, 0xe3, 0x34, 0x8f, 0x17, 0x3c, 0x16,
+    0xaa, 0x3f, 0x61, 0x49, 0xb3, 0x06, 0xcc, 0xb3, 0xcb, 0x7e, 0x42, 0xf1,
+    0x2a, 0x0e, 0xb2, 0xcb, 0x1d, 0xf0, 0x0f, 0xc9, 0x20, 0xb1, 0x80, 0xce,
+    0x08, 0xb9, 0xfa, 0xca, 0x3c, 0xd5, 0x67, 0x47, 0x36, 0x17, 0xc1, 0xf7,
+    0x9d, 0x97, 0x79, 0x75, 0xee, 0xb0, 0xed, 0xfc, 0xd0, 0xdf, 0xc8, 0xa2,
+    0xc1, 0xae, 0x51, 0x53, 0x88, 0x05, 0x95, 0x73, 0x7e, 0xd9, 0x3b, 0x9d,
+    0xb0, 0x08, 0x37, 0xff, 0x51, 0x6f, 0xf9, 0xad, 0x60, 0xa5, 0x3a, 0xd6,
+    0xba, 0xea, 0xf6, 0xea, 0x91, 0x2e, 0x5a, 0xa9, 0xbf, 0xe2, 0x52, 0x46,
+    0x0c, 0xbd, 0x28, 0x2d, 0xa8, 0x5f, 0xc8, 0x41, 0x31, 0x53, 0x7a, 0x9f,
+    0xfa, 0x73, 0x06, 0xc5, 0xae, 0x59, 0x8d, 0xe3, 0x0d, 0xfa, 0x99, 0x7f,
+    0xee, 0xe4, 0x82, 0xd4, 0x36, 0x68, 0x09, 0x92, 0x09, 0xef, 0x70, 0x89,
+    0xc6, 0xfa, 0xc7, 0x7e, 0x0f, 0x24, 0x8e, 0xad, 0x4e, 0xd9, 0x4c, 0x11,
+    0xe7, 0x7d, 0x98, 0xf0, 0x80, 0x42, 0x0b, 0x86, 0x8d, 0x8e, 0x85, 0x97,
+    0xd2, 0x11, 0x0f, 0x04, 0x59, 0xaf, 0xa5, 0xec, 0xda, 0x75, 0x64, 0x51,
+    0x22, 0x7e, 0x38, 0x4b, 0xca, 0x9e, 0x82, 0x71, 0x72, 0x8d, 0x4c, 0xca,
+    0xe1, 0x77, 0xe5, 0xe0, 0x9d, 0x64, 0x01, 0x48, 0x49, 0xcd, 0x3b, 0x90,
+    0xd8, 0x9e, 0x15, 0x22, 0x76, 0xe0, 0x57, 0x06, 0x06, 0xaf, 0x2c, 0x09,
+    0xce, 0x4c, 0xfa, 0x8b, 0xbf, 0xa1, 0x1b, 0xe3, 0xe7, 0xa5, 0xa0, 0xc0,
+    0xc8, 0x4c, 0x79, 0x1b, 0xeb, 0x5d, 0xb8, 0x3b, 0x1c, 0x3f, 0xbc, 0x11,
+    0x8f, 0xa0, 0x08, 0x2b, 0xd3, 0xe3, 0xca, 0xbc, 0x41, 0xc2, 0xa4, 0x4e,
+    0xdc, 0x0a, 0xe1, 0x06, 0xef, 0x55, 0x13, 0xb3, 0xdd, 0xfd, 0xe2, 0x89,
+    0x5f, 0xb5, 0xf6, 0xa9, 0xd7, 0xae, 0xc1, 0x14, 0xb6, 0x19, 0xd8, 0x5b,
+    0x0f, 0x9a, 0xb0, 0xed, 0xc5, 0xc7, 0xa8, 0xa6, 0x08, 0x5a, 0x00, 0xad,
+    0xf5, 0x9c, 0xb9, 0xd9, 0x45, 0x46, 0xf0, 0x9e, 0x2d, 0x55, 0xc6, 0x08,
+    0x60, 0x0d, 0x9e, 0xa7, 0x68, 0xb6, 0xf7, 0xf3, 0xa9, 0x84, 0x7e, 0x63,
+    0xe8, 0x48, 0x03, 0x1c, 0x15, 0x97, 0x94, 0xda, 0x04, 0xb2, 0xd0, 0x09,
+    0xa5, 0x62, 0x21, 0x70, 0x88, 0x9f, 0xf5, 0x0c, 0x91, 0x0d, 0xbf, 0x69,
+    0xe1, 0x6b, 0x4f, 0xc2, 0xf2, 0x32, 0xe1, 0x4b, 0xad, 0x58, 0xea, 0x0c,
+    0x07, 0x13, 0x4a, 0x1b, 0x87, 0x6d, 0x6e, 0x2f, 0xb6, 0xc6, 0x30, 0x1e,
+    0x2d, 0x1d, 0x5c, 0xdf, 0xd2, 0x5a, 0x88, 0xc8, 0x1c, 0xd9, 0xc3, 0x91,
+    0x04, 0x45, 0x63, 0x11, 0x44, 0x35, 0x7f, 0x46, 0xf4, 0xd0, 0xd1, 0x73,
+    0x9c, 0xae, 0x85, 0x5e, 0xda, 0xc7, 0xce, 0xb5, 0xbb, 0x3a, 0xb4, 0x67,
+    0xa5, 0xad, 0xc6, 0x5e, 0x12, 0xc7, 0xc5, 0x72, 0xfc, 0x35, 0x2e, 0xae,
+    0x46, 0x81, 0x22, 0x56, 0x6d, 0xc9, 0x36, 0x43, 0x17, 0x6b, 0x4d, 0x81,
+    0xd6, 0x59, 0x35, 0x90, 0x3a, 0xd2, 0xde, 0x79, 0xbd, 0x21, 0xc4, 0x56,
+    0xcb, 0x59, 0x3b, 0xe7, 0xb3, 0xab, 0x92, 0xce, 0x65, 0xc7, 0x20, 0xde,
+    0xde, 0xb1, 0x94, 0xac, 0x1a, 0x23, 0xa4, 0x14, 0x56, 0x32, 0xc0, 0x9f,
+    0x48, 0x31, 0xa6, 0x95, 0xc4, 0xb8, 0xf3, 0x9c, 0x8d, 0x34, 0x03, 0xc3,
+    0x62, 0x63, 0x38, 0x15, 0x71, 0x08, 0x5e, 0x1b, 0xc0, 0xf2, 0x54, 0x13,
+    0x66, 0x01, 0xf1, 0x38, 0xd9, 0x61, 0xf3, 0xdb, 0xd4, 0x83, 0x98, 0x3e,
+    0xaa, 0xe1, 0xca, 0x2d, 0xfb, 0x6d, 0x02, 0xac, 0xf2, 0xa6, 0x04, 0x09,
+    0xeb, 0xcb, 0xaf, 0xd5, 0x9d, 0x3d, 0xd7, 0xc2, 0xc1, 0x6f, 0xec, 0x53,
+    0x65, 0x0e, 0x40, 0x77, 0x03, 0xcd, 0x79, 0x0a, 0x94, 0x27, 0x6b, 0x6f,
+    0x32, 0xb3, 0xdb, 0x3e, 0x38, 0xe2, 0xd2, 0xca, 0x9b, 0x9e, 0x24, 0xc7,
+    0x35, 0xfd, 0xc1, 0x86, 0x78, 0xd9, 0xc3, 0xfe, 0x03, 0xb3, 0x3f, 0xc1,
+    0xf8, 0x09, 0x89, 0xdc, 0x3b, 0x08, 0xae, 0x85, 0xfa, 0x8e, 0x51, 0xbb,
+    0x6f, 0xf4, 0x73, 0x43, 0xd2, 0xed, 0x6d, 0xfd, 0x2b, 0x23, 0xc3, 0x4f,
+    0xc4, 0x1d, 0x25, 0xb9, 0x36, 0xc4, 0x98, 0xe6, 0xbf, 0xb8, 0x30, 0xcf,
+    0x1b, 0x38, 0x7f, 0xc0, 0x76, 0x67, 0xf8, 0x3f, 0x01, 0x31, 0x3b, 0x87,
+    0x60, 0xf9, 0x90, 0x01, 0x2c, 0x2f, 0xff, 0x6d, 0xfc, 0x8c, 0x3e, 0xeb,
+    0x7f, 0x96, 0x41, 0x82, 0xfd, 0xc6, 0x93, 0x8d, 0xfa, 0x4e, 0x48, 0x49,
+    0x33, 0x3a, 0xa3, 0x5e, 0x61, 0xdf, 0x88, 0x73, 0x66, 0x04, 0xf5, 0xe5,
+    0xd7, 0xea, 0xce, 0x9e, 0xeb, 0xe1, 0x60, 0xb7, 0xf1, 0xcc, 0x0d, 0xc1,
+    0xc4, 0xa0, 0x22, 0x0d, 0xe5, 0x8c, 0x8e, 0x26, 0xf9, 0x89, 0xa5, 0x02,
+    0xf6, 0x4c, 0x3f, 0x10, 0x74, 0x96, 0xe4, 0xdb, 0x12, 0x63, 0x9a, 0xfe,
+    0x70, 0x4e, 0x9a, 0x97, 0xc8, 0xad, 0x5f, 0x39, 0xa0, 0x81, 0x6a, 0xc4,
+    0x93, 0x50, 0x94, 0x1e, 0x17, 0xe3, 0x3f, 0x6d, 0x91, 0x01, 0xed, 0x49,
+    0x96, 0xed, 0x01, 0xc2, 0x2a, 0xe1, 0xc9, 0x39, 0x76, 0x1f, 0x87, 0xb6,
+    0xe3, 0x76, 0xa1, 0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xbf, 0x82, 0xa3,
+    0x6d, 0x87, 0x72, 0x2c, 0x7c, 0xdc, 0x3f, 0x2b, 0x6a, 0xf1, 0x9a, 0xe0,
+    0x0e, 0xc3, 0xdc, 0x18, 0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x54, 0xab, 0xe3,
+    0xd6, 0x47, 0x90, 0x61, 0x87, 0x66, 0x08, 0x63, 0x95, 0x25, 0x20, 0x43,
+    0x6e, 0x05, 0x80, 0xad, 0x01, 0x10, 0xc7, 0x6c, 0x04, 0xbe, 0xaf, 0xc5,
+    0x50, 0xa7, 0x48, 0x4a, 0x47, 0x44, 0x71, 0xc9, 0xa5, 0xdb, 0xa2, 0x2b,
+    0x12, 0xbc, 0x40, 0x39, 0x31, 0x69, 0x83, 0x03, 0xb9, 0xa0, 0x46, 0xf0,
+    0xb4, 0x4b, 0x1b, 0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7e,
+    0xfe, 0x0a, 0x8d, 0xb6, 0x1d, 0xc8, 0xb1, 0xf3, 0x70, 0xfc, 0xad, 0xab,
+    0xc6, 0x6b, 0x80, 0xc8, 0xbb, 0x74, 0x45, 0x62, 0x57, 0x88, 0x07, 0x26,
+    0x2d, 0x30, 0x60, 0x77, 0x34, 0x08, 0xde, 0x16, 0x89, 0x63, 0x71, 0xbb,
+    0x50, 0xe3, 0x2c, 0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52,
+    0x53, 0x83, 0x53, 0xf6, 0x9e, 0x15, 0xb5, 0x78, 0xcd, 0x70, 0x19, 0x17,
+    0x6e, 0x88, 0xac, 0x4a, 0xf1, 0x00, 0xe4, 0xc5, 0xa6, 0x0c, 0x0e, 0xe6,
+    0x81, 0x1b, 0xc2, 0xd1, 0x2c, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed,
+    0xd2, 0xad, 0xfc, 0x15, 0xaa, 0x42, 0xea, 0x4a, 0x70, 0x6a, 0x7e, 0xd3,
+    0xc2, 0xb6, 0xaf, 0x19, 0xae, 0x03, 0x22, 0xed, 0xd1, 0x15, 0x89, 0x5e,
+    0x20, 0x1c, 0x98, 0xb4, 0xc1, 0x81, 0xdc, 0xd0, 0x23, 0x78, 0x5a, 0x25,
+    0x8d, 0xc6, 0xed, 0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5,
+    0x48, 0x5d, 0x49, 0x4e, 0x0d, 0x4f, 0xda, 0x78, 0x56, 0xd5, 0xe3, 0x35,
+    0xc0, 0x64, 0x5d, 0xba, 0x22, 0xb1, 0x2b, 0xc4, 0x03, 0x93, 0x16, 0x98,
+    0x30, 0x3b, 0x9a, 0x04, 0x6f, 0x0b, 0x44, 0xb1, 0xb8, 0xdd, 0xa8, 0x71,
+    0x96, 0x07, 0xb7, 0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x29, 0xc1,
+    0xa9, 0xfb, 0x4f, 0x0a, 0xda, 0xbc, 0x66, 0xb8, 0x0c, 0x8b, 0xb7, 0x44,
+    0x56, 0x25, 0x78, 0x80, 0x72, 0x62, 0xd3, 0x06, 0x07, 0x73, 0x40, 0x8d,
+    0xe1, 0x68, 0x96, 0x37, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56,
+    0xfe, 0x0a, 0xd5, 0x21, 0x75, 0x25, 0x38, 0x35, 0x3f, 0x69, 0xe1, 0x5b,
+    0x57, 0x8c, 0xd7, 0x01, 0x91, 0x76, 0xe8, 0x8a, 0xc4, 0xaf, 0x10, 0x0e,
+    0x4c, 0x5a, 0x60, 0xc0, 0xee, 0x68, 0x11, 0xbc, 0x2d, 0x12, 0xc6, 0xe3,
+    0x76, 0xa1, 0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e,
+    0xa4, 0xa7, 0x06, 0xa7, 0xed, 0x3c, 0x2b, 0x6a, 0xf1, 0x9a, 0xe0, 0x32,
+    0x2e, 0xdd, 0x11, 0x58, 0x95, 0xe2, 0x01, 0xc9, 0x8b, 0x4c, 0x18, 0x1d,
+    0xcd, 0x02, 0x37, 0x85, 0xa2, 0x58, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03,
+    0xdb, 0xa5, 0x5b, 0xf8, 0x2b, 0x54, 0x85, 0xd4, 0x94, 0xe0, 0xd4, 0xfd,
+    0xa7, 0x85, 0x6d, 0x5e, 0x33, 0x5c, 0x06, 0x45, 0xdb, 0xa2, 0x2b, 0x12,
+    0xbc, 0x40, 0x39, 0x31, 0x69, 0x83, 0x03, 0xb9, 0xa0, 0x46, 0xf0, 0xb4,
+    0x4b, 0x1b, 0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05,
+    0x6a, 0x90, 0xba, 0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45,
+    0xee, 0xc5, 0xfd, 0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47,
+    0x09, 0x7b, 0x5c, 0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50,
+    0xe3, 0x2c, 0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d,
+    0x50, 0x6d, 0x07, 0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36,
+    0x0c, 0x1f, 0xe2, 0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec,
+    0xc6, 0xc1, 0xf8, 0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2,
+    0xad, 0xfc, 0x15, 0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f,
+    0x4d, 0x0d, 0x17, 0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1,
+    0x17, 0x65, 0x1c, 0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d,
+    0xc6, 0xed, 0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48,
+    0x5d, 0x49, 0x35, 0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62,
+    0xfe, 0xf0, 0xd8, 0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd,
+    0xae, 0x33, 0xb3, 0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96,
+    0x07, 0xb7, 0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36,
+    0x83, 0xd2, 0x7d, 0x34, 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f,
+    0xf1, 0x2f, 0x84, 0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60,
+    0xfc, 0x3d, 0xb7, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe,
+    0x0a, 0xd5, 0x21, 0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86,
+    0x8b, 0xdd, 0x8b, 0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2,
+    0x8e, 0x12, 0xf6, 0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76,
+    0xa1, 0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4,
+    0x9a, 0xa0, 0xda, 0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78,
+    0x6c, 0x18, 0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19,
+    0xd9, 0x8d, 0x83, 0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb,
+    0xa5, 0x5b, 0xf8, 0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9,
+    0x3e, 0x9a, 0x1a, 0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97,
+    0xc2, 0x2e, 0xca, 0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e,
+    0xdb, 0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a,
+    0x90, 0xba, 0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee,
+    0xc5, 0xfd, 0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09,
+    0x7b, 0x5c, 0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3,
+    0x2c, 0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50,
+    0x6d, 0x07, 0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c,
+    0x1f, 0xe2, 0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6,
+    0xc1, 0xf8, 0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad,
+    0xfc, 0x15, 0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d,
+    0x0d, 0x17, 0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17,
+    0x65, 0x1c, 0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6,
+    0xed, 0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d,
+    0x49, 0x35, 0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe,
+    0xf0, 0xd8, 0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae,
+    0x33, 0xb3, 0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07,
+    0xb7, 0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83,
+    0xd2, 0x7d, 0x34, 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1,
+    0x2f, 0x84, 0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc,
+    0x3d, 0xb7, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a,
+    0xd5, 0x21, 0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b,
+    0xdd, 0x8b, 0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e,
+    0x12, 0xf6, 0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1,
+    0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a,
+    0xa0, 0xda, 0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c,
+    0x18, 0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9,
+    0x8d, 0x83, 0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5,
+    0x5b, 0xf8, 0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e,
+    0x9a, 0x1a, 0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2,
+    0x2e, 0xca, 0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb,
+    0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90,
+    0xba, 0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5,
+    0xfd, 0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b,
+    0x5c, 0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c,
+    0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d,
+    0x07, 0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f,
+    0xe2, 0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1,
+    0xf8, 0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc,
+    0x15, 0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d,
+    0x17, 0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65,
+    0x1c, 0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed,
+    0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49,
+    0x35, 0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0,
+    0xd8, 0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33,
+    0xb3, 0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7,
+    0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2,
+    0x7d, 0x34, 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f,
+    0x84, 0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc, 0x3d,
+    0xb7, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a, 0xd5,
+    0x21, 0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b, 0xdd,
+    0x8b, 0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e, 0x12,
+    0xf6, 0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1, 0xc6,
+    0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a, 0xa0,
+    0xda, 0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c, 0x18,
+    0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9, 0x8d,
+    0x83, 0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5, 0x5b,
+    0xf8, 0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e, 0x9a,
+    0x1a, 0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2, 0x2e,
+    0xca, 0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb, 0x8d,
+    0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90, 0xba,
+    0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5, 0xfd,
+    0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b, 0x5c,
+    0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c, 0x0f,
+    0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d, 0x07,
+    0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f, 0xe2,
+    0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1, 0xf8,
+    0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc, 0x15,
+    0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d, 0x17,
+    0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65, 0x1c,
+    0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed, 0x43,
+    0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49, 0x35,
+    0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0, 0xd8,
+    0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33, 0xb3,
+    0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7, 0x4a,
+    0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2, 0x7d,
+    0x34, 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f, 0x84,
+    0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc, 0x3d, 0xb7,
+    0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a, 0xd5, 0x21,
+    0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b, 0xdd, 0x8b,
+    0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e, 0x12, 0xf6,
+    0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1, 0xc6, 0x58,
+    0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a, 0xa0, 0xda,
+    0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c, 0x18, 0x3f,
+    0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9, 0x8d, 0x83,
+    0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5, 0x5b, 0xf8,
+    0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e, 0x9a, 0x1a,
+    0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2, 0x2e, 0xca,
+    0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb, 0x8d, 0xda,
+    0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90, 0xba, 0x92,
+    0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5, 0xfd, 0xe1,
+    0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b, 0x5c, 0x67,
+    0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c, 0x0f, 0x6e,
+    0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d, 0x07, 0xa4,
+    0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f, 0xe2, 0x5f,
+    0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1, 0xf8, 0x7b,
+    0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc, 0x15, 0xaa,
+    0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d, 0x17, 0xbb,
+    0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65, 0x1c, 0x25,
+    0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed, 0x43, 0x8c,
+    0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49, 0x35, 0x41,
+    0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0, 0xd8, 0x30,
+    0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33, 0xb3, 0x1b,
+    0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7, 0x4a, 0xb7,
+    0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2, 0x7d, 0x34,
+    0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f, 0x84, 0x5d,
+    0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc, 0x3d, 0xb7, 0x1b,
+    0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a, 0xd5, 0x21, 0x75,
+    0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b, 0xdd, 0x8b, 0xfb,
+    0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e, 0x12, 0xf6, 0xb8,
+    0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1, 0xc6, 0x58, 0x1e,
+    0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a, 0xa0, 0xda, 0x0f,
+    0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c, 0x18, 0x3f, 0xc4,
+    0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9, 0x8d, 0x83, 0xf0,
+    0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5, 0x5b, 0xf8, 0x2b,
+    0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e, 0x9a, 0x1a, 0x2f,
+    0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2, 0x2e, 0xca, 0x38,
+    0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb, 0x8d, 0xda, 0x87,
+    0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90, 0xba, 0x92, 0x6a,
+    0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5, 0xfd, 0xe1, 0xb0,
+    0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b, 0x5c, 0x67, 0x66,
+    0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c, 0x0f, 0x6e, 0x95,
+    0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d, 0x07, 0xa4, 0xfa,
+    0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f, 0xe2, 0x5f, 0x08,
+    0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1, 0xf8, 0x7b, 0x6e,
+    0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc, 0x15, 0xaa, 0x42,
+    0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d, 0x17, 0xbb, 0x17,
+    0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65, 0x1c, 0x25, 0xed,
+    0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed, 0x43, 0x8c, 0xb0,
+    0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49, 0x35, 0x41, 0xb4,
+    0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0, 0xd8, 0x30, 0x7f,
+    0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33, 0xb3, 0x1b, 0x07,
+    0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7, 0x4a, 0xb7, 0xf0,
+    0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2, 0x7d, 0x34, 0x34,
+    0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f, 0x84, 0x5d, 0x94,
+    0x70, 0x97, 0xb5, 0xc6, 0x7c,
+};
+static_assert(sizeof(kBytesTestReadSymbol12) == kNumBytesTestReadSymbol12, "");
+
+// The kBytesTestReadSymbol13[] array was encoded by using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][14] = {
+//   // pmf: 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13,
+//   // 1/13, 1/13
+//   { 32768 - 2521, 32768 - 5041, 32768 - 7562, 32768 - 10082, 32768 - 12603,
+//     32768 - 15124, 32768 - 17644, 32768 - 20165, 32768 - 22686,
+//     32768 - 25206, 32768 - 27727, 32768 - 30247, 0, 0 },
+//   // pmf: 3/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26,
+//   // 2/26, 1/26
+//   { 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343, 32768 - 13863,
+//     32768 - 16384, 32768 - 18905, 32768 - 21425, 32768 - 23946,
+//     32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0 },
+//   // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26,
+//   // 2/26, 3/26
+//   { 32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343,
+//     32768 - 13863, 32768 - 16384, 32768 - 18905, 32768 - 21425,
+//     32768 - 23946, 32768 - 26466, 32768 - 28987, 0, 0 },
+//   // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 4/26, 2/26, 2/26, 2/26, 2/26,
+//   // 2/26, 1/26
+//   { 32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343,
+//     32768 - 13863, 32768 - 18905, 32768 - 21425, 32768 - 23946,
+//     32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0 },
+// };
+// constexpr int kSymbols[26][4] = { { 0, 6, 12, 5 },     //
+//                                   { 1, 7, 11, 4 },     //
+//                                   { 2, 8, 10, 3 },     //
+//                                   { 3, 9, 9, 2 },      //
+//                                   { 4, 10, 8, 1 },     //
+//                                   { 5, 11, 7, 0 },     //
+//                                   { 6, 12, 6, 12 },    //
+//                                   { 7, 0, 5, 11 },     //
+//                                   { 8, 1, 4, 10 },     //
+//                                   { 9, 2, 3, 9 },      //
+//                                   { 10, 3, 2, 8 },     //
+//                                   { 11, 4, 1, 7 },     //
+//                                   { 12, 5, 0, 6 },     //
+//                                   { 0, 0, 12, 11 },    //
+//                                   { 2, 1, 10, 9 },     //
+//                                   { 4, 3, 8, 7 },      //
+//                                   { 6, 5, 6, 5 },      //
+//                                   { 8, 7, 4, 3 },      //
+//                                   { 10, 9, 2, 1 },     //
+//                                   { 12, 11, 12, 10 },  //
+//                                   { 1, 0, 11, 8 },     //
+//                                   { 3, 2, 9, 6 },      //
+//                                   { 5, 4, 7, 4 },      //
+//                                   { 7, 6, 5, 2 },      //
+//                                   { 9, 8, 3, 6 },      //
+//                                   { 11, 10, 1, 6 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 64; ++i) {
+//   for (int j = 0; j < 26; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 13);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n    ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
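+// Note on the cdf[] values above: each entry is stored in libaom's "inverse
+// CDF" form, 32768 minus the scaled cumulative probability. For the uniform
+// 13-symbol pmf, P(symbol <= 0) = 1/13, and 32768 / 13 rounds to 2521, giving
+// the "32768 - 2521" first entry. A hypothetical helper that regenerates one
+// such row from pmf numerators (shown only to document the convention, not
+// code used by this test):
+//
+// void PrintInverseCdfRow(const int* pmf, int num_symbols, int denominator) {
+//   int cumulative = 0;
+//   for (int i = 0; i < num_symbols - 1; ++i) {
+//     cumulative += pmf[i];
+//     // Each stored entry is 32768 minus the rounded cumulative probability.
+//     printf("32768 - %d, ",
+//            (32768 * cumulative + denominator / 2) / denominator);
+//   }
+//   // The last symbol's entry is always 0; the extra trailing slot is the
+//   // CDF adaptation counter, also initialized to 0.
+//   printf("0, 0,\n");
+// }
+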
+constexpr size_t kNumBytesTestReadSymbol13 = 3110;
+constexpr uint8_t kBytesTestReadSymbol13[] = {
+    0x0b, 0x38, 0xa7, 0x3e, 0xde, 0x47, 0x2e, 0xe6, 0x9e, 0xe0, 0xa8, 0xc4,
+    0x77, 0xda, 0x41, 0x64, 0x49, 0x60, 0xc4, 0x26, 0x68, 0xac, 0xf4, 0xa6,
+    0x8c, 0x6e, 0xa6, 0xd3, 0xd9, 0x4b, 0xb9, 0x35, 0xb6, 0x53, 0x6c, 0x73,
+    0x13, 0xd7, 0xfb, 0xbf, 0x96, 0xac, 0xea, 0x86, 0xb5, 0x24, 0x14, 0x2a,
+    0x5a, 0x41, 0x38, 0xab, 0xfb, 0x92, 0x74, 0xf4, 0x0f, 0x24, 0xde, 0x2d,
+    0x2d, 0x12, 0xd7, 0xb8, 0x2f, 0x4a, 0x4c, 0xd6, 0xc0, 0x4b, 0x01, 0x98,
+    0xca, 0x7e, 0xde, 0x03, 0x75, 0x27, 0x59, 0x4f, 0x32, 0x54, 0xa5, 0xb5,
+    0x79, 0xc3, 0xc4, 0x3c, 0x76, 0xa3, 0x2f, 0xaf, 0x2f, 0x0a, 0x84, 0xb5,
+    0x60, 0xf5, 0x73, 0x88, 0xc0, 0x24, 0x1c, 0xfb, 0xff, 0x90, 0xb6, 0x05,
+    0xe9, 0x43, 0x90, 0xc8, 0xd3, 0xfd, 0x3f, 0xc2, 0x0b, 0xb5, 0xfe, 0x12,
+    0x55, 0x23, 0xa1, 0xf4, 0xba, 0xc7, 0x1f, 0xc3, 0xe5, 0xe3, 0x76, 0x68,
+    0x3c, 0x57, 0xb9, 0x92, 0xea, 0x25, 0x93, 0x4e, 0x72, 0xff, 0x63, 0x28,
+    0x0c, 0x90, 0x1d, 0xb6, 0x42, 0xb2, 0x25, 0x79, 0x8e, 0xee, 0x0c, 0x56,
+    0x3d, 0x94, 0x3d, 0x80, 0xf2, 0x25, 0x6f, 0xd4, 0x93, 0x31, 0x18, 0x80,
+    0x5a, 0x3a, 0xbb, 0x4d, 0xbb, 0x77, 0xc3, 0xb0, 0x20, 0x0e, 0xd3, 0xd8,
+    0x10, 0x05, 0xb2, 0x81, 0x57, 0xf5, 0x8c, 0xe5, 0xac, 0x46, 0xc0, 0xae,
+    0x9c, 0x08, 0x9d, 0x51, 0xf3, 0x16, 0xb9, 0xd7, 0x90, 0xa7, 0x9f, 0x40,
+    0x5d, 0x14, 0xd1, 0xbd, 0xa2, 0x0b, 0xf3, 0xae, 0x3b, 0xfb, 0x0f, 0xe1,
+    0x1a, 0x6e, 0x63, 0x3b, 0xdb, 0x41, 0x8e, 0xe8, 0x1f, 0x20, 0x18, 0xbe,
+    0x69, 0x10, 0x86, 0x06, 0x06, 0x23, 0x3a, 0x40, 0xc1, 0x7f, 0x2e, 0x32,
+    0xb4, 0x23, 0xac, 0x4b, 0x25, 0x6b, 0xef, 0xaf, 0xec, 0x5c, 0xf2, 0xd0,
+    0x61, 0xb2, 0x3a, 0xa5, 0x3d, 0xcd, 0xf7, 0x99, 0x6b, 0x4e, 0xbb, 0x58,
+    0x6a, 0x4c, 0xd7, 0xc0, 0x77, 0xd9, 0xae, 0x15, 0x7e, 0xde, 0xc9, 0xd8,
+    0x24, 0x39, 0x3f, 0xa4, 0xf3, 0x24, 0x7e, 0xe0, 0x22, 0x19, 0x40, 0x3d,
+    0x0c, 0xb0, 0xb7, 0xe3, 0x4b, 0x82, 0x6f, 0x82, 0x0e, 0xb1, 0x91, 0xef,
+    0x84, 0x98, 0x69, 0x66, 0x24, 0xe7, 0x90, 0x13, 0x0d, 0xbd, 0x6b, 0x92,
+    0xee, 0x1c, 0x0f, 0xe7, 0xfa, 0xb9, 0xb4, 0x6c, 0x68, 0x98, 0x4c, 0x27,
+    0x42, 0xad, 0x5f, 0x8f, 0xe5, 0x25, 0xf9, 0x67, 0x84, 0x86, 0x2e, 0xf6,
+    0x51, 0x71, 0x0d, 0x6c, 0x45, 0x8f, 0x96, 0x15, 0x73, 0xab, 0xff, 0xc0,
+    0x87, 0x14, 0xba, 0x00, 0x67, 0x2c, 0x27, 0x03, 0xff, 0xa6, 0xe3, 0x09,
+    0xae, 0xbb, 0xa5, 0x49, 0xee, 0x5f, 0x47, 0xc0, 0x30, 0x4a, 0x93, 0x28,
+    0x48, 0x4d, 0x30, 0x49, 0xe7, 0xe6, 0x79, 0x96, 0x75, 0x6c, 0x62, 0xbc,
+    0x9f, 0xaa, 0x39, 0x63, 0x1d, 0x33, 0xce, 0xd2, 0xa3, 0xd1, 0x93, 0xed,
+    0x8d, 0xa6, 0xbd, 0x02, 0xf0, 0x44, 0xd5, 0x9e, 0x29, 0x02, 0x46, 0x87,
+    0xaf, 0xdb, 0xfb, 0x20, 0x29, 0x26, 0xb7, 0x8c, 0x75, 0xee, 0xe9, 0x29,
+    0x53, 0x01, 0x4a, 0xaa, 0xc2, 0x9f, 0x6c, 0x30, 0x21, 0x83, 0xa6, 0x09,
+    0x32, 0x1d, 0xaa, 0x00, 0x6c, 0xea, 0x9c, 0x84, 0x16, 0x16, 0x0c, 0x06,
+    0xcc, 0xf0, 0x19, 0xce, 0x57, 0xb3, 0x9f, 0x57, 0xf0, 0xdc, 0xda, 0x86,
+    0x85, 0x2f, 0x09, 0x33, 0x8d, 0x59, 0xb8, 0xc1, 0x08, 0x4c, 0xee, 0xf8,
+    0x33, 0x3d, 0x23, 0x13, 0x78, 0xa3, 0x98, 0xbf, 0xab, 0xef, 0x15, 0xe2,
+    0x8d, 0xdb, 0xb4, 0xd0, 0x4b, 0x2f, 0x04, 0x3f, 0x6b, 0x11, 0xf0, 0x05,
+    0xc7, 0x53, 0x1e, 0xc9, 0x73, 0x11, 0x81, 0xd3, 0xde, 0x21, 0xd8, 0x14,
+    0x10, 0xbe, 0x30, 0xb2, 0x48, 0x55, 0x9b, 0x8c, 0x10, 0x84, 0xce, 0xef,
+    0x83, 0x2f, 0x03, 0x10, 0x09, 0x0f, 0x70, 0xa8, 0x84, 0xea, 0x15, 0xdb,
+    0xc7, 0xdf, 0x6f, 0x67, 0x5d, 0x1c, 0xc7, 0x1a, 0x1c, 0x15, 0xa6, 0x92,
+    0xed, 0x63, 0xf0, 0xed, 0x77, 0x5d, 0x12, 0x1b, 0x8c, 0xab, 0x3e, 0xfa,
+    0x12, 0xf6, 0x83, 0xda, 0x41, 0xbc, 0x97, 0x76, 0xb9, 0x1f, 0xc9, 0x36,
+    0xc7, 0xe3, 0x9f, 0x93, 0x2e, 0x27, 0xdc, 0x90, 0x84, 0x6d, 0x81, 0x04,
+    0x09, 0x4f, 0x10, 0xb9, 0x53, 0xd9, 0x8f, 0x99, 0x2b, 0x8b, 0x53, 0x4f,
+    0xe8, 0x3e, 0x82, 0x1b, 0x0c, 0x3d, 0xbc, 0xe5, 0x5c, 0x13, 0xed, 0x4b,
+    0x0b, 0x05, 0x72, 0xaa, 0xd2, 0xcf, 0xfc, 0x9f, 0xd0, 0xfd, 0xc7, 0xc6,
+    0xc0, 0xa3, 0xa7, 0x05, 0xbb, 0x9e, 0xae, 0x63, 0xc0, 0x3d, 0x73, 0x92,
+    0xe1, 0x98, 0xe4, 0xa5, 0xb3, 0xc4, 0x36, 0x90, 0x35, 0x6b, 0xab, 0x35,
+    0x06, 0x98, 0xca, 0x35, 0x20, 0x5a, 0x6a, 0x84, 0x5c, 0x88, 0xca, 0x64,
+    0x43, 0x87, 0xf2, 0x3c, 0x13, 0x58, 0x1c, 0x35, 0x2c, 0xf2, 0x1d, 0x5e,
+    0xe0, 0x1b, 0x2c, 0x59, 0xc2, 0xcd, 0xf2, 0x96, 0x1a, 0x75, 0x3c, 0x10,
+    0xe7, 0xe3, 0xa1, 0xbc, 0xec, 0x03, 0x79, 0x58, 0x26, 0x4d, 0xcf, 0xb4,
+    0x00, 0xd3, 0x46, 0xee, 0x99, 0x52, 0x2f, 0x54, 0xcb, 0xa1, 0x75, 0xa1,
+    0xa0, 0xf4, 0xaa, 0xe9, 0x4a, 0xe1, 0x74, 0xcc, 0xd1, 0x47, 0xda, 0x48,
+    0x8b, 0x2e, 0xf9, 0x54, 0x98, 0x4e, 0x4f, 0x5a, 0x1b, 0xf5, 0x66, 0x62,
+    0xa0, 0xc2, 0x0e, 0x1a, 0x91, 0xbd, 0x7a, 0x33, 0xfd, 0x7c, 0xfc, 0x8b,
+    0xc0, 0x92, 0xd8, 0x97, 0x48, 0x6f, 0xf4, 0xe0, 0x6c, 0xcf, 0x17, 0xc9,
+    0x44, 0x04, 0xcf, 0x50, 0x0d, 0x8f, 0xbc, 0x4f, 0x4e, 0x1d, 0x38, 0x38,
+    0x5c, 0xb7, 0x8e, 0xe7, 0x52, 0xbe, 0x04, 0x68, 0x79, 0x9e, 0x68, 0x32,
+    0x3b, 0xe4, 0xee, 0x65, 0x76, 0xf6, 0xb4, 0x47, 0x1c, 0xa5, 0xd0, 0x20,
+    0x0f, 0x94, 0xe1, 0x2f, 0xa8, 0x87, 0xeb, 0xda, 0x2c, 0x54, 0xc4, 0x07,
+    0x08, 0x89, 0xdc, 0xcf, 0x73, 0x0c, 0x1f, 0xea, 0xb4, 0x6d, 0xea, 0x17,
+    0x70, 0x82, 0xb5, 0x18, 0x2f, 0x38, 0xc5, 0x47, 0x47, 0xd6, 0x37, 0x20,
+    0x8d, 0x71, 0xd6, 0x16, 0x4d, 0x16, 0xd5, 0x77, 0x36, 0xb5, 0xd0, 0x20,
+    0x5f, 0x4d, 0x89, 0x6c, 0x49, 0xc4, 0x13, 0x6c, 0x26, 0x8c, 0x8f, 0x6f,
+    0x17, 0xab, 0xdf, 0x57, 0xa8, 0xab, 0xed, 0x8d, 0xa9, 0x00, 0x6b, 0xfc,
+    0xf6, 0x72, 0xaf, 0x32, 0xc2, 0x0b, 0xb6, 0x6b, 0x7a, 0xac, 0xa9, 0x77,
+    0x52, 0x87, 0x98, 0x43, 0x21, 0x72, 0x35, 0x6c, 0x27, 0x12, 0xbe, 0xf0,
+    0x62, 0x16, 0x2a, 0xc6, 0xf7, 0x48, 0xd2, 0xc3, 0x25, 0xb4, 0x6a, 0x57,
+    0x65, 0xd6, 0x07, 0xa0, 0xde, 0x9f, 0x3b, 0x3d, 0xdd, 0x27, 0x0e, 0x4c,
+    0xe8, 0x4b, 0xe1, 0xd6, 0x33, 0xa7, 0x85, 0x75, 0x44, 0x7e, 0xf9, 0xfd,
+    0xb9, 0x98, 0xa8, 0x30, 0x82, 0xdf, 0xd9, 0x97, 0x5c, 0x3f, 0x52, 0x20,
+    0xd4, 0x38, 0x88, 0xc1, 0x53, 0x11, 0x14, 0x25, 0x6f, 0xeb, 0x4e, 0xf5,
+    0xed, 0xf4, 0xba, 0x34, 0x23, 0x74, 0xbc, 0x46, 0x51, 0x96, 0x1b, 0x50,
+    0x32, 0x03, 0xe5, 0x6d, 0xd7, 0xcf, 0xca, 0x60, 0xb2, 0xbc, 0xb6, 0x4b,
+    0xc0, 0xee, 0x8b, 0x96, 0xa9, 0x4c, 0x1d, 0x9b, 0x2d, 0x11, 0xc7, 0x29,
+    0x74, 0x08, 0x03, 0xe5, 0x1c, 0xe2, 0x6c, 0x21, 0x1e, 0x02, 0x4d, 0xb1,
+    0x4e, 0x70, 0xb3, 0xfc, 0x06, 0xa5, 0xf9, 0xfb, 0x35, 0x1c, 0x89, 0xe3,
+    0x1e, 0x27, 0xe0, 0x93, 0xd6, 0xd5, 0x15, 0x94, 0x40, 0x88, 0x71, 0xfd,
+    0xaa, 0xbd, 0xf6, 0xae, 0x61, 0x52, 0x49, 0x33, 0x99, 0x85, 0xcd, 0x13,
+    0x70, 0x7e, 0x1b, 0x76, 0x3a, 0x69, 0x9e, 0xfe, 0x3c, 0x65, 0x22, 0xf0,
+    0x1f, 0x91, 0x57, 0x00, 0x5b, 0x28, 0xac, 0x1e, 0x1e, 0x24, 0xc7, 0xd8,
+    0xdb, 0x3a, 0xd0, 0x85, 0x04, 0x4d, 0xf7, 0xe8, 0x3b, 0xdc, 0xa1, 0x5b,
+    0x5e, 0xe3, 0x7a, 0xae, 0x72, 0x70, 0x7c, 0x52, 0x07, 0xf5, 0x1c, 0xda,
+    0xd7, 0x40, 0x81, 0x7d, 0x36, 0x0a, 0x97, 0x8e, 0x0c, 0x25, 0xe7, 0xd3,
+    0x81, 0xb0, 0xe2, 0xd0, 0x56, 0x16, 0x9c, 0x9d, 0x0e, 0xc7, 0x97, 0x8f,
+    0xff, 0x68, 0xd4, 0x4f, 0x1a, 0x4c, 0x58, 0x6f, 0xe4, 0xd5, 0xc1, 0x07,
+    0x7f, 0x31, 0x8c, 0x59, 0x02, 0x6f, 0xa7, 0x54, 0x1b, 0x02, 0x35, 0xe5,
+    0x14, 0xec, 0x35, 0x3d, 0x17, 0x72, 0x11, 0x0c, 0x38, 0x62, 0x99, 0x4a,
+    0x6a, 0x46, 0xcb, 0x36, 0x1b, 0x4b, 0x38, 0xff, 0x1d, 0xa4, 0xf7, 0x21,
+    0xda, 0x73, 0x42, 0xc4, 0x2b, 0xf8, 0xd8, 0x43, 0x73, 0x60, 0x11, 0x22,
+    0xc9, 0xe6, 0x07, 0xca, 0xa0, 0x29, 0x2a, 0x20, 0xd9, 0xdd, 0x7d, 0xed,
+    0x28, 0x10, 0xde, 0xbe, 0x5e, 0xfd, 0x0c, 0x06, 0x4b, 0x1c, 0xc4, 0x56,
+    0xc4, 0x12, 0x25, 0x5a, 0xd1, 0xfe, 0x03, 0x5e, 0x5e, 0xe0, 0x42, 0x8e,
+    0x44, 0xf1, 0x8f, 0x13, 0xf0, 0x49, 0xeb, 0x59, 0xf3, 0x5b, 0x61, 0xd9,
+    0xa4, 0xdf, 0x2e, 0x2a, 0x70, 0xc2, 0xf0, 0xef, 0x16, 0xf4, 0x1b, 0x5c,
+    0xbd, 0x77, 0x42, 0xb9, 0x4c, 0x56, 0x8d, 0xc8, 0xf8, 0x05, 0xbd, 0x52,
+    0xba, 0x6e, 0xe1, 0x89, 0xe1, 0xf2, 0xdb, 0xa7, 0xdf, 0xe0, 0xee, 0xc1,
+    0x5c, 0x9e, 0x90, 0x11, 0x17, 0xd5, 0xc1, 0xb9, 0x2c, 0x08, 0x62, 0x0d,
+    0x75, 0x05, 0xb2, 0xad, 0x22, 0xd6, 0x5c, 0x6e, 0xed, 0xa4, 0x06, 0x5a,
+    0x42, 0x4f, 0xbf, 0x84, 0x53, 0xfa, 0x0b, 0xb7, 0x47, 0x6c, 0xba, 0x07,
+    0xc9, 0xe4, 0x8c, 0xe4, 0xa3, 0x40, 0xdc, 0xcb, 0x58, 0xeb, 0xba, 0xc5,
+    0xcc, 0x56, 0x74, 0x1e, 0x7b, 0x0f, 0x2a, 0xce, 0x35, 0x46, 0x39, 0x6d,
+    0x81, 0x91, 0xb2, 0x05, 0x76, 0xfa, 0x8f, 0x43, 0x46, 0x25, 0xb7, 0x98,
+    0x4e, 0x5f, 0x63, 0xf4, 0x0e, 0x4f, 0x5d, 0x85, 0x29, 0x9d, 0xdb, 0xa8,
+    0xeb, 0x0a, 0xbb, 0xc4, 0xf8, 0x5a, 0xda, 0xe1, 0x9b, 0x1f, 0x9b, 0x4d,
+    0x62, 0x65, 0x41, 0x34, 0x5b, 0x6c, 0x19, 0xa5, 0x3c, 0x35, 0x8e, 0x14,
+    0x02, 0xcd, 0x1d, 0xf3, 0xfb, 0x70, 0x93, 0x46, 0xe2, 0x49, 0xc8, 0x31,
+    0xfd, 0x47, 0x35, 0xfc, 0x7d, 0xb9, 0x79, 0xf7, 0x0d, 0xed, 0x98, 0x47,
+    0xd2, 0xcf, 0x26, 0x8b, 0x10, 0x6f, 0x86, 0xca, 0xda, 0xb8, 0x41, 0xdb,
+    0x0c, 0xc7, 0xc3, 0x56, 0xc5, 0x0f, 0xc7, 0xf2, 0xda, 0x45, 0xdf, 0x94,
+    0xc1, 0x65, 0x79, 0x6c, 0x97, 0x81, 0xbd, 0xf1, 0x1e, 0x26, 0x6e, 0xfc,
+    0x4f, 0x2e, 0x1e, 0x9c, 0xa2, 0x69, 0x54, 0x7a, 0xc3, 0x15, 0x44, 0x64,
+    0x73, 0x11, 0x5b, 0x10, 0x48, 0x95, 0x6b, 0x49, 0x4e, 0xcb, 0x2b, 0x12,
+    0x90, 0xaf, 0xf5, 0x5a, 0xfa, 0xf5, 0x0b, 0xb8, 0x49, 0x0a, 0x7d, 0xc4,
+    0x6b, 0x0a, 0xa5, 0x6d, 0x32, 0xb2, 0x33, 0x3c, 0xb3, 0x65, 0x9c, 0x1f,
+    0x7e, 0x50, 0xd3, 0x6a, 0xa2, 0xc1, 0xb9, 0xd9, 0xfa, 0x25, 0xfe, 0x1c,
+    0x3f, 0x88, 0x47, 0x0a, 0x7e, 0x62, 0xa2, 0xf3, 0x3e, 0xae, 0x9f, 0x7f,
+    0x83, 0xbb, 0x05, 0x72, 0x7a, 0x40, 0x44, 0x5f, 0x57, 0x06, 0xe4, 0xb0,
+    0x21, 0x88, 0x35, 0xd4, 0x16, 0xca, 0xb4, 0x8b, 0x59, 0x71, 0xbb, 0xb6,
+    0x90, 0x19, 0x69, 0x09, 0x3e, 0xfe, 0x11, 0x4f, 0xe8, 0x2e, 0xdd, 0x1d,
+    0xb2, 0xe8, 0x1f, 0x27, 0x92, 0x33, 0x92, 0x8d, 0x04, 0x2e, 0x19, 0x16,
+    0xb4, 0xb5, 0xcf, 0x52, 0x98, 0xcc, 0x2b, 0x85, 0x0c, 0x2d, 0x88, 0x38,
+    0x24, 0x06, 0xf2, 0x47, 0xec, 0xce, 0xc6, 0xf7, 0x4e, 0xe4, 0x8b, 0xb5,
+    0x4f, 0xbe, 0xae, 0x13, 0xd5, 0x0c, 0xe6, 0x13, 0x44, 0xa4, 0x76, 0x19,
+    0x8c, 0x25, 0x28, 0x0f, 0x15, 0x8e, 0xa6, 0x9c, 0xee, 0x6e, 0xf0, 0x55,
+    0x9d, 0x5a, 0x8f, 0xf6, 0x08, 0x27, 0x92, 0x1f, 0xcb, 0x4c, 0x8c, 0x2c,
+    0xeb, 0x44, 0x26, 0x48, 0xec, 0x2e, 0x9b, 0xb3, 0xd9, 0x17, 0xee, 0x52,
+    0x7d, 0x32, 0x47, 0x88, 0x4d, 0xf9, 0x11, 0xfc, 0xac, 0xa3, 0xb0, 0xc9,
+    0x5e, 0x38, 0xa3, 0x8d, 0x56, 0xc8, 0x83, 0x7c, 0x53, 0x38, 0xe1, 0xd0,
+    0x28, 0x7d, 0xc1, 0x65, 0x99, 0x39, 0x58, 0x36, 0xa3, 0x66, 0x71, 0x4c,
+    0x28, 0xcb, 0x9f, 0xb5, 0x58, 0x4b, 0xa3, 0x5c, 0x4e, 0xf9, 0x8d, 0x5b,
+    0x0c, 0xf1, 0x32, 0xbb, 0xe3, 0xb4, 0x47, 0xe8, 0x1c, 0x9e, 0xbb, 0x0a,
+    0x53, 0x3b, 0xb7, 0x51, 0xd6, 0x15, 0x77, 0x89, 0xf0, 0xb5, 0xba, 0x71,
+    0x84, 0x16, 0x81, 0xb0, 0xdf, 0x67, 0x12, 0x9f, 0xe7, 0x43, 0x70, 0x3a,
+    0xb1, 0xdc, 0x40, 0x31, 0xe7, 0xdd, 0x6b, 0x74, 0xfc, 0x18, 0x7d, 0x0d,
+    0xba, 0xda, 0x67, 0x66, 0x56, 0x43, 0x42, 0x80, 0xc6, 0x7c, 0xb3, 0x6c,
+    0x89, 0x2e, 0xc7, 0x0d, 0x97, 0x8a, 0xbe, 0x1a, 0x36, 0x05, 0x10, 0x85,
+    0x96, 0xa8, 0xbd, 0x29, 0x85, 0x52, 0xdc, 0xa3, 0x92, 0x20, 0xa1, 0xb0,
+    0x45, 0x5a, 0x7e, 0xc3, 0x4c, 0x0b, 0x6f, 0x3a, 0xe4, 0xfe, 0x55, 0x01,
+    0x49, 0x51, 0x06, 0xe7, 0xbb, 0x91, 0xd2, 0x77, 0x80, 0x1e, 0x07, 0xc7,
+    0xe8, 0x60, 0x32, 0x58, 0xe6, 0x22, 0xb6, 0x20, 0x91, 0x2a, 0xd6, 0x92,
+    0x9d, 0x96, 0x56, 0x25, 0x21, 0x5f, 0xea, 0xb5, 0xf5, 0xea, 0x17, 0x70,
+    0x92, 0x14, 0xfb, 0x88, 0xd6, 0x15, 0x4a, 0xda, 0x65, 0x64, 0x66, 0x79,
+    0x66, 0xcb, 0x38, 0x3e, 0xfc, 0xa1, 0xa0, 0x96, 0xf7, 0xb0, 0x4d, 0x87,
+    0x80, 0x05, 0x1e, 0x85, 0xd8, 0xb8, 0xf8, 0x50, 0x3e, 0x9d, 0xc1, 0x83,
+    0x81, 0x15, 0x59, 0x5d, 0x49, 0xd0, 0xed, 0x25, 0x2a, 0xf3, 0x59, 0xe4,
+    0xc6, 0x4b, 0xc2, 0x0f, 0x19, 0x92, 0x2f, 0x7f, 0x96, 0xd0, 0x90, 0x08,
+    0xef, 0x4f, 0x57, 0xa5, 0x3e, 0xec, 0xbe, 0xa5, 0x31, 0xd5, 0xcb, 0xbb,
+    0xab, 0xde, 0x3b, 0xc8, 0x62, 0x8e, 0x35, 0x5b, 0x22, 0x0d, 0xf1, 0x4c,
+    0xe3, 0x87, 0x40, 0xa1, 0xf7, 0x05, 0x96, 0x64, 0xe5, 0x60, 0xda, 0x8d,
+    0x99, 0xc5, 0x30, 0xa3, 0x2e, 0x7e, 0xd5, 0x61, 0x2e, 0x8d, 0x71, 0x3b,
+    0xe6, 0x35, 0x6c, 0x33, 0xc4, 0xca, 0xef, 0x8e, 0xd1, 0x1f, 0xa0, 0x72,
+    0x7a, 0xec, 0x29, 0x4c, 0xee, 0xdd, 0x47, 0x58, 0x55, 0xde, 0x27, 0xc2,
+    0xd6, 0xe9, 0xc6, 0x10, 0x5a, 0x06, 0xc3, 0x7d, 0x9c, 0x4a, 0x7f, 0x9d,
+    0x0d, 0xc0, 0xea, 0xc7, 0x71, 0x00, 0xc7, 0x9f, 0x75, 0xad, 0xd3, 0xf0,
+    0x61, 0xf4, 0x36, 0xeb, 0x69, 0x9d, 0x99, 0x59, 0x0d, 0x0a, 0x03, 0x19,
+    0xf2, 0xcd, 0xb2, 0x24, 0xbb, 0x1c, 0x36, 0x5e, 0x2a, 0xf8, 0x68, 0xd8,
+    0x14, 0x42, 0x16, 0x5a, 0xa2, 0xf4, 0xa6, 0x15, 0x4b, 0x72, 0x8e, 0x48,
+    0x82, 0x86, 0xc1, 0x15, 0x69, 0xfb, 0x0d, 0x30, 0x2d, 0xbc, 0xeb, 0x93,
+    0xf9, 0x54, 0x05, 0x25, 0x44, 0x1b, 0x9e, 0xee, 0x47, 0x49, 0xde, 0x00,
+    0x78, 0x1f, 0x1f, 0xa1, 0x80, 0xc9, 0x63, 0x98, 0x8a, 0xd8, 0x82, 0x44,
+    0xab, 0x5a, 0x4a, 0x76, 0x59, 0x58, 0x94, 0x85, 0x7f, 0xaa, 0xd7, 0xd7,
+    0xa8, 0x5d, 0xc2, 0x48, 0x53, 0xee, 0x23, 0x58, 0x55, 0x2b, 0x69, 0x95,
+    0x91, 0x99, 0xe5, 0x9b, 0x2c, 0xe0, 0xfb, 0xf2, 0x86, 0x82, 0x5b, 0xde,
+    0xc1, 0x36, 0x1e, 0x00, 0x14, 0x7a, 0x17, 0x62, 0xe3, 0xe1, 0x40, 0xfa,
+    0x77, 0x06, 0x0e, 0x04, 0x55, 0x65, 0x75, 0x27, 0x43, 0xb4, 0x94, 0xab,
+    0xcd, 0x67, 0x93, 0x19, 0x2f, 0x08, 0x3c, 0x66, 0x48, 0xbd, 0xfe, 0x5b,
+    0x42, 0x40, 0x23, 0xbd, 0x3d, 0x5e, 0x94, 0xfb, 0xb2, 0xfa, 0x94, 0xc7,
+    0x57, 0x2e, 0xee, 0xaf, 0x78, 0xef, 0x21, 0x8a, 0x38, 0xd5, 0x6c, 0x88,
+    0x37, 0xc5, 0x33, 0x8e, 0x1d, 0x02, 0x87, 0xdc, 0x16, 0x59, 0x93, 0x95,
+    0x83, 0x6a, 0x36, 0x67, 0x14, 0xc2, 0x8c, 0xb9, 0xfb, 0x55, 0x84, 0xba,
+    0x35, 0xc4, 0xef, 0x98, 0xd5, 0xb0, 0xcf, 0x13, 0x2b, 0xbe, 0x3b, 0x44,
+    0x7e, 0x81, 0xc9, 0xeb, 0xb0, 0xa5, 0x33, 0xbb, 0x75, 0x1d, 0x61, 0x57,
+    0x78, 0x9f, 0x0b, 0x5b, 0xa7, 0x18, 0x41, 0x68, 0x1b, 0x0d, 0xf6, 0x71,
+    0x29, 0xfe, 0x74, 0x37, 0x03, 0xab, 0x1d, 0xc4, 0x03, 0x1e, 0x7d, 0xd6,
+    0xb7, 0x4f, 0xc1, 0x87, 0xd0, 0xdb, 0xad, 0xa6, 0x76, 0x65, 0x64, 0x34,
+    0x28, 0x0c, 0x67, 0xcb, 0x36, 0xc8, 0x92, 0xec, 0x70, 0xd9, 0x78, 0xab,
+    0xe1, 0xa3, 0x60, 0x51, 0x08, 0x59, 0x6a, 0x8b, 0xd2, 0x98, 0x55, 0x2d,
+    0xca, 0x39, 0x22, 0x0a, 0x1b, 0x04, 0x55, 0xa7, 0xec, 0x34, 0xc0, 0xb6,
+    0xf3, 0xae, 0x4f, 0xe5, 0x50, 0x14, 0x95, 0x10, 0x6e, 0x7b, 0xb9, 0x1d,
+    0x27, 0x78, 0x01, 0xe0, 0x7c, 0x7e, 0x86, 0x03, 0x25, 0x8e, 0x62, 0x2b,
+    0x62, 0x09, 0x12, 0xad, 0x69, 0x29, 0xd9, 0x65, 0x62, 0x52, 0x15, 0xfe,
+    0xab, 0x5f, 0x5e, 0xa1, 0x77, 0x09, 0x21, 0x4f, 0xb8, 0x8d, 0x61, 0x54,
+    0xad, 0xa6, 0x56, 0x46, 0x67, 0x96, 0x6c, 0xb3, 0x83, 0xef, 0xca, 0x1a,
+    0x09, 0x6f, 0x7b, 0x04, 0xd8, 0x78, 0x00, 0x51, 0xe8, 0x5d, 0x8b, 0x8f,
+    0x85, 0x03, 0xe9, 0xdc, 0x18, 0x38, 0x11, 0x55, 0x95, 0xd4, 0x9d, 0x0e,
+    0xd2, 0x52, 0xaf, 0x35, 0x9e, 0x4c, 0x64, 0xbc, 0x20, 0xf1, 0x99, 0x22,
+    0xf7, 0xf9, 0x6d, 0x09, 0x00, 0x8e, 0xf4, 0xf5, 0x7a, 0x53, 0xee, 0xcb,
+    0xea, 0x53, 0x1d, 0x5c, 0xbb, 0xba, 0xbd, 0xe3, 0xbc, 0x86, 0x28, 0xe3,
+    0x55, 0xb2, 0x20, 0xdf, 0x14, 0xce, 0x38, 0x74, 0x0a, 0x1f, 0x70, 0x59,
+    0x66, 0x4e, 0x56, 0x0d, 0xa8, 0xd9, 0x9c, 0x53, 0x0a, 0x32, 0xe7, 0xed,
+    0x56, 0x12, 0xe8, 0xd7, 0x13, 0xbe, 0x63, 0x56, 0xc3, 0x3c, 0x4c, 0xae,
+    0xf8, 0xed, 0x11, 0xfa, 0x07, 0x27, 0xae, 0xc2, 0x94, 0xce, 0xed, 0xd4,
+    0x75, 0x85, 0x5d, 0xe2, 0x7c, 0x2d, 0x6e, 0x9c, 0x61, 0x05, 0xa0, 0x6c,
+    0x37, 0xd9, 0xc4, 0xa7, 0xf9, 0xd0, 0xdc, 0x0e, 0xac, 0x77, 0x10, 0x0c,
+    0x79, 0xf7, 0x5a, 0xdd, 0x3f, 0x06, 0x1f, 0x43, 0x6e, 0xb6, 0x99, 0xd9,
+    0x95, 0x90, 0xd0, 0xa0, 0x31, 0x9f, 0x2c, 0xdb, 0x22, 0x4b, 0xb1, 0xc3,
+    0x65, 0xe2, 0xaf, 0x86, 0x8d, 0x81, 0x44, 0x21, 0x65, 0xaa, 0x2f, 0x4a,
+    0x61, 0x54, 0xb7, 0x28, 0xe4, 0x88, 0x28, 0x6c, 0x11, 0x56, 0x9f, 0xb0,
+    0xd3, 0x02, 0xdb, 0xce, 0xb9, 0x3f, 0x95, 0x40, 0x52, 0x54, 0x41, 0xb9,
+    0xee, 0xe4, 0x74, 0x9d, 0xe0, 0x07, 0x81, 0xf1, 0xfa, 0x18, 0x0c, 0x96,
+    0x39, 0x88, 0xad, 0x88, 0x24, 0x4a, 0xb5, 0xa4, 0xa7, 0x65, 0x95, 0x89,
+    0x48, 0x57, 0xfa, 0xad, 0x7d, 0x7a, 0x85, 0xdc, 0x24, 0x85, 0x3e, 0xe2,
+    0x35, 0x85, 0x52, 0xb6, 0x99, 0x59, 0x19, 0x9e, 0x59, 0xb2, 0xce, 0x0f,
+    0xbf, 0x28, 0x68, 0x25, 0xbd, 0xec, 0x13, 0x61, 0xe0, 0x01, 0x47, 0xa1,
+    0x76, 0x2e, 0x3e, 0x14, 0x0f, 0xa7, 0x70, 0x60, 0xe0, 0x45, 0x56, 0x57,
+    0x52, 0x74, 0x3b, 0x49, 0x4a, 0xbc, 0xd6, 0x79, 0x31, 0x92, 0xf0, 0x83,
+    0xc6, 0x64, 0x8b, 0xdf, 0xe5, 0xb4, 0x24, 0x02, 0x3b, 0xd3, 0xd5, 0xe9,
+    0x4f, 0xbb, 0x2f, 0xa9, 0x4c, 0x75, 0x72, 0xee, 0xea, 0xf7, 0x8e, 0xf2,
+    0x18, 0xa3, 0x8d, 0x56, 0xc8, 0x83, 0x7c, 0x53, 0x38, 0xe1, 0xd0, 0x28,
+    0x7d, 0xc1, 0x65, 0x99, 0x39, 0x58, 0x36, 0xa3, 0x66, 0x71, 0x4c, 0x28,
+    0xcb, 0x9f, 0xb5, 0x58, 0x4b, 0xa3, 0x5c, 0x4e, 0xf9, 0x8d, 0x5b, 0x0c,
+    0xf1, 0x32, 0xbb, 0xe3, 0xb4, 0x47, 0xe8, 0x1c, 0x9e, 0xbb, 0x0a, 0x53,
+    0x3b, 0xb7, 0x51, 0xd6, 0x15, 0x77, 0x89, 0xf0, 0xb5, 0xba, 0x71, 0x84,
+    0x16, 0x81, 0xb0, 0xdf, 0x67, 0x12, 0x9f, 0xe7, 0x43, 0x70, 0x3a, 0xb1,
+    0xdc, 0x40, 0x31, 0xe7, 0xdd, 0x6b, 0x74, 0xfc, 0x18, 0x7d, 0x0d, 0xba,
+    0xda, 0x67, 0x66, 0x56, 0x43, 0x42, 0x80, 0xc6, 0x7c, 0xb3, 0x6c, 0x89,
+    0x2e, 0xc7, 0x0d, 0x97, 0x8a, 0xbe, 0x1a, 0x36, 0x05, 0x10, 0x85, 0x96,
+    0xa8, 0xbd, 0x29, 0x85, 0x52, 0xdc, 0xa3, 0x92, 0x20, 0xa1, 0xb0, 0x45,
+    0x5a, 0x7e, 0xc3, 0x4c, 0x0b, 0x6f, 0x3a, 0xe4, 0xfe, 0x55, 0x01, 0x49,
+    0x51, 0x06, 0xe7, 0xbb, 0x91, 0xd2, 0x77, 0x80, 0x1e, 0x07, 0xc7, 0xe8,
+    0x60, 0x32, 0x58, 0xe6, 0x22, 0xb6, 0x20, 0x91, 0x2a, 0xd6, 0x92, 0x9d,
+    0x96, 0x56, 0x25, 0x21, 0x5f, 0xea, 0xb5, 0xf5, 0xea, 0x17, 0x70, 0x92,
+    0x14, 0xfb, 0x88, 0xd6, 0x15, 0x4a, 0xda, 0x65, 0x64, 0x66, 0x79, 0x66,
+    0xcb, 0x38, 0x3e, 0xfc, 0xa1, 0xa0, 0x96, 0xf7, 0xb0, 0x4d, 0x87, 0x80,
+    0x05, 0x1e, 0x85, 0xd8, 0xb8, 0xf8, 0x50, 0x3e, 0x9d, 0xc1, 0x83, 0x81,
+    0x15, 0x59, 0x5d, 0x49, 0xd0, 0xed, 0x25, 0x2a, 0xf3, 0x59, 0xe4, 0xc6,
+    0x4b, 0xc2, 0x0f, 0x19, 0x92, 0x2f, 0x7f, 0x96, 0xd0, 0x90, 0x08, 0xef,
+    0x4f, 0x57, 0xa5, 0x3e, 0xec, 0xbe, 0xa5, 0x31, 0xd5, 0xcb, 0xbb, 0xab,
+    0xde, 0x3b, 0xc8, 0x62, 0x8e, 0x35, 0x5b, 0x22, 0x0d, 0xf1, 0x4c, 0xe3,
+    0x87, 0x40, 0xa1, 0xf7, 0x05, 0x96, 0x64, 0xe5, 0x60, 0xda, 0x8d, 0x99,
+    0xc5, 0x30, 0xa3, 0x2e, 0x7e, 0xd5, 0x61, 0x2e, 0x8d, 0x71, 0x3b, 0xe6,
+    0x35, 0x6c, 0x33, 0xc4, 0xca, 0xef, 0x8e, 0xd1, 0x1f, 0xa0, 0x72, 0x7a,
+    0xec, 0x29, 0x4c, 0xee, 0xdd, 0x47, 0x58, 0x55, 0xde, 0x27, 0xc2, 0xd6,
+    0xe9, 0xc6, 0x10, 0x5a, 0x06, 0xc3, 0x7d, 0x9c, 0x4a, 0x7f, 0x9d, 0x0d,
+    0xc0, 0xea, 0xc7, 0x71, 0x00, 0xc7, 0x9f, 0x75, 0xad, 0xd3, 0xf0, 0x61,
+    0xf4, 0x36, 0xeb, 0x69, 0x9d, 0x99, 0x59, 0x0d, 0x0a, 0x03, 0x19, 0xf2,
+    0xcd, 0xb2, 0x24, 0xbb, 0x1c, 0x36, 0x5e, 0x2a, 0xf8, 0x68, 0xd8, 0x14,
+    0x42, 0x16, 0x5a, 0xa2, 0xf4, 0xa6, 0x15, 0x4b, 0x72, 0x8e, 0x48, 0x82,
+    0x86, 0xc1, 0x15, 0x69, 0xfb, 0x0d, 0x30, 0x2d, 0xbc, 0xeb, 0x93, 0xf9,
+    0x54, 0x05, 0x25, 0x44, 0x1b, 0x9e, 0xee, 0x47, 0x49, 0xde, 0x00, 0x78,
+    0x1f, 0x1f, 0xa1, 0x80, 0xc9, 0x63, 0x98, 0x8a, 0xd8, 0x82, 0x44, 0xab,
+    0x5a, 0x4a, 0x76, 0x59, 0x58, 0x94, 0x85, 0x7f, 0xaa, 0xd7, 0xd7, 0xa8,
+    0x5d, 0xc2, 0x48, 0x53, 0xee, 0x23, 0x58, 0x55, 0x2b, 0x69, 0x95, 0x91,
+    0x99, 0xe5, 0x9b, 0x2c, 0xe0, 0xfb, 0xf2, 0x86, 0x82, 0x5b, 0xde, 0xc1,
+    0x36, 0x1e, 0x00, 0x14, 0x7a, 0x17, 0x62, 0xe3, 0xe1, 0x40, 0xfa, 0x77,
+    0x06, 0x0e, 0x04, 0x55, 0x65, 0x75, 0x27, 0x43, 0xb4, 0x94, 0xab, 0xcd,
+    0x67, 0x93, 0x19, 0x2f, 0x08, 0x3c, 0x66, 0x48, 0xbd, 0xfe, 0x5b, 0x42,
+    0x40, 0x23, 0xbd, 0x3d, 0x5e, 0x94, 0xfb, 0xb2, 0xfa, 0x94, 0xc7, 0x57,
+    0x2e, 0xee, 0xaf, 0x78, 0xef, 0x21, 0x8a, 0x38, 0xd5, 0x6c, 0x88, 0x37,
+    0xc5, 0x33, 0x8e, 0x1d, 0x02, 0x87, 0xdc, 0x16, 0x59, 0x93, 0x95, 0x83,
+    0x6a, 0x36, 0x67, 0x14, 0xc2, 0x8c, 0xb9, 0xfb, 0x55, 0x84, 0xba, 0x35,
+    0xc4, 0xef, 0x98, 0xd5, 0xb0, 0xcf, 0x13, 0x2b, 0xbe, 0x3b, 0x44, 0x7e,
+    0x81, 0xca,
+};
+static_assert(sizeof(kBytesTestReadSymbol13) == kNumBytesTestReadSymbol13, "");
+
+// The kBytesTestReadSymbol14[] array was encoded by using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][15] = {
+//   // pmf: 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14,
+//   // 1/14, 1/14, 1/14
+//   { 32768 - 2341, 32768 - 4681, 32768 - 7022, 32768 - 9362, 32768 - 11703,
+//     32768 - 14043, 32768 - 16384, 32768 - 18725, 32768 - 21065,
+//     32768 - 23406, 32768 - 25746, 32768 - 28087, 32768 - 30427, 0, 0 },
+//   // pmf: 3/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28,
+//   // 2/28, 2/28, 1/28
+//   { 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533, 32768 - 12873,
+//     32768 - 15214, 32768 - 17554, 32768 - 19895, 32768 - 22235,
+//     32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0 },
+//   // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28,
+//   // 2/28, 2/28, 3/28
+//   { 32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533,
+//     32768 - 12873, 32768 - 15214, 32768 - 17554, 32768 - 19895,
+//     32768 - 22235, 32768 - 24576, 32768 - 26917, 32768 - 29257, 0, 0 },
+//   // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 3/28, 3/28, 2/28, 2/28, 2/28,
+//   // 2/28, 2/28, 1/28
+//   { 32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533,
+//     32768 - 12873, 32768 - 16384, 32768 - 19895, 32768 - 22235,
+//     32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0 },
+// };
+// constexpr int kSymbols[28][4] = { { 0, 7, 13, 6 },    //
+//                                   { 1, 8, 12, 5 },    //
+//                                   { 2, 9, 11, 4 },    //
+//                                   { 3, 10, 10, 3 },   //
+//                                   { 4, 11, 9, 2 },    //
+//                                   { 5, 12, 8, 1 },    //
+//                                   { 6, 13, 7, 0 },    //
+//                                   { 7, 0, 6, 13 },    //
+//                                   { 8, 1, 5, 12 },    //
+//                                   { 9, 2, 4, 11 },    //
+//                                   { 10, 3, 3, 10 },   //
+//                                   { 11, 4, 2, 9 },    //
+//                                   { 12, 5, 1, 8 },    //
+//                                   { 13, 6, 0, 7 },    //
+//                                   { 0, 0, 13, 11 },   //
+//                                   { 2, 1, 12, 9 },    //
+//                                   { 4, 3, 10, 7 },    //
+//                                   { 6, 5, 8, 5 },     //
+//                                   { 8, 7, 6, 3 },     //
+//                                   { 10, 9, 4, 1 },    //
+//                                   { 12, 11, 2, 12 },  //
+//                                   { 1, 0, 13, 10 },   //
+//                                   { 3, 2, 11, 8 },    //
+//                                   { 5, 4, 9, 6 },     //
+//                                   { 7, 6, 7, 4 },     //
+//                                   { 9, 8, 5, 2 },     //
+//                                   { 11, 10, 3, 7 },   //
+//                                   { 13, 12, 1, 6 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 64; ++i) {
+//   for (int j = 0; j < 28; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 14);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n    ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
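+//
+// For reference, a decoding-side sketch (hedged: the libgav1::EntropyDecoder
+// class name, its (data, size, allow_update_cdf) constructor, and the
+// ReadSymbol(cdf, symbol_count) signature are assumptions about libgav1's
+// entropy decoder API, not code copied from this test):
+//
+// libgav1::EntropyDecoder reader(kBytesTestReadSymbol14,
+//                                kNumBytesTestReadSymbol14,
+//                                /*allow_update_cdf=*/true);
+// uint16_t cdf[4][15] = { /* same initial values as the encoder's cdf */ };
+// for (int i = 0; i < 64; ++i) {
+//   for (int j = 0; j < 28; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       // With allow_update_cdf enabled on both sides, the decoder adapts
+//       // cdf[k] in lockstep with the encoder, so every read must return
+//       // exactly the symbol that aom_write_symbol() wrote.
+//       assert(reader.ReadSymbol(cdf[k], 14) == kSymbols[j][k]);
+//     }
+//   }
+// }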
+
+constexpr size_t kNumBytesTestReadSymbol14 = 3455;
+constexpr uint8_t kBytesTestReadSymbol14[] = {
+    0x0a, 0xef, 0xeb, 0xb5, 0x78, 0x91, 0x0b, 0x9d, 0xee, 0x99, 0x14, 0x9c,
+    0xf4, 0x58, 0x86, 0xe8, 0x69, 0x7f, 0x06, 0x07, 0x60, 0xb0, 0x79, 0xbe,
+    0xea, 0xe5, 0x69, 0x1c, 0x67, 0x7a, 0x75, 0x91, 0x2f, 0x1d, 0x49, 0x4e,
+    0x15, 0x40, 0x56, 0x15, 0xa1, 0xff, 0x72, 0x2d, 0xa5, 0x40, 0x81, 0x21,
+    0x3d, 0x06, 0x78, 0xd2, 0x62, 0x8a, 0xf2, 0x63, 0x50, 0x9d, 0xbd, 0xa0,
+    0xd4, 0x14, 0x42, 0x76, 0x4f, 0x44, 0xbe, 0xb2, 0xa1, 0x0d, 0x4c, 0x75,
+    0xe4, 0x4a, 0xed, 0xf9, 0x7e, 0xb8, 0x7b, 0x5a, 0x26, 0x78, 0x5f, 0xe3,
+    0x86, 0x72, 0x64, 0x48, 0x76, 0x51, 0x7a, 0x77, 0x3b, 0xcf, 0xa2, 0x8d,
+    0x31, 0xec, 0xc1, 0xa7, 0xf9, 0x9a, 0x76, 0x00, 0x7c, 0x17, 0x40, 0x03,
+    0x12, 0xe8, 0xed, 0xbf, 0x39, 0xe2, 0xdd, 0x6d, 0xdc, 0xe2, 0x34, 0xdf,
+    0x0d, 0xa6, 0x86, 0x22, 0xca, 0x86, 0x5f, 0x57, 0x25, 0xc6, 0x57, 0x60,
+    0xc3, 0x06, 0xe9, 0xf0, 0x06, 0xd4, 0xc0, 0xb3, 0xfc, 0x5b, 0xcd, 0xa9,
+    0xc0, 0x51, 0x6e, 0x10, 0x0a, 0x5a, 0xfd, 0xbf, 0x92, 0xc8, 0x21, 0x0e,
+    0x83, 0x74, 0xfe, 0x01, 0xec, 0x24, 0x61, 0x9d, 0x9e, 0xb8, 0xb2, 0x04,
+    0xa7, 0xe9, 0xd6, 0xc7, 0x79, 0x5b, 0xaa, 0xdd, 0x94, 0x5d, 0x26, 0x61,
+    0x0b, 0xee, 0x66, 0xf4, 0xb2, 0xd1, 0x9b, 0xf0, 0xb4, 0x9b, 0x50, 0x4c,
+    0x4a, 0x57, 0xbc, 0xfe, 0x7e, 0xca, 0xfe, 0xa8, 0x22, 0x1b, 0x2f, 0x4a,
+    0x26, 0x32, 0x96, 0xfd, 0x03, 0x02, 0x1b, 0x7c, 0x1d, 0x6d, 0x42, 0x48,
+    0x2b, 0x11, 0x0d, 0x8f, 0x40, 0xb8, 0x15, 0xf1, 0xdd, 0x06, 0xf7, 0xa0,
+    0x1f, 0x0f, 0x75, 0xb1, 0x53, 0x73, 0x1f, 0xbf, 0x97, 0xf7, 0xa0, 0xcb,
+    0x5b, 0x98, 0xb7, 0x50, 0xa7, 0xc5, 0x23, 0x9b, 0x16, 0x0a, 0x2e, 0x03,
+    0x68, 0x3a, 0x92, 0x75, 0xb8, 0xb0, 0xd8, 0xda, 0x2e, 0x82, 0x61, 0x3f,
+    0xa0, 0x6e, 0x78, 0xe5, 0x7d, 0x14, 0xe5, 0x1f, 0x7b, 0xec, 0xb5, 0x14,
+    0xb7, 0xa0, 0x72, 0xdc, 0x1a, 0x23, 0xa4, 0x5b, 0xc5, 0xc2, 0x75, 0x6a,
+    0x7c, 0x36, 0xef, 0xf0, 0xd1, 0x5a, 0x34, 0x31, 0x0b, 0xae, 0x4c, 0x07,
+    0xc2, 0xb7, 0xab, 0xd5, 0x67, 0xed, 0x65, 0x5e, 0xa0, 0x7e, 0x16, 0x04,
+    0xc6, 0x1b, 0x74, 0x0f, 0xa9, 0x35, 0xe8, 0x71, 0x83, 0xca, 0xc3, 0x21,
+    0x74, 0xf5, 0xee, 0x71, 0xd1, 0x4c, 0xa2, 0x1d, 0xce, 0x16, 0x4b, 0x9b,
+    0xb0, 0x9f, 0x42, 0x08, 0x49, 0x6a, 0x82, 0x66, 0xe8, 0xb2, 0xce, 0xfd,
+    0x8e, 0xdb, 0x9e, 0x9e, 0xeb, 0x4b, 0x3d, 0xbb, 0xab, 0x61, 0xe4, 0x0d,
+    0x87, 0x8e, 0xe9, 0x7b, 0xe8, 0x57, 0x70, 0x8c, 0xab, 0x0c, 0x0f, 0x05,
+    0x4b, 0xca, 0x6d, 0xe7, 0x94, 0x2b, 0x29, 0x28, 0xfd, 0xfa, 0x11, 0x4c,
+    0x08, 0x51, 0xce, 0x45, 0x70, 0x87, 0x2b, 0xcf, 0x88, 0x80, 0x87, 0x38,
+    0x80, 0x5d, 0x2e, 0x8f, 0x47, 0xd8, 0x5e, 0x75, 0x66, 0xa7, 0x86, 0x5e,
+    0x98, 0xd4, 0x1b, 0x00, 0x11, 0xcf, 0x7b, 0xef, 0x8b, 0x17, 0x93, 0xe0,
+    0x3a, 0x90, 0x7d, 0x0b, 0x45, 0x34, 0x2a, 0x67, 0xa4, 0x0e, 0xab, 0xc3,
+    0x3b, 0x27, 0x68, 0x03, 0x4d, 0xcb, 0xd5, 0x87, 0x53, 0x37, 0xe5, 0xcc,
+    0xc3, 0x73, 0x4a, 0x2c, 0x5f, 0xdc, 0x8d, 0xba, 0x6c, 0x11, 0xa0, 0x35,
+    0xc6, 0xbe, 0xd9, 0xd6, 0x64, 0x2e, 0x4b, 0x85, 0xbf, 0x50, 0xdd, 0xa6,
+    0xa0, 0xa4, 0x23, 0xd7, 0x82, 0xb6, 0x65, 0x4e, 0xa8, 0xd4, 0x19, 0xa1,
+    0xe4, 0xc8, 0x4d, 0x69, 0x2a, 0x41, 0x4f, 0x1e, 0x46, 0xb1, 0xde, 0x64,
+    0x0b, 0xf8, 0x62, 0xfe, 0x27, 0xc5, 0x2e, 0x31, 0x0f, 0x40, 0xae, 0x64,
+    0x86, 0x2a, 0x36, 0x7e, 0x03, 0x01, 0x37, 0xf3, 0x36, 0x42, 0x3f, 0xaa,
+    0x0b, 0xdd, 0xa9, 0x3e, 0x09, 0xe2, 0xe9, 0xea, 0x15, 0x5b, 0x0d, 0x4b,
+    0xcc, 0x47, 0xa5, 0x24, 0xed, 0x0b, 0x3c, 0xb3, 0x6e, 0xc6, 0x1d, 0x47,
+    0x39, 0x30, 0xe6, 0xf6, 0xc7, 0xae, 0x6b, 0x25, 0x09, 0xce, 0xf2, 0x2f,
+    0xaf, 0x4d, 0x32, 0xac, 0x4f, 0xa4, 0xff, 0x39, 0x48, 0xbb, 0xe6, 0xdf,
+    0x93, 0x41, 0x00, 0x2a, 0x82, 0xd9, 0x81, 0x79, 0xc4, 0x65, 0xf3, 0x62,
+    0x17, 0x18, 0x37, 0xcf, 0xa0, 0xaa, 0xe5, 0xc6, 0x97, 0x84, 0x14, 0x1c,
+    0x7e, 0x36, 0x72, 0xe2, 0x35, 0x84, 0x39, 0x43, 0x7b, 0xbf, 0xaf, 0x94,
+    0x9a, 0xa2, 0xeb, 0xf9, 0xc4, 0x5c, 0x49, 0x5a, 0xef, 0x6b, 0xe6, 0x19,
+    0x0e, 0xac, 0x08, 0x43, 0x4d, 0x5a, 0x14, 0x7e, 0x27, 0x4a, 0xd1, 0x4a,
+    0x9b, 0x3f, 0xdc, 0x98, 0x5a, 0xcb, 0x40, 0x90, 0xdf, 0x56, 0xa1, 0x76,
+    0x12, 0x71, 0xe1, 0x20, 0x5e, 0xf1, 0xaa, 0xd7, 0xba, 0x6c, 0xfb, 0x1d,
+    0x20, 0xfe, 0xa0, 0x41, 0x65, 0x09, 0x5f, 0x8b, 0xde, 0x20, 0xb7, 0x26,
+    0xd5, 0xce, 0x83, 0x14, 0x0d, 0x28, 0x36, 0x86, 0xe1, 0x02, 0x86, 0xde,
+    0xf3, 0xc6, 0x44, 0x10, 0x04, 0x84, 0x9f, 0x18, 0x9b, 0xf1, 0x0a, 0xca,
+    0x41, 0x53, 0xa9, 0xa9, 0x6b, 0xa5, 0x95, 0x22, 0x1d, 0x17, 0x3b, 0xc0,
+    0x5f, 0xb7, 0x5e, 0xac, 0x73, 0x4e, 0x76, 0xaf, 0x4c, 0xb4, 0x4f, 0xf6,
+    0x3f, 0xa1, 0x20, 0x2e, 0xf7, 0xa8, 0x14, 0x0d, 0xc3, 0x50, 0x97, 0x25,
+    0xe0, 0xc4, 0x5c, 0x3e, 0xe6, 0xbe, 0xe9, 0xa4, 0x1e, 0x1d, 0xdb, 0x06,
+    0xc1, 0x15, 0xf2, 0x6d, 0xbf, 0x71, 0xf2, 0x0b, 0xd9, 0x75, 0x4b, 0x38,
+    0xf5, 0xe2, 0x69, 0x0d, 0x93, 0xa5, 0x8e, 0x4c, 0xc5, 0x2a, 0xb6, 0x45,
+    0x60, 0x77, 0xd6, 0x14, 0x39, 0x5e, 0x70, 0x9e, 0x8d, 0x07, 0x20, 0x1c,
+    0x05, 0xc9, 0xb0, 0x46, 0xf7, 0x6c, 0x3e, 0xf8, 0xf8, 0x0a, 0xad, 0x0b,
+    0x22, 0x5e, 0x32, 0xbd, 0x46, 0xbc, 0x06, 0x7b, 0x92, 0x36, 0x5a, 0x2b,
+    0xac, 0x68, 0x2d, 0x5a, 0xf4, 0xc2, 0x61, 0xe3, 0x9d, 0xf4, 0x5d, 0x59,
+    0x59, 0x98, 0xb7, 0x5a, 0x73, 0x08, 0xf6, 0x4f, 0x0a, 0x75, 0x04, 0x93,
+    0xc1, 0xe1, 0x9b, 0xe0, 0xb0, 0x2a, 0xf7, 0xdd, 0x8b, 0xae, 0xf5, 0x55,
+    0x28, 0x6b, 0x21, 0x9b, 0x02, 0x43, 0xbd, 0x36, 0x4d, 0xa5, 0x17, 0xbb,
+    0x97, 0xd4, 0x78, 0x1f, 0xe8, 0xd9, 0x98, 0x0e, 0x41, 0x96, 0x52, 0xab,
+    0xad, 0x91, 0x92, 0xae, 0x62, 0x5c, 0xe7, 0xeb, 0x24, 0x1b, 0xe8, 0x2a,
+    0xb2, 0xe8, 0xdc, 0x34, 0x7f, 0xe9, 0xa1, 0x4c, 0x4c, 0x13, 0xeb, 0x31,
+    0x29, 0xc3, 0xc4, 0xf5, 0xb4, 0x50, 0xb1, 0x8b, 0x08, 0xc3, 0x30, 0xf8,
+    0x40, 0xd8, 0x76, 0xd5, 0x4d, 0xf0, 0xc2, 0xd8, 0x67, 0x75, 0x01, 0x81,
+    0x2a, 0xe0, 0x6b, 0xc0, 0xf5, 0x30, 0x55, 0xb6, 0xa9, 0x52, 0x19, 0xc4,
+    0x73, 0x78, 0xc4, 0x9e, 0x13, 0x5f, 0xa7, 0x56, 0xb4, 0x07, 0x2c, 0x92,
+    0x85, 0x66, 0x5d, 0x00, 0x47, 0x32, 0x3c, 0x8b, 0xbf, 0x86, 0x9e, 0xe2,
+    0xfd, 0xf1, 0xf0, 0x15, 0x5a, 0x16, 0x44, 0xbc, 0x65, 0x7a, 0x8d, 0x78,
+    0x0c, 0xf9, 0x94, 0x1d, 0x83, 0x7c, 0xee, 0xc7, 0x71, 0x23, 0x42, 0x2d,
+    0xb3, 0xe4, 0x68, 0x31, 0xec, 0x17, 0x63, 0x27, 0xe3, 0x52, 0x9d, 0xd0,
+    0xcd, 0xd8, 0xd8, 0x86, 0xb4, 0x91, 0x8a, 0xa3, 0xcb, 0xa3, 0x76, 0xc7,
+    0x98, 0xda, 0xd6, 0xb8, 0x34, 0x1c, 0xf6, 0x72, 0x23, 0xd8, 0x1b, 0xbe,
+    0x2d, 0x05, 0xe1, 0x83, 0x01, 0x74, 0xc7, 0xe3, 0x54, 0x85, 0xec, 0xec,
+    0xfb, 0x3a, 0xa2, 0xf3, 0x21, 0x7a, 0x0b, 0x68, 0x91, 0x02, 0xd2, 0xa4,
+    0x40, 0x21, 0xef, 0x4f, 0xe5, 0x3d, 0x6d, 0x6e, 0xfb, 0xba, 0xb1, 0x90,
+    0x4f, 0x81, 0x07, 0x27, 0x5e, 0xa8, 0xab, 0xa8, 0x87, 0x38, 0x3c, 0xe5,
+    0x48, 0x29, 0x9e, 0x77, 0x4c, 0xb4, 0x9d, 0x91, 0x2d, 0x8a, 0x0a, 0x84,
+    0xdd, 0x93, 0x95, 0xdf, 0xd4, 0xa3, 0x8f, 0xb7, 0xaf, 0x07, 0xd3, 0x81,
+    0xbb, 0x0d, 0x89, 0x42, 0x92, 0x0b, 0x66, 0x39, 0x8b, 0x99, 0x36, 0x61,
+    0xbb, 0xe1, 0x05, 0xca, 0x68, 0xc8, 0x0f, 0xae, 0x9e, 0x7d, 0x75, 0x7f,
+    0x24, 0xef, 0xdc, 0x97, 0x8d, 0xb9, 0xa5, 0x7a, 0x3c, 0xc4, 0x49, 0x79,
+    0x47, 0x47, 0x61, 0x88, 0xaf, 0x96, 0x08, 0x11, 0x22, 0xff, 0xb7, 0x14,
+    0x12, 0x15, 0x14, 0x26, 0xa3, 0x03, 0x0e, 0xb2, 0xff, 0x57, 0x9e, 0xc0,
+    0x92, 0x4f, 0x4c, 0x69, 0xd4, 0xfe, 0xc1, 0x46, 0xc4, 0xe8, 0x64, 0x7f,
+    0x08, 0x38, 0x90, 0x15, 0x8f, 0xc2, 0xc8, 0xa8, 0x50, 0x7f, 0x74, 0x4a,
+    0xc3, 0x37, 0x52, 0x44, 0x25, 0x78, 0x19, 0x48, 0x00, 0xd1, 0x39, 0x43,
+    0x3a, 0x14, 0x72, 0x8c, 0x8e, 0xa2, 0xf8, 0x95, 0x1e, 0x56, 0x07, 0xdd,
+    0xcd, 0x89, 0xde, 0x71, 0xc3, 0x85, 0xc3, 0xcf, 0xe4, 0x6c, 0xf4, 0x43,
+    0x95, 0x49, 0x27, 0x25, 0x35, 0x1a, 0xb9, 0xf7, 0xc8, 0x20, 0xeb, 0x01,
+    0xbb, 0x49, 0x8d, 0xf4, 0xc0, 0x32, 0xbe, 0x74, 0x42, 0x07, 0x53, 0xd0,
+    0xf4, 0x4c, 0x79, 0xa8, 0xb7, 0xf9, 0x09, 0xfd, 0xeb, 0x02, 0x83, 0x26,
+    0x3b, 0x88, 0x1a, 0x41, 0x70, 0x95, 0x2f, 0x53, 0xc1, 0xc1, 0xa5, 0xbe,
+    0x23, 0x32, 0x8b, 0x48, 0xb8, 0xff, 0x4c, 0x6b, 0x6e, 0xbf, 0xd7, 0xe0,
+    0xf1, 0x3a, 0xfd, 0xd2, 0x1e, 0xa2, 0x11, 0x50, 0xa0, 0xfe, 0xd2, 0x3d,
+    0x20, 0xa6, 0x79, 0xdd, 0x32, 0xd2, 0x76, 0x44, 0xb6, 0x28, 0x2a, 0x13,
+    0x76, 0x4e, 0x57, 0x92, 0xa5, 0x01, 0x64, 0x30, 0x06, 0xf1, 0xba, 0x62,
+    0x5a, 0x59, 0xab, 0xf2, 0x15, 0xef, 0x3c, 0x24, 0x96, 0x14, 0x6f, 0xd4,
+    0x51, 0xee, 0x6d, 0xeb, 0x77, 0xad, 0xba, 0x03, 0xe0, 0xd2, 0x30, 0xbd,
+    0xbf, 0x06, 0x14, 0xa3, 0xad, 0xd7, 0x97, 0x20, 0x89, 0x63, 0x8f, 0x84,
+    0x0d, 0x87, 0x6d, 0x5b, 0xdf, 0x0c, 0x2d, 0x86, 0x77, 0x6b, 0x73, 0xd6,
+    0x34, 0x83, 0xe5, 0x15, 0x88, 0x3e, 0xbc, 0x4d, 0x2c, 0x96, 0xd1, 0x1a,
+    0x81, 0xf1, 0xb4, 0x6c, 0xaa, 0x52, 0x3a, 0x53, 0x52, 0xc6, 0x73, 0x1b,
+    0xe6, 0xaa, 0xd5, 0xc8, 0x91, 0xee, 0x72, 0xad, 0x66, 0x25, 0x61, 0xbd,
+    0xa7, 0x15, 0x46, 0x5d, 0x76, 0x4a, 0x47, 0x9b, 0x03, 0x44, 0xe5, 0x0c,
+    0xe8, 0x51, 0xca, 0x32, 0x3a, 0x8b, 0xe2, 0x54, 0x79, 0x4d, 0x51, 0x4e,
+    0xbb, 0x44, 0x2c, 0x30, 0xd1, 0xe6, 0xa1, 0xc9, 0x2c, 0x28, 0xdf, 0xa8,
+    0xa3, 0xdc, 0xdb, 0xd6, 0xef, 0x5b, 0x74, 0x07, 0xc1, 0xa4, 0x55, 0x37,
+    0xc6, 0xfc, 0xde, 0xf2, 0x35, 0xb3, 0xf2, 0x3f, 0xe8, 0x0c, 0xbe, 0x60,
+    0x72, 0x56, 0xde, 0x5f, 0x0d, 0xdd, 0x2e, 0x67, 0x63, 0x31, 0x23, 0xbc,
+    0xbe, 0x8d, 0x47, 0xdd, 0xa0, 0x38, 0xab, 0x04, 0xd7, 0xb7, 0x07, 0xf9,
+    0x5d, 0x5e, 0x27, 0xd0, 0x6e, 0xda, 0x01, 0xda, 0x8b, 0x3d, 0xe9, 0x89,
+    0xe4, 0xbb, 0xeb, 0x3d, 0xd2, 0xb1, 0x16, 0x16, 0xe6, 0x49, 0xb6, 0x28,
+    0x02, 0xc3, 0xd0, 0x57, 0x17, 0x4f, 0x2a, 0x9b, 0x42, 0x74, 0x1d, 0x38,
+    0xc4, 0x19, 0xdd, 0xad, 0xcf, 0x58, 0xd2, 0x0f, 0x94, 0x56, 0x20, 0xfa,
+    0xf1, 0x34, 0xb2, 0x5b, 0x44, 0x6a, 0x07, 0xc6, 0xd1, 0xb2, 0xa9, 0x48,
+    0xe9, 0x4d, 0x4b, 0x19, 0xcc, 0x6f, 0x9a, 0xab, 0x57, 0x22, 0x47, 0xb9,
+    0xca, 0xb5, 0x98, 0x88, 0x58, 0x15, 0xe1, 0x37, 0x7b, 0x18, 0xdc, 0xea,
+    0x45, 0xad, 0xc7, 0xc3, 0xb4, 0xeb, 0xcb, 0x85, 0x2c, 0x31, 0xa6, 0x5e,
+    0x6a, 0x9d, 0xb6, 0x45, 0x19, 0x42, 0x5a, 0x2d, 0xe7, 0x15, 0x99, 0x8d,
+    0xe5, 0x5b, 0x09, 0x52, 0x8e, 0x4d, 0xf1, 0xec, 0xb3, 0xb1, 0xf5, 0xfe,
+    0x79, 0xb0, 0x4a, 0x4f, 0xb6, 0xbe, 0x18, 0x84, 0xe6, 0xaa, 0xb0, 0xe5,
+    0x76, 0x3c, 0x35, 0x51, 0xd2, 0xa6, 0xf3, 0xfb, 0xe3, 0x1b, 0xf5, 0xc4,
+    0x4f, 0x56, 0x3a, 0xc7, 0x41, 0x8d, 0xd7, 0x9e, 0x1e, 0xc9, 0x9c, 0xd8,
+    0xd4, 0xe3, 0x4f, 0xb5, 0xfd, 0x78, 0x5e, 0x60, 0xff, 0xd3, 0xdc, 0x00,
+    0xd6, 0x02, 0xba, 0x09, 0x8b, 0x93, 0xc9, 0xb4, 0x8e, 0x4e, 0x21, 0x27,
+    0x5e, 0x89, 0x6c, 0x31, 0x79, 0xfc, 0xf0, 0xd8, 0xac, 0x48, 0x52, 0x7d,
+    0xae, 0xc8, 0x4b, 0xef, 0x06, 0xde, 0xa4, 0xd3, 0x01, 0x46, 0xb2, 0xd6,
+    0x28, 0x45, 0xd9, 0xcb, 0x63, 0x32, 0x19, 0x3e, 0xbf, 0x13, 0x99, 0x7f,
+    0xdd, 0x0b, 0x25, 0x72, 0x57, 0x7a, 0x89, 0x68, 0xa4, 0xde, 0x98, 0xfc,
+    0xa8, 0xbc, 0xf2, 0xc1, 0x82, 0x28, 0x59, 0xf7, 0x6b, 0x83, 0x60, 0x57,
+    0x84, 0xdd, 0xec, 0x63, 0x73, 0xa9, 0x16, 0xb7, 0x1f, 0x0e, 0xd3, 0xaf,
+    0x2e, 0x14, 0xb0, 0xc6, 0x99, 0x79, 0xaa, 0x76, 0xd9, 0x14, 0x65, 0x09,
+    0x68, 0xb7, 0x9c, 0x56, 0x66, 0x37, 0x95, 0x6c, 0x25, 0x4a, 0x39, 0x37,
+    0xc7, 0xb2, 0xce, 0xc7, 0xd7, 0xf9, 0xe6, 0xc1, 0x29, 0x3e, 0xda, 0xf8,
+    0x62, 0x13, 0x9a, 0xaa, 0xc3, 0x95, 0xd8, 0xf0, 0xd5, 0x47, 0x4a, 0x9b,
+    0xcf, 0xef, 0x8c, 0x6f, 0xd7, 0x11, 0x3d, 0x58, 0xeb, 0x1d, 0x06, 0x37,
+    0x5e, 0x78, 0x7b, 0x26, 0x73, 0x63, 0x53, 0x8d, 0x3e, 0xd7, 0xf5, 0xe1,
+    0x79, 0x83, 0xff, 0x4f, 0x70, 0x03, 0x58, 0x0a, 0xe8, 0x26, 0x2e, 0x4f,
+    0x26, 0xd2, 0x39, 0x38, 0x84, 0x9d, 0x7a, 0x25, 0xb0, 0xc5, 0xe7, 0xf3,
+    0xc3, 0x62, 0xb1, 0x21, 0x49, 0xf6, 0xbb, 0x21, 0x2f, 0xbc, 0x1b, 0x7a,
+    0x93, 0x4c, 0x05, 0x1a, 0xcb, 0x58, 0xa1, 0x17, 0x67, 0x2d, 0x8c, 0xc8,
+    0x64, 0xfa, 0xfc, 0x4e, 0x65, 0xff, 0x74, 0x2c, 0x95, 0xc9, 0x5d, 0xea,
+    0x25, 0xa2, 0x93, 0x7a, 0x63, 0xf2, 0xa2, 0xf3, 0xcb, 0x06, 0x08, 0xa1,
+    0x67, 0xdd, 0xae, 0x0d, 0x81, 0x5e, 0x13, 0x77, 0xb1, 0x8d, 0xce, 0xa4,
+    0x5a, 0xdc, 0x7c, 0x3b, 0x4e, 0xbc, 0xb8, 0x52, 0xc3, 0x1a, 0x65, 0xe6,
+    0xa9, 0xdb, 0x64, 0x51, 0x94, 0x25, 0xa2, 0xde, 0x71, 0x59, 0x98, 0xde,
+    0x55, 0xb0, 0x95, 0x28, 0xe4, 0xdf, 0x1e, 0xcb, 0x3b, 0x1f, 0x5f, 0xe7,
+    0x9b, 0x04, 0xa4, 0xfb, 0x6b, 0xe1, 0x88, 0x4e, 0x6a, 0xab, 0x0e, 0x57,
+    0x63, 0xc3, 0x55, 0x1d, 0x2a, 0x6f, 0x3f, 0xbe, 0x31, 0xbf, 0x5c, 0x44,
+    0xf5, 0x63, 0xac, 0x74, 0x18, 0xdd, 0x79, 0xe1, 0xec, 0x99, 0xcd, 0x8d,
+    0x4e, 0x34, 0xfb, 0x5f, 0xd7, 0x85, 0xe6, 0x0f, 0xfd, 0x3d, 0xc0, 0x0d,
+    0x60, 0x2b, 0xa0, 0x98, 0xb9, 0x3c, 0x9b, 0x48, 0xe4, 0xe2, 0x12, 0x75,
+    0xe8, 0x96, 0xc3, 0x17, 0x9f, 0xcf, 0x0d, 0x8a, 0xc4, 0x85, 0x27, 0xda,
+    0xec, 0x84, 0xbe, 0xf0, 0x6d, 0xea, 0x4d, 0x30, 0x14, 0x6b, 0x2d, 0x62,
+    0x84, 0x5d, 0x9c, 0xb6, 0x33, 0x21, 0x93, 0xeb, 0xf1, 0x39, 0x97, 0xfd,
+    0xd0, 0xb2, 0x57, 0x25, 0x77, 0xa8, 0x96, 0x8a, 0x4d, 0xe9, 0x8f, 0xca,
+    0x8b, 0xcf, 0x2c, 0x18, 0x22, 0x85, 0x9f, 0x76, 0xb8, 0x36, 0x05, 0x78,
+    0x4d, 0xde, 0xc6, 0x37, 0x3a, 0x91, 0x6b, 0x71, 0xf0, 0xed, 0x3a, 0xf2,
+    0xe1, 0x4b, 0x0c, 0x69, 0x97, 0x9a, 0xa7, 0x6d, 0x91, 0x46, 0x50, 0x96,
+    0x8b, 0x79, 0xc5, 0x66, 0x63, 0x79, 0x56, 0xc2, 0x54, 0xa3, 0x93, 0x7c,
+    0x7b, 0x2c, 0xec, 0x7d, 0x7f, 0x9e, 0x6c, 0x12, 0x93, 0xed, 0xaf, 0x86,
+    0x21, 0x39, 0xaa, 0xac, 0x39, 0x5d, 0x8f, 0x0d, 0x54, 0x74, 0xa9, 0xbc,
+    0xfe, 0xf8, 0xc6, 0xfd, 0x71, 0x13, 0xd5, 0x8e, 0xb1, 0xd0, 0x63, 0x75,
+    0xe7, 0x87, 0xb2, 0x67, 0x36, 0x35, 0x38, 0xd3, 0xed, 0x7f, 0x5e, 0x17,
+    0x98, 0x3f, 0xf4, 0xf7, 0x00, 0x35, 0x80, 0xae, 0x82, 0x62, 0xe4, 0xf2,
+    0x6d, 0x23, 0x93, 0x88, 0x49, 0xd7, 0xa2, 0x5b, 0x0c, 0x5e, 0x7f, 0x3c,
+    0x36, 0x2b, 0x12, 0x14, 0x9f, 0x6b, 0xb2, 0x12, 0xfb, 0xc1, 0xb7, 0xa9,
+    0x34, 0xc0, 0x51, 0xac, 0xb5, 0x8a, 0x11, 0x76, 0x72, 0xd8, 0xcc, 0x86,
+    0x4f, 0xaf, 0xc4, 0xe6, 0x5f, 0xf7, 0x42, 0xc9, 0x5c, 0x95, 0xde, 0xa2,
+    0x5a, 0x29, 0x37, 0xa6, 0x3f, 0x2a, 0x2f, 0x3c, 0xb0, 0x60, 0x8a, 0x16,
+    0x7d, 0xda, 0xe0, 0xd8, 0x15, 0xe1, 0x37, 0x7b, 0x18, 0xdc, 0xea, 0x45,
+    0xad, 0xc7, 0xc3, 0xb4, 0xeb, 0xcb, 0x85, 0x2c, 0x31, 0xa6, 0x5e, 0x6a,
+    0x9d, 0xb6, 0x45, 0x19, 0x42, 0x5a, 0x2d, 0xe7, 0x15, 0x99, 0x8d, 0xe5,
+    0x5b, 0x09, 0x52, 0x8e, 0x4d, 0xf1, 0xec, 0xb3, 0xb1, 0xf5, 0xfe, 0x79,
+    0xb0, 0x4a, 0x4f, 0xb6, 0xbe, 0x18, 0x84, 0xe6, 0xaa, 0xb0, 0xe5, 0x76,
+    0x3c, 0x35, 0x51, 0xd2, 0xa6, 0xf3, 0xfb, 0xe3, 0x1b, 0xf5, 0xc4, 0x4f,
+    0x56, 0x3a, 0xc7, 0x41, 0x8d, 0xd7, 0x9e, 0x1e, 0xc9, 0x9c, 0xd8, 0xd4,
+    0xe3, 0x4f, 0xb5, 0xfd, 0x78, 0x5e, 0x60, 0xff, 0xd3, 0xdc, 0x00, 0xd6,
+    0x02, 0xba, 0x09, 0x8b, 0x93, 0xc9, 0xb4, 0x8e, 0x4e, 0x21, 0x27, 0x5e,
+    0x89, 0x6c, 0x31, 0x79, 0xfc, 0xf0, 0xd8, 0xac, 0x48, 0x52, 0x7d, 0xae,
+    0xc8, 0x4b, 0xef, 0x06, 0xde, 0xa4, 0xd3, 0x01, 0x46, 0xb2, 0xd6, 0x28,
+    0x45, 0xd9, 0xcb, 0x63, 0x32, 0x19, 0x3e, 0xbf, 0x13, 0x99, 0x7f, 0xdd,
+    0x0b, 0x25, 0x72, 0x57, 0x7a, 0x89, 0x68, 0xa4, 0xde, 0x98, 0xfc, 0xa8,
+    0xbc, 0xf2, 0xc1, 0x82, 0x28, 0x59, 0xf7, 0x6b, 0x83, 0x60, 0x57, 0x84,
+    0xdd, 0xec, 0x63, 0x73, 0xa9, 0x16, 0xb7, 0x1f, 0x0e, 0xd3, 0xaf, 0x2e,
+    0x14, 0xb0, 0xc6, 0x99, 0x79, 0xaa, 0x76, 0xd9, 0x14, 0x65, 0x09, 0x68,
+    0xb7, 0x9c, 0x56, 0x66, 0x37, 0x95, 0x6c, 0x25, 0x4a, 0x39, 0x37, 0xc7,
+    0xb2, 0xce, 0xc7, 0xd7, 0xf9, 0xe6, 0xc1, 0x29, 0x3e, 0xda, 0xf8, 0x62,
+    0x13, 0x9a, 0xaa, 0xc3, 0x95, 0xd8, 0xf0, 0xd5, 0x47, 0x4a, 0x9b, 0xcf,
+    0xef, 0x8c, 0x6f, 0xd7, 0x11, 0x3d, 0x58, 0xeb, 0x1d, 0x06, 0x37, 0x5e,
+    0x78, 0x7b, 0x26, 0x73, 0x63, 0x53, 0x8d, 0x3e, 0xd7, 0xf5, 0xe1, 0x79,
+    0x83, 0xff, 0x4f, 0x70, 0x03, 0x58, 0x0a, 0xe8, 0x26, 0x2e, 0x4f, 0x26,
+    0xd2, 0x39, 0x38, 0x84, 0x9d, 0x7a, 0x25, 0xb0, 0xc5, 0xe7, 0xf3, 0xc3,
+    0x62, 0xb1, 0x21, 0x49, 0xf6, 0xbb, 0x21, 0x2f, 0xbc, 0x1b, 0x7a, 0x93,
+    0x4c, 0x05, 0x1a, 0xcb, 0x58, 0xa1, 0x17, 0x67, 0x2d, 0x8c, 0xc8, 0x64,
+    0xfa, 0xfc, 0x4e, 0x65, 0xff, 0x74, 0x2c, 0x95, 0xc9, 0x5d, 0xea, 0x25,
+    0xa2, 0x93, 0x7a, 0x63, 0xf2, 0xa2, 0xf3, 0xcb, 0x06, 0x08, 0xa1, 0x67,
+    0xdd, 0xae, 0x0d, 0x81, 0x5e, 0x13, 0x77, 0xb1, 0x8d, 0xce, 0xa4, 0x5a,
+    0xdc, 0x7c, 0x3b, 0x4e, 0xbc, 0xb8, 0x52, 0xc3, 0x1a, 0x65, 0xe6, 0xa9,
+    0xdb, 0x64, 0x51, 0x94, 0x25, 0xa2, 0xde, 0x71, 0x59, 0x98, 0xde, 0x55,
+    0xb0, 0x95, 0x28, 0xe4, 0xdf, 0x1e, 0xcb, 0x3b, 0x1f, 0x5f, 0xe7, 0x9b,
+    0x04, 0xa4, 0xfb, 0x6b, 0xe1, 0x88, 0x4e, 0x6a, 0xab, 0x0e, 0x57, 0x63,
+    0xc3, 0x55, 0x1d, 0x2a, 0x6f, 0x3f, 0xbe, 0x31, 0xbf, 0x5c, 0x44, 0xf5,
+    0x63, 0xac, 0x74, 0x18, 0xdd, 0x79, 0xe1, 0xec, 0x99, 0xcd, 0x8d, 0x4e,
+    0x34, 0xfb, 0x5f, 0xd7, 0x85, 0xe6, 0x0f, 0xfd, 0x3d, 0xc0, 0x0d, 0x60,
+    0x2b, 0xa0, 0x98, 0xb9, 0x3c, 0x9b, 0x48, 0xe4, 0xe2, 0x12, 0x75, 0xe8,
+    0x96, 0xc3, 0x17, 0x9f, 0xcf, 0x0d, 0x8a, 0xc4, 0x85, 0x27, 0xda, 0xec,
+    0x84, 0xbe, 0xf0, 0x6d, 0xea, 0x4d, 0x30, 0x14, 0x6b, 0x2d, 0x62, 0x84,
+    0x5d, 0x9c, 0xb6, 0x33, 0x21, 0x93, 0xeb, 0xf1, 0x39, 0x97, 0xfd, 0xd0,
+    0xb2, 0x57, 0x25, 0x77, 0xa8, 0x96, 0x8a, 0x4d, 0xe9, 0x8f, 0xca, 0x8b,
+    0xcf, 0x2c, 0x18, 0x22, 0x85, 0x9f, 0x76, 0xb8, 0x36, 0x05, 0x78, 0x4d,
+    0xde, 0xc6, 0x37, 0x3a, 0x91, 0x6b, 0x71, 0xf0, 0xed, 0x3a, 0xf2, 0xe1,
+    0x4b, 0x0c, 0x69, 0x97, 0x9a, 0xa7, 0x6d, 0x91, 0x46, 0x50, 0x96, 0x8b,
+    0x79, 0xc5, 0x66, 0x63, 0x79, 0x56, 0xc2, 0x54, 0xa3, 0x93, 0x7c, 0x7b,
+    0x2c, 0xec, 0x7d, 0x7f, 0x9e, 0x6c, 0x12, 0x93, 0xed, 0xaf, 0x86, 0x21,
+    0x39, 0xaa, 0xac, 0x39, 0x5d, 0x8f, 0x0d, 0x54, 0x74, 0xa9, 0xbc, 0xfe,
+    0xf8, 0xc6, 0xfd, 0x71, 0x13, 0xd5, 0x8e, 0xb1, 0xd0, 0x63, 0x75, 0xe7,
+    0x87, 0xb2, 0x67, 0x36, 0x35, 0x38, 0xd3, 0xed, 0x7f, 0x5e, 0x17, 0x98,
+    0x3f, 0xf4, 0xf7, 0x00, 0x35, 0x80, 0xae, 0x82, 0x62, 0xe4, 0xf2, 0x6d,
+    0x23, 0x93, 0x88, 0x49, 0xd7, 0xa2, 0x5b, 0x0c, 0x5e, 0x7f, 0x3c, 0x36,
+    0x2b, 0x12, 0x14, 0x9f, 0x6b, 0xb2, 0x12, 0xfb, 0xc1, 0xb7, 0xa9, 0x34,
+    0xc0, 0x51, 0xac, 0xb5, 0x8a, 0x11, 0x76, 0x72, 0xd8, 0xcc, 0x86, 0x4f,
+    0xaf, 0xc4, 0xe6, 0x5f, 0xf7, 0x42, 0xc9, 0x5c, 0x95, 0xde, 0xa2, 0x5a,
+    0x29, 0x37, 0xa6, 0x3f, 0x2a, 0x2f, 0x3c, 0xb0, 0x60, 0x8a, 0x16, 0x7d,
+    0xda, 0xe0, 0xd8, 0x15, 0xe1, 0x37, 0x7b, 0x18, 0xdc, 0xea, 0x45, 0xad,
+    0xc7, 0xc3, 0xb4, 0xeb, 0xcb, 0x85, 0x2c, 0x31, 0xa6, 0x5e, 0x6a, 0x9d,
+    0xb6, 0x45, 0x19, 0x42, 0x5a, 0x2d, 0xe7, 0x15, 0x99, 0x8d, 0xe5, 0x5b,
+    0x09, 0x52, 0x8e, 0x4d, 0xf1, 0xec, 0xb3, 0xb1, 0xf5, 0xfe, 0x79, 0xb0,
+    0x4a, 0x4f, 0xb6, 0xbe, 0x18, 0x84, 0xe6, 0xaa, 0xb0, 0xe5, 0x76, 0x3c,
+    0x35, 0x51, 0xd2, 0xa6, 0xf3, 0xfb, 0xe3, 0x1b, 0xf5, 0xc4, 0x4f, 0x56,
+    0x3a, 0xc7, 0x41, 0x8d, 0xd7, 0x9e, 0x1e, 0xc9, 0x9c, 0xd8, 0xd4, 0xe3,
+    0x4f, 0xb5, 0xfd, 0x78, 0x5e, 0x60, 0xff, 0xd3, 0xdc, 0x00, 0xd6, 0x02,
+    0xba, 0x09, 0x8b, 0x93, 0xc9, 0xb4, 0x8e, 0x4e, 0x21, 0x27, 0x5e, 0x89,
+    0x6c, 0x31, 0x79, 0xfc, 0xf0, 0xd8, 0xac, 0x48, 0x52, 0x7d, 0xae, 0xc8,
+    0x4b, 0xef, 0x06, 0xde, 0xa4, 0xd3, 0x01, 0x46, 0xb2, 0xd6, 0x28, 0x45,
+    0xd9, 0xcb, 0x63, 0x32, 0x19, 0x3e, 0xbf, 0x13, 0x99, 0x7f, 0xdd, 0x0b,
+    0x25, 0x72, 0x57, 0x7a, 0x89, 0x68, 0xa4, 0xde, 0x98, 0xfc, 0xa8, 0xbc,
+    0xf2, 0xc1, 0x82, 0x28, 0x59, 0xf7, 0x6b, 0x83, 0x60, 0x57, 0x84, 0xdd,
+    0xec, 0x63, 0x73, 0xa9, 0x16, 0xb7, 0x1f, 0x0e, 0xd3, 0xaf, 0x2e, 0x14,
+    0xb0, 0xc6, 0x99, 0x79, 0xaa, 0x76, 0xd9, 0x14, 0x65, 0x09, 0x68, 0xb7,
+    0x9c, 0x56, 0x66, 0x37, 0x95, 0x6c, 0x25, 0x4a, 0x39, 0x37, 0xc7, 0xb2,
+    0xce, 0xc7, 0xd7, 0xf9, 0xe6, 0xc1, 0x29, 0x3e, 0xda, 0xf8, 0x62, 0x13,
+    0x9a, 0xaa, 0xc3, 0x95, 0xd8, 0xf0, 0xd5, 0x47, 0x4a, 0x9b, 0xcf, 0xef,
+    0x8c, 0x6f, 0xd7, 0x11, 0x3d, 0x58, 0xeb, 0x1d, 0x06, 0x37, 0x5e, 0x78,
+    0x7b, 0x26, 0x73, 0x63, 0x53, 0x8d, 0x3e, 0xd7, 0xf5, 0xe1, 0x79, 0x83,
+    0xff, 0x4f, 0x70, 0x03, 0x58, 0x0a, 0xe8, 0x26, 0x2e, 0x4f, 0x26, 0xd2,
+    0x39, 0x38, 0x84, 0x9d, 0x7a, 0x25, 0xb0, 0xc5, 0xe7, 0xf3, 0xc3, 0x62,
+    0xb1, 0x21, 0x49, 0xf6, 0xbb, 0x21, 0x2f, 0xbc, 0x1b, 0x7a, 0x93, 0x4c,
+    0x05, 0x1a, 0xcb, 0x58, 0xa1, 0x17, 0x67, 0x2d, 0x8c, 0xc8, 0x64, 0xfa,
+    0xfc, 0x4e, 0x65, 0xff, 0x74, 0x2c, 0x95, 0xc9, 0x5d, 0xea, 0x25, 0xa2,
+    0x93, 0x7a, 0x63, 0xf2, 0xa2, 0xf3, 0xcb, 0x06, 0x08, 0xa1, 0x67, 0xdd,
+    0xae, 0x0d, 0x81, 0x5e, 0x13, 0x77, 0xb1, 0x8d, 0xce, 0xa4, 0x5a, 0xdc,
+    0x7c, 0x3b, 0x4e, 0xbc, 0xb8, 0x52, 0xc3, 0x1a, 0x65, 0xe6, 0xa9, 0xdb,
+    0x64, 0x51, 0x94, 0x25, 0xa2, 0xde, 0x71, 0x59, 0x98, 0xde, 0x55, 0xb0,
+    0x95, 0x28, 0xe4, 0xdf, 0x1e, 0xcb, 0x3b, 0x1f, 0x5f, 0xe7, 0x9b, 0x04,
+    0xa4, 0xfb, 0x6b, 0xe1, 0x88, 0x4e, 0x6a, 0xab, 0x0e, 0x57, 0x63, 0xc3,
+    0x55, 0x1d, 0x2a, 0x6f, 0x3f, 0xbe, 0x31, 0xbf, 0x5c, 0x44, 0xf5, 0x63,
+    0xac, 0x74, 0x18, 0xdd, 0x79, 0xe1, 0xec, 0x99, 0xcd, 0x8d, 0x4e, 0x34,
+    0xfb, 0x5f, 0xd7, 0x85, 0xe6, 0x0f, 0xfd, 0x3d, 0xc0, 0x0d, 0x60, 0x2b,
+    0xa0, 0x98, 0xb9, 0x3c, 0x9b, 0x48, 0xe4, 0xe2, 0x12, 0x75, 0xe8, 0x96,
+    0xc3, 0x17, 0x9f, 0xcf, 0x0d, 0x8a, 0xc4, 0x85, 0x27, 0xda, 0xec, 0x84,
+    0xbe, 0xf0, 0x6d, 0xea, 0x4d, 0x30, 0x14, 0x6b, 0x2d, 0x62, 0x84, 0x5d,
+    0x9c, 0xb6, 0x33, 0x21, 0x93, 0xeb, 0xf1, 0x39, 0x97, 0xfd, 0xd0, 0xb2,
+    0x57, 0x25, 0x77, 0xa8, 0x96, 0x8a, 0x4d, 0xe9, 0x8f, 0xca, 0x8b, 0xcf,
+    0x2c, 0x18, 0x22, 0x85, 0x9f, 0x76, 0xb8, 0x36, 0x05, 0x78, 0x4d, 0xde,
+    0xc6, 0x37, 0x3a, 0x91, 0x6b, 0x71, 0xf0, 0xed, 0x3a, 0xf2, 0xe1, 0x4b,
+    0x0c, 0x69, 0x97, 0x9a, 0xa7, 0x6d, 0x91, 0x46, 0x50, 0x96, 0x8b, 0x79,
+    0xc5, 0x66, 0x63, 0x79, 0x56, 0xc2, 0x54, 0xa3, 0x93, 0x7c, 0x7b, 0x2c,
+    0xec, 0x7d, 0x7f, 0x9e, 0x6c, 0x12, 0x93, 0xed, 0xaf, 0x86, 0x21, 0x39,
+    0xaa, 0xac, 0x39, 0x5d, 0x8f, 0x0d, 0x54, 0x74, 0xa9, 0xbc, 0xfe, 0xf8,
+    0xc6, 0xfd, 0x71, 0x13, 0xd5, 0x8e, 0xb1, 0xd0, 0x63, 0x75, 0xe7, 0x87,
+    0xb2, 0x67, 0x36, 0x35, 0x38, 0xd3, 0xed, 0x7f, 0x5e, 0x17, 0x98, 0x3f,
+    0xf4, 0xf7, 0x00, 0x35, 0x80, 0xae, 0x82, 0x62, 0xe4, 0xf2, 0x6d, 0x23,
+    0x93, 0x88, 0x49, 0xd7, 0xa2, 0x5b, 0x0c, 0x5e, 0x7f, 0x3c, 0x36, 0x2b,
+    0x12, 0x14, 0x9f, 0x6b, 0xb2, 0x12, 0xfb, 0xc1, 0xb7, 0xa9, 0x34, 0xc0,
+    0x51, 0xac, 0xb5, 0x8a, 0x11, 0x76, 0x72, 0xd8, 0xcc, 0x86, 0x4f, 0xaf,
+    0xc4, 0xe6, 0x5f, 0xf7, 0x42, 0xc9, 0x5c, 0x95, 0xde, 0xa2, 0x70,
+};
+static_assert(sizeof(kBytesTestReadSymbol14) == kNumBytesTestReadSymbol14, "");
+
+// The kBytesTestReadSymbol16[] array was encoded by using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][17] = {
+//   // pmf: 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16,
+//   // 1/16, 1/16, 1/16, 1/16, 1/16
+//   { 32768 - 2048, 32768 - 4096, 32768 - 6144, 32768 - 8192, 32768 - 10240,
+//     32768 - 12288, 32768 - 14336, 32768 - 16384, 32768 - 18432,
+//     32768 - 20480, 32768 - 22528, 32768 - 24576, 32768 - 26624,
+//     32768 - 28672, 32768 - 30720, 0, 0 },
+//   // pmf: 3/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32,
+//   // 2/32, 2/32, 2/32, 2/32, 1/32
+//   { 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216, 32768 - 11264,
+//     32768 - 13312, 32768 - 15360, 32768 - 17408, 32768 - 19456,
+//     32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648,
+//     32768 - 29696, 32768 - 31744, 0, 0 },
+//   // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32,
+//   // 2/32, 2/32, 2/32, 2/32, 3/32
+//   { 32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216,
+//     32768 - 11264, 32768 - 13312, 32768 - 15360, 32768 - 17408,
+//     32768 - 19456, 32768 - 21504, 32768 - 23552, 32768 - 25600,
+//     32768 - 27648, 32768 - 29696, 0, 0 },
+//   // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 3/32, 3/32, 2/32, 2/32,
+//   // 2/32, 2/32, 2/32, 2/32, 1/32
+//   { 32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216,
+//     32768 - 11264, 32768 - 13312, 32768 - 16384, 32768 - 19456,
+//     32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648,
+//     32768 - 29696, 32768 - 31744, 0, 0 },
+// };
+// constexpr int kSymbols[32][4] = { { 0, 8, 15, 7 },    //
+//                                   { 1, 9, 14, 6 },    //
+//                                   { 2, 10, 13, 5 },   //
+//                                   { 3, 11, 12, 4 },   //
+//                                   { 4, 12, 11, 3 },   //
+//                                   { 5, 13, 10, 2 },   //
+//                                   { 6, 14, 9, 1 },    //
+//                                   { 7, 15, 8, 0 },    //
+//                                   { 8, 0, 7, 15 },    //
+//                                   { 9, 1, 6, 14 },    //
+//                                   { 10, 2, 5, 13 },   //
+//                                   { 11, 3, 4, 12 },   //
+//                                   { 12, 4, 3, 11 },   //
+//                                   { 13, 5, 2, 10 },   //
+//                                   { 14, 6, 1, 9 },    //
+//                                   { 15, 7, 0, 8 },    //
+//                                   { 0, 0, 15, 13 },   //
+//                                   { 2, 1, 14, 11 },   //
+//                                   { 4, 3, 12, 9 },    //
+//                                   { 6, 5, 10, 7 },    //
+//                                   { 8, 7, 8, 5 },     //
+//                                   { 10, 9, 6, 3 },    //
+//                                   { 12, 11, 4, 1 },   //
+//                                   { 14, 13, 2, 14 },  //
+//                                   { 1, 0, 15, 12 },   //
+//                                   { 3, 2, 13, 10 },   //
+//                                   { 5, 4, 11, 8 },    //
+//                                   { 7, 6, 9, 6 },     //
+//                                   { 9, 8, 7, 4 },     //
+//                                   { 11, 10, 5, 2 },   //
+//                                   { 13, 12, 3, 8 },   //
+//                                   { 15, 14, 1, 7 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 48; ++i) {
+//   for (int j = 0; j < 32; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 16);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n    ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
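+//
+// Decoding mirrors the sketch given after the kBytesTestReadSymbol14 encoder
+// code above (same assumed libgav1 API): 17-entry cdf rows, the 32-entry
+// kSymbols table, 48 outer iterations, and reader.ReadSymbol(cdf[k], 16).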
+
+constexpr size_t kNumBytesTestReadSymbol16 = 3120;
+constexpr uint8_t kBytesTestReadSymbol16[] = {
+    0x09, 0x2c, 0xb8, 0x5a, 0xe4, 0xe6, 0xc6, 0x1f, 0x3e, 0xa7, 0x50, 0xbf,
+    0x19, 0x26, 0xbf, 0x20, 0xc3, 0xa2, 0x08, 0xdf, 0x44, 0xd9, 0x4d, 0x8c,
+    0xf7, 0xbf, 0x6b, 0x6d, 0x22, 0x97, 0x8e, 0xd7, 0x93, 0xad, 0x33, 0xe3,
+    0x7f, 0x5b, 0x71, 0x03, 0x6b, 0x4e, 0xbf, 0xf5, 0x38, 0xbe, 0xba, 0x6c,
+    0x0d, 0x28, 0xca, 0x74, 0x2d, 0x1d, 0x3f, 0x91, 0xad, 0x7e, 0x98, 0x5c,
+    0xa7, 0x39, 0x5e, 0x7c, 0x43, 0x2b, 0x88, 0xb2, 0x81, 0x91, 0xad, 0x62,
+    0x14, 0xc6, 0x0a, 0x81, 0x15, 0x1f, 0x4e, 0xd5, 0xc1, 0x5c, 0x43, 0x35,
+    0xc3, 0xe6, 0x3d, 0xaa, 0xc3, 0xb5, 0x95, 0x01, 0xbd, 0x2d, 0x21, 0x04,
+    0x14, 0x79, 0x7a, 0x02, 0x7e, 0xb8, 0x09, 0x20, 0x06, 0x82, 0xc8, 0x6f,
+    0x29, 0x2c, 0xb2, 0x9b, 0xe2, 0x8d, 0xf5, 0x56, 0xf5, 0x64, 0xf4, 0xd7,
+    0xfe, 0x24, 0x29, 0xb6, 0x35, 0x16, 0x08, 0x26, 0xc0, 0xf0, 0xfd, 0x33,
+    0x04, 0x6f, 0x70, 0x85, 0x3a, 0xac, 0x8f, 0xab, 0x48, 0xce, 0x04, 0xc1,
+    0x0a, 0x4c, 0xb6, 0xaa, 0x83, 0x39, 0xc1, 0xf6, 0x00, 0xb8, 0x56, 0x4e,
+    0xa2, 0xd1, 0x19, 0x70, 0x6a, 0x2b, 0x86, 0xef, 0xbd, 0x11, 0x27, 0x54,
+    0x52, 0x01, 0xa2, 0x3f, 0x53, 0x0e, 0x5b, 0x23, 0x3c, 0x90, 0x82, 0xaf,
+    0x9d, 0x79, 0xb5, 0x5e, 0x7e, 0x2e, 0x6e, 0xad, 0x3d, 0xe9, 0x3a, 0xff,
+    0xd7, 0x59, 0x40, 0xa3, 0x56, 0xa9, 0x5e, 0x52, 0xda, 0x04, 0x74, 0x09,
+    0x47, 0x7c, 0x6c, 0x4b, 0xad, 0x00, 0x8b, 0xbc, 0x33, 0x16, 0x49, 0xf6,
+    0xa5, 0x11, 0x8d, 0xb4, 0xbc, 0x28, 0xea, 0x1b, 0x34, 0x1e, 0xb7, 0x1e,
+    0xbf, 0x50, 0xe3, 0x60, 0xad, 0x41, 0xe0, 0x19, 0xfa, 0xa4, 0x23, 0x98,
+    0x48, 0x23, 0xad, 0xfa, 0xdb, 0x3c, 0x0a, 0x15, 0xeb, 0xf5, 0xf1, 0x43,
+    0xf2, 0xfd, 0x42, 0xf2, 0xd0, 0x3f, 0xa6, 0x3b, 0xc8, 0x81, 0x52, 0xba,
+    0xcf, 0x2d, 0xff, 0x2c, 0x24, 0x13, 0x62, 0x78, 0x01, 0xd8, 0xcb, 0xfc,
+    0xda, 0x70, 0x58, 0xad, 0xf1, 0xe6, 0x30, 0x47, 0x39, 0xc6, 0xf0, 0xbc,
+    0xe4, 0x89, 0x49, 0x46, 0x79, 0xde, 0xac, 0xde, 0xbd, 0x97, 0x18, 0x8f,
+    0x17, 0x07, 0xc1, 0xaf, 0xf8, 0xc1, 0x45, 0x95, 0x50, 0x36, 0x4d, 0x16,
+    0x35, 0x92, 0x2b, 0x5a, 0x71, 0x81, 0x59, 0xe5, 0x7f, 0xba, 0x10, 0xc9,
+    0x49, 0xd4, 0xeb, 0x64, 0x08, 0x54, 0x8b, 0xfa, 0xb3, 0xc8, 0x3a, 0xd7,
+    0xa6, 0xa9, 0xf2, 0xae, 0x04, 0xf8, 0x55, 0x5c, 0xff, 0x2d, 0x17, 0x53,
+    0x37, 0xc5, 0x36, 0xd8, 0x42, 0xd7, 0x47, 0xd8, 0x00, 0x99, 0x9c, 0x5d,
+    0x9f, 0x34, 0xc2, 0x09, 0x6b, 0x1a, 0xf3, 0x2f, 0xb0, 0xf8, 0x49, 0x54,
+    0x9d, 0x4b, 0xb8, 0xcf, 0xc5, 0x3b, 0x7f, 0x49, 0x9b, 0x40, 0xa9, 0xd3,
+    0x96, 0xe1, 0x6b, 0x87, 0x2d, 0x50, 0x76, 0x15, 0xd9, 0x9f, 0x87, 0x4f,
+    0x13, 0x26, 0xf2, 0xf8, 0xae, 0xd4, 0x63, 0x02, 0x0c, 0xcb, 0xe5, 0x63,
+    0x1c, 0x73, 0xdf, 0x57, 0x55, 0x16, 0x57, 0x3b, 0xfb, 0x9a, 0x06, 0x70,
+    0xfc, 0x9f, 0x29, 0x16, 0xec, 0x63, 0x34, 0x6f, 0x40, 0x1f, 0x54, 0x2a,
+    0xe7, 0x4a, 0x6f, 0xde, 0x86, 0xeb, 0x8c, 0x91, 0x3e, 0xfc, 0x6a, 0x48,
+    0xd1, 0x51, 0x33, 0xd7, 0xe1, 0x9d, 0xf8, 0x71, 0x21, 0x7b, 0x02, 0x38,
+    0x6a, 0xef, 0x30, 0x70, 0x38, 0x01, 0xc3, 0xef, 0x5d, 0x4f, 0xd3, 0x37,
+    0x2d, 0xe0, 0x4f, 0x4b, 0x72, 0xbc, 0xde, 0x9f, 0x32, 0x97, 0xe2, 0x55,
+    0x5e, 0x59, 0x5d, 0xa2, 0x9f, 0x5a, 0x04, 0x7c, 0x13, 0xe1, 0x35, 0x62,
+    0x4a, 0x10, 0x24, 0x55, 0x63, 0xb8, 0x8f, 0x66, 0xbc, 0x04, 0x08, 0x4e,
+    0xcc, 0xdc, 0x1f, 0x88, 0xc5, 0xcf, 0x8a, 0x7e, 0x24, 0x3e, 0x6f, 0x58,
+    0xcb, 0x44, 0x3c, 0x18, 0x64, 0xd9, 0x84, 0xa8, 0x1c, 0x0b, 0x20, 0xf4,
+    0x8b, 0x8b, 0x4b, 0xf8, 0x39, 0x8b, 0x01, 0x3a, 0x0b, 0x27, 0x67, 0xf8,
+    0x0f, 0xbd, 0xb3, 0x32, 0xce, 0xef, 0xbc, 0x8c, 0xa3, 0x31, 0xee, 0x0b,
+    0xdb, 0xc7, 0xc3, 0x43, 0x80, 0xe4, 0x7c, 0x9b, 0x89, 0xa4, 0x6b, 0x23,
+    0x2f, 0xa8, 0x28, 0xe0, 0x55, 0x30, 0x6e, 0xe7, 0xc9, 0x50, 0x1d, 0xbf,
+    0x67, 0xc8, 0x74, 0x58, 0x0f, 0xdb, 0xa6, 0x1f, 0xa6, 0xfd, 0xf0, 0x75,
+    0xea, 0x62, 0xd5, 0x44, 0xa2, 0x7e, 0xed, 0x63, 0xba, 0x7c, 0x5d, 0xb7,
+    0x16, 0x84, 0x30, 0x5d, 0xc2, 0xd3, 0x39, 0x61, 0x60, 0x0a, 0xb9, 0x34,
+    0x5e, 0x54, 0xf4, 0x34, 0x77, 0x22, 0x05, 0x41, 0x6b, 0x6a, 0x13, 0xc3,
+    0x10, 0x03, 0x8a, 0x78, 0xd2, 0x81, 0xac, 0x49, 0x31, 0xc8, 0xee, 0x15,
+    0xc3, 0x42, 0x3b, 0x00, 0xf6, 0x05, 0x92, 0x82, 0x6e, 0x73, 0xb4, 0xfa,
+    0xab, 0xe0, 0x2e, 0xe9, 0x5d, 0x89, 0x43, 0x0c, 0x4d, 0x88, 0x0c, 0xf1,
+    0xa4, 0x19, 0x59, 0xa0, 0x69, 0x0c, 0xfc, 0xf9, 0x9a, 0xbc, 0x3b, 0x2e,
+    0x3b, 0x29, 0xf8, 0xd7, 0x79, 0x11, 0xb2, 0x66, 0x26, 0x57, 0x34, 0x06,
+    0xb8, 0x36, 0x41, 0xca, 0x01, 0x10, 0xca, 0x06, 0xee, 0xb6, 0xf7, 0x1d,
+    0x0d, 0x88, 0xab, 0x07, 0xbe, 0x06, 0x8c, 0x1c, 0xa2, 0x76, 0x5e, 0xdb,
+    0x60, 0xa4, 0x43, 0x17, 0x31, 0xc3, 0x4b, 0x0a, 0x01, 0x80, 0xa7, 0xf6,
+    0xe6, 0x78, 0x64, 0x85, 0xb0, 0x8a, 0x28, 0x34, 0x82, 0x98, 0x29, 0x3f,
+    0xde, 0x07, 0x9a, 0x80, 0xcf, 0xe3, 0x6f, 0x23, 0x57, 0x79, 0x11, 0xb2,
+    0x61, 0x6d, 0x98, 0x26, 0xeb, 0x3b, 0xbf, 0xaa, 0x98, 0x62, 0xbb, 0xfd,
+    0x21, 0x76, 0xe5, 0xc5, 0xe0, 0x09, 0x21, 0x65, 0x72, 0x94, 0xd3, 0x8a,
+    0xcd, 0xfb, 0xec, 0x6e, 0x57, 0xd4, 0x2a, 0x92, 0xd1, 0xe9, 0x16, 0x46,
+    0xa2, 0x38, 0xae, 0x4b, 0x7e, 0xa7, 0x0c, 0x26, 0x9d, 0x96, 0xd7, 0x49,
+    0xa7, 0x02, 0x2b, 0x22, 0x9a, 0x39, 0x38, 0x11, 0xb8, 0xb3, 0xd5, 0x09,
+    0xf9, 0x70, 0xb4, 0x1c, 0x4e, 0xe3, 0xba, 0xa0, 0x78, 0x76, 0x6d, 0xc4,
+    0xab, 0x96, 0x3e, 0x98, 0x04, 0x4e, 0x50, 0x20, 0xd9, 0xfa, 0xea, 0xe2,
+    0x99, 0x50, 0x84, 0x20, 0x18, 0x69, 0xbb, 0x6e, 0x41, 0x9d, 0x18, 0x71,
+    0x15, 0x19, 0xd2, 0xf2, 0xa5, 0x69, 0x54, 0x8e, 0x60, 0x75, 0xd4, 0xe7,
+    0xdb, 0xe1, 0x43, 0xfd, 0x2e, 0x21, 0x4f, 0xff, 0x98, 0x8b, 0x08, 0x74,
+    0xca, 0x29, 0x7e, 0x3f, 0x2f, 0x6a, 0xf9, 0xe6, 0x49, 0x1d, 0xc6, 0x0b,
+    0x76, 0xc9, 0x22, 0xc3, 0x4f, 0xaf, 0xa8, 0xf9, 0xd6, 0x9c, 0x9a, 0x64,
+    0xec, 0xb3, 0x2c, 0x0f, 0x3e, 0x93, 0xc4, 0xb6, 0xd7, 0x36, 0x28, 0x04,
+    0xe5, 0x81, 0x48, 0x14, 0x9f, 0x4e, 0xc5, 0x9b, 0xd7, 0xc0, 0x0e, 0x35,
+    0xab, 0x49, 0xd3, 0x84, 0x9f, 0x5c, 0x93, 0x94, 0xa6, 0xd2, 0xb5, 0x83,
+    0x9d, 0x38, 0x0f, 0x85, 0x04, 0xa3, 0xb7, 0x23, 0x20, 0x93, 0x85, 0x48,
+    0x14, 0x0c, 0x22, 0x80, 0x92, 0x6c, 0xca, 0x3c, 0xc7, 0xfc, 0xa9, 0x88,
+    0x62, 0xbc, 0x2a, 0x91, 0x08, 0x5b, 0xb4, 0x60, 0xd1, 0x0f, 0x3c, 0x33,
+    0xc6, 0xe1, 0xf7, 0xca, 0xf7, 0xf9, 0xa1, 0x9b, 0xfa, 0xf7, 0x34, 0xe0,
+    0x54, 0xac, 0x53, 0x42, 0x30, 0x76, 0xc8, 0xc2, 0xcd, 0x61, 0x49, 0x87,
+    0x9c, 0x47, 0xf5, 0x98, 0xb5, 0x41, 0xf0, 0xad, 0xdb, 0x37, 0x06, 0xb8,
+    0x54, 0xa5, 0x26, 0x11, 0x4b, 0x18, 0xbb, 0xa4, 0xfb, 0x24, 0xd3, 0x14,
+    0x31, 0xfb, 0x56, 0x18, 0xd8, 0xc2, 0xd0, 0xd2, 0xab, 0xde, 0xdf, 0xa9,
+    0xdf, 0x9e, 0xa6, 0x56, 0x0d, 0x9f, 0xe4, 0x19, 0x15, 0x58, 0x18, 0xc6,
+    0x5e, 0x47, 0x05, 0x3a, 0x0e, 0x73, 0x68, 0x81, 0x39, 0x8c, 0x51, 0x1d,
+    0x04, 0x4e, 0x18, 0x54, 0xa5, 0x3e, 0x13, 0x4a, 0x15, 0xc2, 0x43, 0x90,
+    0xc2, 0x71, 0x8d, 0x53, 0x1b, 0xab, 0xe9, 0xbc, 0x69, 0x3e, 0x11, 0x46,
+    0x9d, 0xa4, 0xd3, 0x15, 0x80, 0xec, 0xe8, 0x31, 0x4f, 0x5a, 0x2a, 0x15,
+    0x3e, 0x7e, 0x7a, 0x44, 0x0e, 0x4a, 0xac, 0x9b, 0x46, 0x2f, 0x86, 0xf9,
+    0xea, 0x59, 0x4f, 0x15, 0xa0, 0x4b, 0xd1, 0xaa, 0xd8, 0x3a, 0x83, 0xb6,
+    0x25, 0x82, 0xb0, 0x44, 0x4a, 0x98, 0xbd, 0x10, 0xa2, 0xb0, 0x95, 0x02,
+    0xfa, 0x1f, 0xd3, 0x54, 0x1c, 0x0a, 0xb1, 0x31, 0x28, 0xec, 0x4c, 0xd2,
+    0x0c, 0xb9, 0xb0, 0xf4, 0x7a, 0x89, 0x63, 0x3c, 0x5f, 0xcf, 0x3c, 0xe8,
+    0xba, 0x21, 0x66, 0x20, 0x01, 0xcb, 0x1b, 0xc6, 0xf9, 0x54, 0x0f, 0xda,
+    0x4a, 0xcc, 0x81, 0x7b, 0x41, 0x81, 0xc0, 0x1f, 0xea, 0x9a, 0x9b, 0x96,
+    0x0d, 0x47, 0xdd, 0x16, 0x52, 0x5c, 0xaf, 0xae, 0x82, 0x3d, 0x18, 0x60,
+    0xfa, 0x34, 0xc2, 0x57, 0x2d, 0xc4, 0x2b, 0x2e, 0x41, 0xfe, 0xe7, 0x95,
+    0xcd, 0x1f, 0xbe, 0x88, 0x31, 0xc1, 0x07, 0x2c, 0xd3, 0xb1, 0xbb, 0xeb,
+    0x1d, 0xa3, 0x03, 0x1e, 0x70, 0xcc, 0x84, 0xe0, 0x65, 0x41, 0x0f, 0xf1,
+    0x7c, 0x95, 0x4b, 0x41, 0x43, 0x62, 0xad, 0x5d, 0xff, 0x4f, 0x92, 0xc8,
+    0xaa, 0x21, 0x23, 0xba, 0xa9, 0x90, 0xb5, 0xae, 0xc0, 0x1f, 0xae, 0x43,
+    0xf1, 0x79, 0x14, 0x30, 0x16, 0x1d, 0x2a, 0x6c, 0xd1, 0xd8, 0xb3, 0x38,
+    0x25, 0xd1, 0x66, 0xa5, 0x89, 0xc0, 0x8d, 0xc5, 0xa0, 0x6a, 0x7c, 0x64,
+    0xf8, 0x45, 0x1a, 0x76, 0x93, 0x4c, 0x56, 0x03, 0xb3, 0xa0, 0xc5, 0x40,
+    0xbc, 0x84, 0x98, 0x8d, 0xa4, 0xfe, 0x0b, 0x8c, 0x47, 0xa2, 0x88, 0x85,
+    0x2a, 0x89, 0xad, 0xd3, 0x16, 0x5b, 0x20, 0x02, 0x70, 0xbf, 0x72, 0x29,
+    0x0c, 0x0a, 0x9c, 0xac, 0x9c, 0x4d, 0xfa, 0x02, 0x5e, 0xe9, 0xe3, 0x52,
+    0x84, 0x54, 0x1f, 0xb7, 0xea, 0xb1, 0xc4, 0x2f, 0x69, 0xd1, 0x33, 0xc6,
+    0xb3, 0xee, 0xb0, 0x35, 0x1f, 0x19, 0x68, 0x2d, 0xef, 0xc1, 0xd3, 0x1c,
+    0xa8, 0x84, 0x54, 0x3c, 0x21, 0xed, 0x78, 0x35, 0x3f, 0x82, 0xb2, 0xa8,
+    0xe4, 0x25, 0x71, 0xfc, 0x1e, 0x1d, 0x36, 0xf4, 0xf4, 0x0f, 0x6f, 0x5b,
+    0xd9, 0x21, 0x13, 0x3a, 0x3d, 0x17, 0x45, 0x31, 0x78, 0x97, 0x99, 0x15,
+    0x87, 0xa9, 0xa6, 0x36, 0xf0, 0x20, 0xfa, 0xd5, 0x10, 0x01, 0x91, 0xa0,
+    0x4f, 0x28, 0x6a, 0x13, 0x04, 0xff, 0x97, 0x96, 0xf1, 0xfc, 0x1c, 0xc8,
+    0xcd, 0xe4, 0xbd, 0xe5, 0x40, 0x9a, 0x37, 0xc2, 0x01, 0x11, 0x2a, 0xc0,
+    0x0e, 0x58, 0x69, 0x29, 0xd0, 0x72, 0x26, 0x7c, 0x23, 0xec, 0x58, 0xfe,
+    0xbd, 0x15, 0x97, 0xe8, 0x29, 0x9f, 0x79, 0xb1, 0xfa, 0xac, 0x59, 0xe0,
+    0x78, 0x1c, 0xb4, 0x29, 0xee, 0x00, 0x39, 0x11, 0x0a, 0x2a, 0xb9, 0x98,
+    0x4e, 0xbf, 0x75, 0x9e, 0xe8, 0xbb, 0x4b, 0xe0, 0x6b, 0xab, 0x5b, 0x2f,
+    0x2d, 0xe3, 0xf8, 0x39, 0x91, 0x9b, 0xc9, 0x7b, 0xca, 0x81, 0x34, 0x6f,
+    0x84, 0x02, 0x22, 0x55, 0x80, 0x1c, 0xb0, 0xd2, 0x53, 0xa0, 0xe4, 0x4c,
+    0xf8, 0x47, 0xd8, 0xb1, 0xfd, 0x7a, 0x2b, 0x2f, 0xd0, 0x53, 0x3e, 0xf3,
+    0x63, 0xf5, 0x58, 0xb3, 0xc0, 0xf0, 0x39, 0x00, 0x08, 0x97, 0x4b, 0xe2,
+    0x46, 0x04, 0xa2, 0x39, 0x9c, 0xf2, 0x57, 0x17, 0x4a, 0xdd, 0x9f, 0x5e,
+    0xb1, 0x8b, 0x6b, 0x5d, 0x6e, 0x3e, 0x85, 0x34, 0x04, 0x96, 0x56, 0xe7,
+    0x4f, 0x6f, 0xd0, 0x31, 0xe7, 0x0c, 0xc8, 0x88, 0xdd, 0x5b, 0x14, 0x00,
+    0x60, 0x2a, 0x06, 0x18, 0xcd, 0x7f, 0xc9, 0xee, 0xd2, 0xd0, 0x8c, 0xc0,
+    0xed, 0x8f, 0x4a, 0x3e, 0x83, 0x52, 0x2e, 0x4a, 0xe9, 0xfa, 0x1f, 0x1a,
+    0xd5, 0xc0, 0x59, 0x4c, 0x8a, 0x2a, 0xab, 0x40, 0x2f, 0x84, 0xd2, 0x85,
+    0x70, 0x90, 0x96, 0xf3, 0x84, 0x6f, 0x1e, 0x81, 0x8c, 0x80, 0x03, 0x03,
+    0x2d, 0x36, 0x2e, 0x60, 0x79, 0x13, 0x63, 0x7f, 0xe7, 0xe3, 0x4a, 0x96,
+    0x08, 0xd8, 0x35, 0x15, 0x46, 0x8a, 0xe0, 0xb8, 0xc4, 0x7a, 0x28, 0x88,
+    0x52, 0xa8, 0x9a, 0xdd, 0x31, 0x65, 0xb2, 0x00, 0x24, 0xd9, 0xf4, 0x07,
+    0xea, 0xab, 0x7c, 0xe8, 0xa2, 0xea, 0xa7, 0x23, 0xd1, 0x93, 0x9e, 0xe7,
+    0x48, 0x34, 0x89, 0xf5, 0xb4, 0x45, 0x5e, 0xfa, 0xa6, 0xee, 0x32, 0x75,
+    0x8c, 0x56, 0x08, 0xcc, 0xeb, 0x5b, 0x05, 0xc2, 0x1d, 0x62, 0xa8, 0x5d,
+    0xaa, 0x50, 0xc2, 0x85, 0x85, 0x25, 0xb3, 0x5f, 0x60, 0xe7, 0x90, 0x1b,
+    0xa8, 0xb7, 0xf6, 0x83, 0x11, 0x07, 0x1f, 0xfc, 0xce, 0x58, 0x22, 0x8a,
+    0x3d, 0xa9, 0x8c, 0x18, 0x66, 0xa8, 0x32, 0x78, 0xa0, 0x16, 0x8a, 0xa2,
+    0x5d, 0x2f, 0x89, 0x18, 0x12, 0x88, 0xe6, 0x73, 0xc9, 0x5c, 0x5d, 0x2b,
+    0x76, 0x7d, 0x7a, 0xc6, 0x2d, 0xad, 0x75, 0xb8, 0xfa, 0x14, 0xd0, 0x12,
+    0x59, 0x5b, 0x9d, 0x3d, 0xbf, 0x40, 0xc7, 0x9c, 0x33, 0x22, 0x23, 0x75,
+    0x6c, 0x50, 0x01, 0x80, 0xa8, 0x18, 0x63, 0x35, 0xff, 0x27, 0xbb, 0x4b,
+    0x42, 0x33, 0x03, 0xb6, 0x3d, 0x28, 0xfa, 0x0d, 0x48, 0xb9, 0x2b, 0xa7,
+    0xe8, 0x7c, 0x6b, 0x57, 0x01, 0x65, 0x32, 0x28, 0xaa, 0xad, 0x00, 0xbe,
+    0x13, 0x4a, 0x15, 0xc2, 0x42, 0x5b, 0xce, 0x11, 0xbc, 0x7a, 0x06, 0x32,
+    0x00, 0x0c, 0x0c, 0xb4, 0xd8, 0xb9, 0x81, 0xe4, 0x4d, 0x8d, 0xff, 0x9f,
+    0x8d, 0x2a, 0x58, 0x23, 0x60, 0xd4, 0x55, 0x1a, 0x2b, 0x82, 0xe3, 0x11,
+    0xe8, 0xa2, 0x21, 0x4a, 0xa2, 0x6b, 0x74, 0xc5, 0x96, 0xc8, 0x00, 0x93,
+    0x67, 0xd0, 0x1f, 0xaa, 0xad, 0xf3, 0xa2, 0x8b, 0xaa, 0x9c, 0x8f, 0x46,
+    0x4e, 0x7b, 0x9d, 0x20, 0xd2, 0x27, 0xd6, 0xd1, 0x15, 0x7b, 0xea, 0x9b,
+    0xb8, 0xc9, 0xd6, 0x31, 0x58, 0x23, 0x33, 0xad, 0x6c, 0x17, 0x08, 0x75,
+    0x8a, 0xa1, 0x76, 0xa9, 0x43, 0x0a, 0x16, 0x14, 0x96, 0xcd, 0x7d, 0x83,
+    0x9e, 0x40, 0x6e, 0xa2, 0xdf, 0xda, 0x0c, 0x44, 0x1c, 0x7f, 0xf3, 0x39,
+    0x60, 0x8a, 0x28, 0xf6, 0xa6, 0x30, 0x61, 0x9a, 0xa0, 0xc9, 0xe2, 0x80,
+    0x5a, 0x2a, 0x89, 0x74, 0xbe, 0x24, 0x60, 0x4a, 0x23, 0x99, 0xcf, 0x25,
+    0x71, 0x74, 0xad, 0xd9, 0xf5, 0xeb, 0x18, 0xb6, 0xb5, 0xd6, 0xe3, 0xe8,
+    0x53, 0x40, 0x49, 0x65, 0x6e, 0x74, 0xf6, 0xfd, 0x03, 0x1e, 0x70, 0xcc,
+    0x88, 0x8d, 0xd5, 0xb1, 0x40, 0x06, 0x02, 0xa0, 0x61, 0x8c, 0xd7, 0xfc,
+    0x9e, 0xed, 0x2d, 0x08, 0xcc, 0x0e, 0xd8, 0xf4, 0xa3, 0xe9, 0x41, 0x30,
+    0x05, 0xc8, 0xbd, 0x3c, 0xa4, 0xb7, 0x09, 0x6f, 0x9c, 0xc8, 0xa2, 0xaa,
+    0xb4, 0x02, 0xf8, 0x4d, 0x28, 0x57, 0x09, 0x09, 0x6f, 0x38, 0x46, 0xf1,
+    0xe8, 0x18, 0xc8, 0x00, 0x30, 0x32, 0xd3, 0x62, 0xe6, 0x07, 0x91, 0x36,
+    0x37, 0xfe, 0x7e, 0x34, 0xa9, 0x60, 0x8d, 0x83, 0x51, 0x54, 0x68, 0xae,
+    0x0b, 0x8c, 0x47, 0xa2, 0x88, 0x85, 0x2a, 0x89, 0xad, 0xd3, 0x16, 0x5b,
+    0x20, 0x02, 0x4f, 0xc0, 0x04, 0x8e, 0x38, 0xde, 0xd8, 0x95, 0xfc, 0x97,
+    0xd9, 0xd2, 0x15, 0xdb, 0x1a, 0xcc, 0x69, 0x02, 0xad, 0x4a, 0x5a, 0x70,
+    0x8b, 0xbf, 0xfc, 0x35, 0x6d, 0x3a, 0x0f, 0xc9, 0xea, 0x78, 0x1a, 0xd1,
+    0xcb, 0xb7, 0xaa, 0xb8, 0xf2, 0x44, 0xdf, 0xb3, 0xfe, 0x24, 0x83, 0xb9,
+    0x53, 0x94, 0x7e, 0xa5, 0xc5, 0x3f, 0xa2, 0x31, 0x3d, 0xdc, 0x0b, 0xb1,
+    0x24, 0x2f, 0x99, 0x4a, 0xd4, 0x0e, 0x6b, 0x3a, 0x34, 0x31, 0xc5, 0x87,
+    0x68, 0xbd, 0x61, 0xbd, 0xe2, 0xa0, 0xdb, 0x9a, 0x33, 0xfd, 0xc5, 0x10,
+    0x3f, 0xfb, 0xeb, 0xbd, 0x29, 0x03, 0x85, 0x8d, 0x08, 0x7b, 0xb6, 0xf7,
+    0xf0, 0xf5, 0x13, 0x69, 0x3e, 0x35, 0x68, 0x58, 0x50, 0xdb, 0x50, 0x13,
+    0x02, 0x3e, 0x81, 0x4b, 0x44, 0x6c, 0x75, 0x02, 0xe6, 0x90, 0x75, 0x6c,
+    0xc6, 0x7c, 0x23, 0xec, 0x58, 0xfe, 0xbd, 0x15, 0x97, 0xe8, 0x29, 0x9f,
+    0x80, 0x54, 0x65, 0xb8, 0x3c, 0x40, 0xe6, 0xdb, 0xbe, 0x51, 0x73, 0xe5,
+    0xf1, 0x23, 0x02, 0x51, 0x1c, 0xce, 0x79, 0x2b, 0x8b, 0xa5, 0x6e, 0xcf,
+    0xaf, 0x58, 0xc5, 0xb5, 0xae, 0xb7, 0x1f, 0x42, 0x9a, 0x02, 0x4b, 0x2b,
+    0x73, 0xa7, 0xb7, 0xe8, 0x18, 0xf3, 0x86, 0x64, 0x44, 0x6e, 0xad, 0x8a,
+    0x00, 0x30, 0x15, 0x03, 0x0c, 0x66, 0xbf, 0xe4, 0xf7, 0x69, 0x68, 0x46,
+    0x60, 0x76, 0xc7, 0xa5, 0x1f, 0x4a, 0x09, 0x80, 0x2e, 0x45, 0xe9, 0xe5,
+    0x25, 0xb8, 0x4b, 0x7c, 0xe6, 0x45, 0x15, 0x55, 0xa0, 0x17, 0xc2, 0x69,
+    0x42, 0xb8, 0x48, 0x4b, 0x79, 0xc2, 0x37, 0x8f, 0x40, 0xc6, 0x40, 0x01,
+    0x81, 0x96, 0x9b, 0x17, 0x30, 0x3c, 0x89, 0xb1, 0xbf, 0xf3, 0xf1, 0xa5,
+    0x4b, 0x04, 0x6c, 0x1a, 0x8a, 0xa3, 0x45, 0x70, 0x5c, 0x62, 0x3d, 0x14,
+    0x44, 0x29, 0x54, 0x4d, 0x6e, 0x98, 0xb2, 0xd9, 0x00, 0x12, 0x7e, 0x00,
+    0x24, 0x71, 0xc6, 0xf6, 0xc4, 0xaf, 0xe4, 0xbe, 0xce, 0x90, 0xae, 0xd8,
+    0xd6, 0x63, 0x48, 0x15, 0x6a, 0x52, 0xd3, 0x84, 0x5d, 0xff, 0xe1, 0xab,
+    0x69, 0xd0, 0x7e, 0x4f, 0x53, 0xc0, 0xd6, 0x8e, 0x5d, 0xbd, 0x55, 0xc7,
+    0x92, 0x26, 0xfd, 0x9f, 0xf1, 0x24, 0x1d, 0xca, 0x9c, 0xa3, 0xf5, 0x2e,
+    0x29, 0xfd, 0x11, 0x89, 0xee, 0xe0, 0x5d, 0x89, 0x21, 0x7c, 0xca, 0x56,
+    0xa0, 0x73, 0x59, 0xd1, 0xa1, 0x8e, 0x2c, 0x3b, 0x45, 0xeb, 0x0d, 0xef,
+    0x15, 0x06, 0xdc, 0xd1, 0x9f, 0xee, 0x28, 0x81, 0xff, 0xdf, 0x5d, 0xe9,
+    0x48, 0x1c, 0x2c, 0x68, 0x43, 0xdd, 0xb7, 0xbf, 0x87, 0xa8, 0x9b, 0x49,
+    0xf1, 0xab, 0x42, 0xc2, 0x86, 0xda, 0x80, 0x98, 0x11, 0xf4, 0x0a, 0x5a,
+    0x23, 0x63, 0xa8, 0x17, 0x34, 0x83, 0xab, 0x66, 0x33, 0xe1, 0x1f, 0x62,
+    0xc7, 0xf5, 0xe8, 0xac, 0xbf, 0x41, 0x4c, 0xfc, 0x02, 0xa3, 0x2d, 0xc1,
+    0xe2, 0x07, 0x36, 0xdd, 0xf2, 0x8b, 0x9f, 0x2f, 0x89, 0x18, 0x12, 0x88,
+    0xe6, 0x73, 0xc9, 0x5c, 0x5d, 0x2b, 0x76, 0x7d, 0x7a, 0xc6, 0x2d, 0xad,
+    0x75, 0xb8, 0xfa, 0x14, 0xd0, 0x12, 0x59, 0x5b, 0x9d, 0x3d, 0xbf, 0x40,
+    0xc7, 0x9c, 0x33, 0x22, 0x23, 0x75, 0x6c, 0x50, 0x01, 0x80, 0xa8, 0x83,
+    0x06, 0xd4, 0xd6, 0x8d, 0x36, 0x78, 0xf9, 0x03, 0x23, 0xdb, 0x17, 0x90,
+    0x52, 0x0c, 0x5f, 0x1b, 0xe6, 0x44, 0x79, 0x52, 0xc5, 0x50, 0x17, 0x81,
+    0xf3, 0x1b, 0x88, 0xba, 0xfd, 0xbd, 0xa5, 0x51, 0x65, 0x6d, 0x33, 0x96,
+    0xc2, 0x71, 0x8d, 0x53, 0x1b, 0xab, 0xe9, 0xb9, 0xd0, 0x45, 0x61, 0xaf,
+    0xf9, 0xb7, 0x38, 0x55, 0x4f, 0xe9, 0x85, 0x1d, 0x4c, 0x0e, 0x40, 0x77,
+    0x03, 0xbc, 0x09, 0xd0, 0x37, 0xe3, 0xde, 0xf1, 0x0c, 0xa6, 0xc8, 0xd5,
+    0x63, 0x01, 0xfd, 0xe7, 0xc0, 0x9a, 0xe0, 0x98, 0x02, 0xe4, 0x5e, 0x9e,
+    0x52, 0x5b, 0x84, 0xb7, 0xce, 0x64, 0x51, 0x55, 0x5a, 0x01, 0x7c, 0x26,
+    0x94, 0x2b, 0x84, 0x84, 0xb7, 0x9c, 0x23, 0x78, 0xf4, 0x0c, 0x64, 0x00,
+    0x18, 0x19, 0x69, 0xb1, 0x73, 0x03, 0xc8, 0x9b, 0x1b, 0xff, 0x3f, 0x1a,
+    0x54, 0xb0, 0x46, 0xc1, 0xa8, 0xaa, 0x34, 0x57, 0x07, 0x13, 0xd3, 0x43,
+    0xb1, 0xaa, 0x4b, 0xc4, 0xcb, 0x5a, 0x9b, 0xa2, 0x23, 0x98, 0xa2, 0xd3,
+    0x2b, 0x8c, 0x7b, 0xf8, 0xc7, 0xaa, 0xf6, 0xcc, 0xb8, 0xfc, 0xb5, 0x77,
+    0xce, 0xff, 0x9d, 0x0e, 0xdb, 0x2b, 0x03, 0xc7, 0x42, 0x86, 0xf1, 0xcb,
+    0xa2, 0xa7, 0x85, 0x77, 0x58, 0x1a, 0x8f, 0x8c, 0xb4, 0x16, 0xf7, 0xe0,
+    0xe9, 0x8e, 0x54, 0x42, 0x2a, 0x1e, 0x10, 0xf6, 0xbc, 0x1a, 0x9f, 0xa1,
+    0xcb, 0xff, 0x13, 0x06, 0x88, 0x6b, 0xb1, 0xeb, 0x37, 0x26, 0xe5, 0x34,
+    0x0d, 0x73, 0x87, 0x91, 0x60, 0x6c, 0xd7, 0x2d, 0xc3, 0x5f, 0x40, 0x68,
+    0x45, 0x07, 0x6e, 0x62, 0xa9, 0xe3, 0x52, 0x75, 0xef, 0x14, 0xf5, 0x89,
+    0x0a, 0x3a, 0x57, 0x8b, 0xac, 0xbe, 0x86, 0x67, 0xd1, 0xd8, 0x35, 0xe5,
+    0xe7, 0x75, 0xb8, 0xf8, 0x28, 0x6d, 0xa8, 0x09, 0x81, 0x1f, 0x40, 0xa5,
+    0xa2, 0x36, 0x3a, 0x81, 0x73, 0x48, 0x3e, 0x8c, 0x9d, 0x1f, 0x78, 0xc5,
+    0x92, 0x36, 0x1a, 0xae, 0xdf, 0xda, 0xf8, 0x0a, 0x7e, 0x69, 0xcb, 0xaf,
+    0x74, 0x59, 0x49, 0x72, 0xa7, 0x97, 0x1c, 0x8c, 0xf0, 0x16, 0x01, 0x4a,
+    0xcc, 0x1a, 0xa1, 0x24, 0x83, 0x7b, 0x34, 0x65, 0x20, 0x51, 0x11, 0xae,
+    0x5d, 0xa7, 0x68, 0x9c, 0xec, 0x29, 0x27, 0xfc, 0x07, 0x49, 0xb4, 0x9b,
+    0x65, 0xb2, 0x51, 0x97, 0xae, 0xa5, 0x8a, 0x70, 0xe5, 0x53, 0xd3, 0xa2,
+    0x34, 0x35, 0xbd, 0xbf, 0x75, 0x64, 0xda, 0x88, 0x8c, 0xe9, 0xc3, 0x9a,
+    0x32, 0xf0, 0x5a, 0x96, 0xae, 0xef, 0x9a, 0xdd, 0x84, 0xc2, 0x97, 0x22,
+    0x2f, 0x06, 0x83, 0x32, 0x10, 0xff, 0x1d, 0x61, 0x60, 0x5f, 0x69, 0x10,
+    0x5d, 0x23, 0xc6, 0xf3, 0x3f, 0xa9, 0x53, 0xfe, 0xd0, 0x3e, 0x90, 0xe6,
+    0x54, 0x48, 0xab, 0x01, 0x76, 0x75, 0x88, 0x7b, 0x4e, 0xc6, 0xd0, 0x9b,
+    0x7a, 0xcd, 0x87, 0x36, 0x3e, 0x7e, 0x3d, 0xef, 0x10, 0xca, 0x6c, 0x8d,
+    0x56, 0x30, 0x1f, 0xde, 0x7c, 0x09, 0xae, 0x09, 0x80, 0x2e, 0x45, 0xe9,
+    0xe5, 0x25, 0xb8, 0x4b, 0x7c, 0xe6, 0x45, 0x15, 0x55, 0xa0, 0x17, 0xc2,
+    0x69, 0x42, 0xb8, 0x48, 0x4b, 0x79, 0xc2, 0x37, 0x8f, 0x40, 0xc6, 0x40,
+    0x01, 0x81, 0x96, 0x9b, 0x17, 0x30, 0x3c, 0x89, 0xb1, 0xbf, 0xf3, 0xf1,
+    0xa5, 0x5c, 0xdc, 0x1e, 0x69, 0xfc, 0xf1, 0xd8, 0x5d, 0xda, 0x13, 0x5b,
+    0xbc, 0x1f, 0x41, 0x4a, 0xde, 0x44, 0x3c, 0x5e, 0xbd, 0x46, 0xb7, 0xad,
+    0x32, 0xb8, 0xc7, 0xbf, 0x8c, 0x7a, 0xaf, 0x6c, 0xcb, 0x8f, 0xcb, 0x57,
+    0x7c, 0xef, 0xf9, 0xd0, 0xed, 0xb2, 0xb0, 0x3c, 0x74, 0x28, 0x6f, 0x1c,
+    0xba, 0x2a, 0x78, 0x57, 0x75, 0x81, 0xa8, 0xf8, 0xcb, 0x41, 0x6f, 0x7e,
+    0x0e, 0x98, 0xe5, 0x44, 0x22, 0xa2, 0x00, 0x6c, 0xba, 0xaf, 0x51, 0xcc,
+    0x9f, 0xba, 0x97, 0x39, 0xbb, 0x41, 0x60, 0xf0, 0xe9, 0xb7, 0xa7, 0xa0,
+    0x7b, 0x7a, 0xde, 0xc9, 0x22, 0x13, 0xf4, 0x04, 0xaf, 0x91, 0xf5, 0x37,
+    0x53, 0xad, 0x8d, 0x0d, 0x15, 0x7a, 0xf1, 0x81, 0x07, 0xd6, 0xa8, 0x80,
+    0x0c, 0x8d, 0x02, 0x79, 0x43, 0x50, 0x98, 0x27, 0xfc, 0xbc, 0xb7, 0x8f,
+    0xe0, 0xe6, 0x46, 0x6f, 0x25, 0xef, 0x2a, 0x04, 0xd1, 0xbe, 0x10, 0x3d,
+    0xb4, 0x43, 0x3e, 0xf7, 0xea, 0xf4, 0xb8, 0x24, 0xdc, 0x77, 0x4f, 0x52,
+    0x26, 0x55, 0xae, 0xbc, 0x6f, 0xe0, 0x8e, 0x41, 0x97, 0x82, 0xd4, 0xb5,
+    0x77, 0x7c, 0xd6, 0xec, 0x26, 0x14, 0xb9, 0x11, 0x78, 0x34, 0x19, 0x90,
+    0x87, 0xf8, 0xeb, 0x0b, 0x02, 0xfb, 0x48, 0x82, 0xe9, 0x1e, 0x37, 0x99,
+    0xfd, 0x4a, 0x9f, 0xf6, 0x81, 0xf4, 0x87, 0x32, 0xa2, 0x45, 0x58, 0x0b,
+    0xb3, 0xac, 0x43, 0xda, 0x76, 0x36, 0x84, 0xdb, 0xd6, 0x6c, 0x39, 0xb1,
+    0xf3, 0xf1, 0xef, 0x78, 0x86, 0x53, 0x64, 0x6a, 0xb1, 0x80, 0xfe, 0xf3,
+    0xe0, 0x4d, 0x70, 0x4c, 0x01, 0x72, 0x2f, 0x4f, 0x29, 0x2d, 0xc2, 0x5c,
+};
+static_assert(sizeof(kBytesTestReadSymbol16) == kNumBytesTestReadSymbol16, "");
diff --git a/src/utils/executor.cc b/src/utils/executor.cc
new file mode 100644
index 0000000..6934057
--- /dev/null
+++ b/src/utils/executor.cc
@@ -0,0 +1,21 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/executor.h"
+
+namespace libgav1 {
+
+Executor::~Executor() = default;
+
+}  // namespace libgav1
diff --git a/src/utils/executor.h b/src/utils/executor.h
new file mode 100644
index 0000000..21abdf8
--- /dev/null
+++ b/src/utils/executor.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_EXECUTOR_H_
+#define LIBGAV1_SRC_UTILS_EXECUTOR_H_
+
+#include <functional>
+
+namespace libgav1 {
+
+class Executor {
+ public:
+  virtual ~Executor();
+
+  // Schedules the specified "callback" for execution in this executor.
+  // Depending on the subclass implementation, this may block in some
+  // situations.
+  virtual void Schedule(std::function<void()> callback) = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_EXECUTOR_H_
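
For illustration, a minimal subclass could run each scheduled callback
synchronously on the calling thread. The InlineExecutor below is a
hypothetical sketch (not part of libgav1) showing only the contract that
Schedule() implies: take ownership of the callback and execute it, possibly
blocking.

    #include <functional>

    #include "src/utils/executor.h"

    namespace {

    // Hypothetical: executes callbacks immediately on the caller's thread.
    // Schedule() is documented as allowed to block, so this is conforming.
    class InlineExecutor : public libgav1::Executor {
     public:
      void Schedule(std::function<void()> callback) override { callback(); }
    };

    }  // namespace
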
diff --git a/src/utils/libgav1_utils.cmake b/src/utils/libgav1_utils.cmake
new file mode 100644
index 0000000..587ca5d
--- /dev/null
+++ b/src/utils/libgav1_utils.cmake
@@ -0,0 +1,70 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_)
+  return()
+endif() # LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_
+set(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_ 1)
+
+list(APPEND libgav1_utils_sources
+            "${libgav1_source}/utils/array_2d.h"
+            "${libgav1_source}/utils/bit_mask_set.h"
+            "${libgav1_source}/utils/bit_reader.cc"
+            "${libgav1_source}/utils/bit_reader.h"
+            "${libgav1_source}/utils/block_parameters_holder.cc"
+            "${libgav1_source}/utils/block_parameters_holder.h"
+            "${libgav1_source}/utils/blocking_counter.h"
+            "${libgav1_source}/utils/common.h"
+            "${libgav1_source}/utils/compiler_attributes.h"
+            "${libgav1_source}/utils/constants.cc"
+            "${libgav1_source}/utils/constants.h"
+            "${libgav1_source}/utils/cpu.cc"
+            "${libgav1_source}/utils/cpu.h"
+            "${libgav1_source}/utils/dynamic_buffer.h"
+            "${libgav1_source}/utils/entropy_decoder.cc"
+            "${libgav1_source}/utils/entropy_decoder.h"
+            "${libgav1_source}/utils/executor.cc"
+            "${libgav1_source}/utils/executor.h"
+            "${libgav1_source}/utils/logging.cc"
+            "${libgav1_source}/utils/logging.h"
+            "${libgav1_source}/utils/memory.h"
+            "${libgav1_source}/utils/queue.h"
+            "${libgav1_source}/utils/raw_bit_reader.cc"
+            "${libgav1_source}/utils/raw_bit_reader.h"
+            "${libgav1_source}/utils/reference_info.h"
+            "${libgav1_source}/utils/segmentation.cc"
+            "${libgav1_source}/utils/segmentation.h"
+            "${libgav1_source}/utils/segmentation_map.cc"
+            "${libgav1_source}/utils/segmentation_map.h"
+            "${libgav1_source}/utils/stack.h"
+            "${libgav1_source}/utils/threadpool.cc"
+            "${libgav1_source}/utils/threadpool.h"
+            "${libgav1_source}/utils/types.h"
+            "${libgav1_source}/utils/unbounded_queue.h"
+            "${libgav1_source}/utils/vector.h")
+
+macro(libgav1_add_utils_targets)
+  libgav1_add_library(NAME
+                      libgav1_utils
+                      TYPE
+                      OBJECT
+                      SOURCES
+                      ${libgav1_utils_sources}
+                      DEFINES
+                      ${libgav1_defines}
+                      INCLUDES
+                      ${libgav1_include_paths}
+                      ${libgav1_gtest_include_paths})
+
+endmacro()
diff --git a/src/utils/logging.cc b/src/utils/logging.cc
new file mode 100644
index 0000000..26e3e15
--- /dev/null
+++ b/src/utils/logging.cc
@@ -0,0 +1,65 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/logging.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <sstream>
+#include <thread>  // NOLINT (unapproved c++11 header)
+
+#if !defined(LIBGAV1_LOG_LEVEL)
+#define LIBGAV1_LOG_LEVEL (1 << 30)
+#endif
+
+namespace libgav1 {
+namespace internal {
+#if LIBGAV1_ENABLE_LOGGING
+namespace {
+
+const char* LogSeverityName(LogSeverity severity) {
+  switch (severity) {
+    case LogSeverity::kInfo:
+      return "INFO";
+    case LogSeverity::kError:
+      return "ERROR";
+    case LogSeverity::kWarning:
+      return "WARNING";
+  }
+  return "UNKNOWN";
+}
+
+}  // namespace
+
+void Log(LogSeverity severity, const char* file, int line, const char* format,
+         ...) {
+  if (LIBGAV1_LOG_LEVEL < static_cast<int>(severity)) return;
+  std::ostringstream ss;
+  ss << std::hex << std::this_thread::get_id();
+  fprintf(stderr, "%s %s %s:%d] ", LogSeverityName(severity), ss.str().c_str(),
+          file, line);
+
+  va_list ap;
+  va_start(ap, format);
+  vfprintf(stderr, format, ap);
+  va_end(ap);
+  fprintf(stderr, "\n");
+}
+#else   // !LIBGAV1_ENABLE_LOGGING
+void Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/,
+         const char* /*format*/, ...) {}
+#endif  // LIBGAV1_ENABLE_LOGGING
+
+}  // namespace internal
+}  // namespace libgav1
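
A worked example of the severity gate above: with kError = 0, kWarning = 1,
and kInfo = 2, a message is dropped when LIBGAV1_LOG_LEVEL is numerically
smaller than its severity, and the default level of 1 << 30 lets everything
through. Building with -DLIBGAV1_LOG_LEVEL=0 (an assumed build flag, shown
only to trace the arithmetic) would keep errors only:

    // LIBGAV1_LOG_LEVEL < static_cast<int>(severity), with the level at 0:
    //   0 < 0 (kError)   -> false -> printed.
    //   0 < 1 (kWarning) -> true  -> dropped.
    //   0 < 2 (kInfo)    -> true  -> dropped.
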
diff --git a/src/utils/logging.h b/src/utils/logging.h
new file mode 100644
index 0000000..473aebd
--- /dev/null
+++ b/src/utils/logging.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_LOGGING_H_
+#define LIBGAV1_SRC_UTILS_LOGGING_H_
+
+#include <cstddef>
+
+#include "src/utils/compiler_attributes.h"
+
+#if !defined(LIBGAV1_ENABLE_LOGGING)
+#if defined(NDEBUG) || defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_ENABLE_LOGGING
+// LIBGAV1_DLOG(severity, printf-format-string)
+// Debug logging that can optionally be enabled in release builds by explicitly
+// setting LIBGAV1_ENABLE_LOGGING.
+// Severity is given as an all-caps version of enum LogSeverity with the
+// leading 'k' removed: LIBGAV1_DLOG(INFO, "...");
+#define LIBGAV1_DLOG(severity, ...)                                     \
+  do {                                                                  \
+    constexpr const char* libgav1_logging_internal_basename =           \
+        libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
+    libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity,         \
+                           libgav1_logging_internal_basename, __LINE__, \
+                           __VA_ARGS__);                                \
+  } while (0)
+#else
+#define LIBGAV1_DLOG(severity, ...) \
+  do {                              \
+  } while (0)
+#endif  // LIBGAV1_ENABLE_LOGGING
+
+#define LIBGAV1_LOGGING_INTERNAL_ERROR libgav1::internal::LogSeverity::kError
+#define LIBGAV1_LOGGING_INTERNAL_WARNING \
+  libgav1::internal::LogSeverity::kWarning
+#define LIBGAV1_LOGGING_INTERNAL_INFO libgav1::internal::LogSeverity::kInfo
+
+namespace libgav1 {
+namespace internal {
+
+enum class LogSeverity : int {
+  kError,
+  kWarning,
+  kInfo,
+};
+
+// Helper function to implement LIBGAV1_DLOG
+// Logs |format, ...| at |severity| level, reporting it as called from
+// |file|:|line|.
+void Log(libgav1::internal::LogSeverity severity, const char* file, int line,
+         const char* format, ...) LIBGAV1_PRINTF_ATTRIBUTE(4, 5);
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+  return (offset == 0 || file_name[offset - 1] == '/' ||
+          file_name[offset - 1] == '\\')
+             ? file_name + offset
+             : Basename(file_name, offset - 1);
+}
+
+}  // namespace internal
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_LOGGING_H_
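
A usage sketch: the severity argument is token-pasted onto
LIBGAV1_LOGGING_INTERNAL_, so callers pass the bare name (ERROR, WARNING,
INFO), and Basename() trims the directory prefix from __FILE__ during
constant evaluation. The function below is hypothetical.

    #include "src/utils/logging.h"

    void HypotheticalParse(int code) {
      // Expands to a call to libgav1::internal::Log() with
      // LogSeverity::kError, the basename of this file, and __LINE__.
      LIBGAV1_DLOG(ERROR, "unexpected code %d", code);
    }

    // Basename() walks back from the end of the string to the last separator:
    static_assert(*libgav1::internal::Basename("a/b/c.cc", 8) == 'c', "");
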
diff --git a/src/utils/memory.h b/src/utils/memory.h
new file mode 100644
index 0000000..d1762a2
--- /dev/null
+++ b/src/utils/memory.h
@@ -0,0 +1,243 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_MEMORY_H_
+#define LIBGAV1_SRC_UTILS_MEMORY_H_
+
+#if defined(__ANDROID__) || defined(_MSC_VER) || defined(__MINGW32__)
+#include <malloc.h>
+#endif
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+
+namespace libgav1 {
+
+enum {
+// The byte alignment required for buffers used with SIMD code to be read or
+// written with aligned operations.
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
+    defined(_M_X64)
+  kMaxAlignment = 32,  // extended alignment is safe on x86.
+#else
+  kMaxAlignment = alignof(max_align_t),
+#endif
+};
+
+// AlignedAlloc, AlignedFree
+//
+// void* AlignedAlloc(size_t alignment, size_t size);
+//   Allocate aligned memory.
+//   |alignment| must be a power of 2.
+//   Unlike posix_memalign(), |alignment| may be smaller than sizeof(void*).
+//   Unlike aligned_alloc(), |size| does not need to be a multiple of
+//   |alignment|.
+//   The returned pointer should be freed by AlignedFree().
+//
+// void AlignedFree(void* aligned_memory);
+//   Free aligned memory.
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+  return _aligned_malloc(size, alignment);
+}
+
+inline void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
+
+#else  // !(defined(_MSC_VER) || defined(__MINGW32__))
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+#if defined(__ANDROID__)
+  // Although posix_memalign() was introduced in Android API level 17, it is
+  // more convenient to use memalign(). Unlike glibc, Android does not consider
+  // memalign() an obsolete function.
+  return memalign(alignment, size);
+#else   // !defined(__ANDROID__)
+  void* ptr = nullptr;
+  // posix_memalign requires that the requested alignment be at least
+  // sizeof(void*). In this case, fall back on malloc which should return
+  // memory aligned to at least the size of a pointer.
+  const size_t required_alignment = sizeof(void*);
+  if (alignment < required_alignment) return malloc(size);
+  const int error = posix_memalign(&ptr, alignment, size);
+  if (error != 0) {
+    errno = error;
+    return nullptr;
+  }
+  return ptr;
+#endif  // defined(__ANDROID__)
+}
+
+inline void AlignedFree(void* aligned_memory) { free(aligned_memory); }
+
+#endif  // defined(_MSC_VER) || defined(__MINGW32__)
+
+inline void Memset(uint8_t* const dst, int value, size_t count) {
+  memset(dst, value, count);
+}
+
+inline void Memset(uint16_t* const dst, int value, size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    dst[i] = static_cast<uint16_t>(value);
+  }
+}
+
+inline void Memset(int16_t* const dst, int value, size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    dst[i] = static_cast<int16_t>(value);
+  }
+}
+
+struct MallocDeleter {
+  void operator()(void* ptr) const { free(ptr); }
+};
+
+struct AlignedDeleter {
+  void operator()(void* ptr) const { AlignedFree(ptr); }
+};
+
+template <typename T>
+using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
+
+// Allocates aligned memory for an array of |count| elements of type T.
+template <typename T>
+inline AlignedUniquePtr<T> MakeAlignedUniquePtr(size_t alignment,
+                                                size_t count) {
+  return AlignedUniquePtr<T>(
+      static_cast<T*>(AlignedAlloc(alignment, count * sizeof(T))));
+}
+
+// A base class with custom new and delete operators. The exception-throwing
+// new operators are deleted. The "new (std::nothrow)" form must be used.
+//
+// The new operators return nullptr if the requested size is greater than
+// 0x40000000 bytes (1 GB). TODO(wtc): Make the maximum allocable memory size
+// a compile-time configuration macro.
+//
+// See https://en.cppreference.com/w/cpp/memory/new/operator_new and
+// https://en.cppreference.com/w/cpp/memory/new/operator_delete.
+//
+// NOTE: The allocation and deallocation functions are static member functions
+// whether the keyword 'static' is used or not.
+struct Allocable {
+  // Class-specific allocation functions.
+  static void* operator new(size_t size) = delete;
+  static void* operator new[](size_t size) = delete;
+
+  // Class-specific non-throwing allocation functions
+  static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+    return ::operator new(size, tag);
+  }
+  static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+    return ::operator new[](size, tag);
+  }
+
+  // Class-specific deallocation functions.
+  static void operator delete(void* ptr) noexcept { ::operator delete(ptr); }
+  static void operator delete[](void* ptr) noexcept {
+    ::operator delete[](ptr);
+  }
+
+  // Only called if new (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+    ::operator delete(ptr, tag);
+  }
+  // Only called if new[] (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+    ::operator delete[](ptr, tag);
+  }
+};
+
+// A variant of Allocable that forces allocations to be aligned to
+// kMaxAlignment bytes. This is intended for use with classes that use
+// alignas() with this value. C++17 aligned new/delete are used if available,
+// otherwise we use AlignedAlloc/Free.
+struct MaxAlignedAllocable {
+  // Class-specific allocation functions.
+  static void* operator new(size_t size) = delete;
+  static void* operator new[](size_t size) = delete;
+
+  // Class-specific non-throwing allocation functions
+  static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+    return ::operator new(size, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    return AlignedAlloc(kMaxAlignment, size);
+#endif
+  }
+  static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+    return ::operator new[](size, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    return AlignedAlloc(kMaxAlignment, size);
+#endif
+  }
+
+  // Class-specific deallocation functions.
+  static void operator delete(void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete(ptr, std::align_val_t(kMaxAlignment));
+#else
+    AlignedFree(ptr);
+#endif
+  }
+  static void operator delete[](void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete[](ptr, std::align_val_t(kMaxAlignment));
+#else
+    AlignedFree(ptr);
+#endif
+  }
+
+  // Only called if new (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete(ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    AlignedFree(ptr);
+#endif
+  }
+  // Only called if new[] (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete[](ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    AlignedFree(ptr);
+#endif
+  }
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_MEMORY_H_
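
A short sketch tying the pieces together (the struct name is hypothetical):
Allocable-derived objects must be created with new (std::nothrow) since the
throwing forms are deleted, and SIMD-facing buffers come from
MakeAlignedUniquePtr.

    #include <cstdint>
    #include <memory>
    #include <new>

    #include "src/utils/memory.h"

    struct HypotheticalFrame : public libgav1::Allocable {
      int width = 0;
      int height = 0;
    };

    bool Example() {
      // Returns nullptr on failure or when the request exceeds 0x40000000.
      std::unique_ptr<HypotheticalFrame> frame(new (std::nothrow)
                                                   HypotheticalFrame);
      if (frame == nullptr) return false;
      // 4096 bytes aligned for aligned SIMD loads and stores.
      auto buffer = libgav1::MakeAlignedUniquePtr<uint8_t>(
          libgav1::kMaxAlignment, 4096);
      return buffer != nullptr;
    }
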
diff --git a/src/utils/memory_test.cc b/src/utils/memory_test.cc
new file mode 100644
index 0000000..42f6a15
--- /dev/null
+++ b/src/utils/memory_test.cc
@@ -0,0 +1,184 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/memory.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "absl/base/config.h"
+#include "gtest/gtest.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr size_t kMaxAllocableSize = 0x40000000;
+
+struct Small : public Allocable {
+  uint8_t x;
+};
+
+struct Huge : public Allocable {
+  uint8_t x[kMaxAllocableSize + 1];
+};
+
+struct SmallMaxAligned : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x;
+};
+
+struct HugeMaxAligned : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1];
+};
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+struct ThrowingConstructor : public Allocable {
+  ThrowingConstructor() { throw std::exception(); }
+
+  uint8_t x;
+};
+
+struct MaxAlignedThrowingConstructor : public MaxAlignedAllocable {
+  MaxAlignedThrowingConstructor() { throw std::exception(); }
+
+  uint8_t x;
+};
+#endif
+
+TEST(MemoryTest, TestAlignedAllocFree) {
+  for (size_t alignment = 1; alignment <= 1 << 20; alignment <<= 1) {
+    void* p = AlignedAlloc(alignment, 1);
+    // Note this additional check is to avoid an incorrect static-analysis
+    // warning for leaked memory with a plain ASSERT_NE().
+    if (p == nullptr) {
+      FAIL() << "AlignedAlloc(" << alignment << ", 1)";
+    }
+    const auto p_value = reinterpret_cast<uintptr_t>(p);
+    EXPECT_EQ(p_value % alignment, 0)
+        << "AlignedAlloc(" << alignment << ", 1) = " << p;
+    AlignedFree(p);
+  }
+}
+
+TEST(MemoryTest, TestAlignedUniquePtrAlloc) {
+  for (size_t alignment = 1; alignment <= 1 << 20; alignment <<= 1) {
+    auto p = MakeAlignedUniquePtr<uint8_t>(alignment, 1);
+    ASSERT_NE(p, nullptr) << "MakeAlignedUniquePtr(" << alignment << ", 1)";
+    const auto p_value = reinterpret_cast<uintptr_t>(p.get());
+    EXPECT_EQ(p_value % alignment, 0)
+        << "MakeAlignedUniquePtr(" << alignment << ", 1) = " << p.get();
+  }
+}
+
+TEST(MemoryTest, TestAllocable) {
+  // Allocable::operator new (std::nothrow) is called.
+  std::unique_ptr<Small> small(new (std::nothrow) Small);
+  EXPECT_NE(small, nullptr);
+  // Allocable::operator delete is called.
+  small = nullptr;
+
+  // Allocable::operator new[] (std::nothrow) is called.
+  std::unique_ptr<Small[]> small_array_of_smalls(new (std::nothrow) Small[10]);
+  EXPECT_NE(small_array_of_smalls, nullptr);
+  // Allocable::operator delete[] is called.
+  small_array_of_smalls = nullptr;
+
+  // Allocable::operator new (std::nothrow) is called.
+  std::unique_ptr<Huge> huge(new (std::nothrow) Huge);
+  EXPECT_EQ(huge, nullptr);
+
+  // Allocable::operator new[] (std::nothrow) is called.
+  std::unique_ptr<Small[]> huge_array_of_smalls(
+      new (std::nothrow) Small[kMaxAllocableSize / sizeof(Small) + 1]);
+  EXPECT_EQ(huge_array_of_smalls, nullptr);
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+  try {
+    // Allocable::operator new (std::nothrow) is called.
+    // The constructor throws an exception.
+    // Allocable::operator delete (std::nothrow) is called.
+    ThrowingConstructor* always = new (std::nothrow) ThrowingConstructor;
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  try {
+    // Allocable::operator new[] (std::nothrow) is called.
+    // The constructor throws an exception.
+    // Allocable::operator delete[] (std::nothrow) is called.
+    ThrowingConstructor* always = new (std::nothrow) ThrowingConstructor[2];
+    static_cast<void>(always);
+  } catch (...) {
+  }
+#endif  // ABSL_HAVE_EXCEPTIONS
+}
+
+TEST(MemoryTest, TestMaxAlignedAllocable) {
+  // MaxAlignedAllocable::operator new (std::nothrow) is called.
+  std::unique_ptr<SmallMaxAligned> small(new (std::nothrow) SmallMaxAligned);
+  EXPECT_NE(small, nullptr);
+  // Note this check doesn't guarantee conformance as a suitably aligned
+  // address may be returned from any allocator.
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1), 0);
+  // MaxAlignedAllocable::operator delete is called.
+  small = nullptr;
+
+  // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+  std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls(
+      new (std::nothrow) SmallMaxAligned[10]);
+  EXPECT_NE(small_array_of_smalls, nullptr);
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) &
+                (kMaxAlignment - 1),
+            0);
+  // MaxAlignedAllocable::operator delete[] is called.
+  small_array_of_smalls = nullptr;
+
+  // MaxAlignedAllocable::operator new (std::nothrow) is called.
+  std::unique_ptr<HugeMaxAligned> huge(new (std::nothrow) HugeMaxAligned);
+  EXPECT_EQ(huge, nullptr);
+
+  // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+  std::unique_ptr<SmallMaxAligned[]> huge_array_of_smalls(
+      new (std::nothrow)
+          SmallMaxAligned[kMaxAllocableSize / sizeof(SmallMaxAligned) + 1]);
+  EXPECT_EQ(huge_array_of_smalls, nullptr);
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+  try {
+    // MaxAlignedAllocable::operator new (std::nothrow) is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete (std::nothrow) is called.
+    auto* always = new (std::nothrow) MaxAlignedThrowingConstructor;
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  try {
+    // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete[] (std::nothrow) is called.
+    auto* always = new (std::nothrow) MaxAlignedThrowingConstructor[2];
+    static_cast<void>(always);
+  } catch (...) {
+  }
+#endif  // ABSL_HAVE_EXCEPTIONS
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/queue.h b/src/utils/queue.h
new file mode 100644
index 0000000..fcc7bfe
--- /dev/null
+++ b/src/utils/queue.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// A FIFO queue of a fixed capacity.
+//
+// WARNING: No error checking is performed.
+template <typename T>
+class Queue {
+ public:
+  LIBGAV1_MUST_USE_RESULT bool Init(size_t capacity) {
+    elements_.reset(new (std::nothrow) T[capacity]);
+    if (elements_ == nullptr) return false;
+    capacity_ = capacity;
+    return true;
+  }
+
+  // Pushes the element |value| to the end of the queue. It is an error to call
+  // Push() when the queue is full.
+  void Push(T&& value) {
+    assert(size_ < capacity_);
+    elements_[end_++] = std::move(value);
+    if (end_ == capacity_) end_ = 0;
+    ++size_;
+  }
+
+  // Removes the element at the front of the queue. It is an error to call Pop()
+  // when the queue is empty.
+  void Pop() {
+    assert(size_ != 0);
+    const T element = std::move(elements_[begin_++]);
+    static_cast<void>(element);
+    if (begin_ == capacity_) begin_ = 0;
+    --size_;
+  }
+
+  // Returns a reference to the element at the front of the queue. It is an
+  // error to call Front() when the queue is empty.
+  T& Front() {
+    assert(size_ != 0);
+    return elements_[begin_];
+  }
+
+  // Returns a reference to the element at the back of the queue. It is an error
+  // to call Back() when the queue is empty.
+  T& Back() {
+    assert(size_ != 0);
+    const size_t back = ((end_ == 0) ? capacity_ : end_) - 1;
+    return elements_[back];
+  }
+
+  // Clears the queue.
+  void Clear() {
+    while (!Empty()) {
+      Pop();
+    }
+  }
+
+  // Returns true if the queue is empty.
+  bool Empty() const { return size_ == 0; }
+
+  // Returns true if the queue is full.
+  bool Full() const { return size_ >= capacity_; }
+
+  // Returns the number of elements in the queue.
+  size_t Size() const { return size_; }
+
+ private:
+  // An array of |capacity| elements. Used as a circular array.
+  std::unique_ptr<T[]> elements_;
+  size_t capacity_ = 0;
+  // The index of the element to be removed by Pop().
+  size_t begin_ = 0;
+  // The index where the new element is inserted by Push().
+  size_t end_ = 0;
+  size_t size_ = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_QUEUE_H_
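
A hedged usage sketch of the circular indexing: with capacity 4, the fourth
Push() wraps end_ back to 0, and Back() recovers the newest element via
((end_ == 0) ? capacity_ : end_) - 1.

    #include <utility>

    #include "src/utils/queue.h"

    bool Example() {
      libgav1::Queue<int> queue;
      if (!queue.Init(4)) return false;  // Allocation may fail.
      for (int i = 0; i < 4; ++i) {
        int value = i;
        queue.Push(std::move(value));
      }
      // end_ has wrapped to 0 and size_ == 4, so the queue is full.
      const bool ok = queue.Full() && queue.Front() == 0 && queue.Back() == 3;
      queue.Clear();
      return ok && queue.Empty();
    }
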
diff --git a/src/utils/queue_test.cc b/src/utils/queue_test.cc
new file mode 100644
index 0000000..d84ae5f
--- /dev/null
+++ b/src/utils/queue_test.cc
@@ -0,0 +1,86 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/queue.h"
+
+#include <utility>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+struct TestClass {
+  TestClass() = default;
+  explicit TestClass(int i) : i(i) {}
+  int i;
+  // The vector exists simply so that the class is not trivially copyable.
+  std::vector<int> dummy;
+};
+
+TEST(QueueTest, Basic) {
+  Queue<TestClass> queue;
+  ASSERT_TRUE(queue.Init(8));
+  EXPECT_TRUE(queue.Empty());
+
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_FALSE(queue.Full());
+    TestClass test(i);
+    queue.Push(std::move(test));
+    EXPECT_EQ(queue.Back().i, i);
+    EXPECT_FALSE(queue.Empty());
+  }
+  EXPECT_TRUE(queue.Full());
+
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_FALSE(queue.Empty());
+    EXPECT_EQ(queue.Front().i, i);
+    queue.Pop();
+    EXPECT_FALSE(queue.Full());
+  }
+  EXPECT_TRUE(queue.Empty());
+
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_FALSE(queue.Full());
+    TestClass test(i);
+    queue.Push(std::move(test));
+    EXPECT_EQ(queue.Back().i, i);
+    EXPECT_FALSE(queue.Empty());
+  }
+  EXPECT_TRUE(queue.Full());
+  queue.Clear();
+  EXPECT_TRUE(queue.Empty());
+  EXPECT_FALSE(queue.Full());
+}
+
+TEST(QueueTest, WrapAround) {
+  Queue<TestClass> queue;
+  ASSERT_TRUE(queue.Init(8));
+  EXPECT_TRUE(queue.Empty());
+
+  for (int i = 0; i < 100; ++i) {
+    EXPECT_FALSE(queue.Full());
+    TestClass test(i);
+    queue.Push(std::move(test));
+    EXPECT_EQ(queue.Back().i, i);
+    EXPECT_FALSE(queue.Empty());
+    EXPECT_EQ(queue.Front().i, i);
+    queue.Pop();
+    EXPECT_TRUE(queue.Empty());
+  }
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/raw_bit_reader.cc b/src/utils/raw_bit_reader.cc
new file mode 100644
index 0000000..15e980d
--- /dev/null
+++ b/src/utils/raw_bit_reader.cc
@@ -0,0 +1,224 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/raw_bit_reader.h"
+
+#include <cassert>
+#include <limits>
+
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+// Note <cinttypes> is only needed when logging is enabled (for the PRI*
+// macros). It depends on the definition of LIBGAV1_ENABLE_LOGGING from
+// logging.h, thus the non-standard header ordering.
+#if LIBGAV1_ENABLE_LOGGING
+#include <cinttypes>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr int kMaximumLeb128Size = 8;
+constexpr uint8_t kLeb128ValueByteMask = 0x7f;
+constexpr uint8_t kLeb128TerminationByteMask = 0x80;
+
+uint8_t Mod8(size_t n) {
+  // The low 3 bits of |n| are its value mod 8.
+  return n & 0x07;
+}
+
+size_t DivideBy8(size_t n, bool ceil) { return (n + (ceil ? 7 : 0)) >> 3; }
+
+}  // namespace
+
+RawBitReader::RawBitReader(const uint8_t* data, size_t size)
+    : data_(data), bit_offset_(0), size_(size) {
+  assert(data_ != nullptr || size_ == 0);
+}
+
+int RawBitReader::ReadBitImpl() {
+  const size_t byte_offset = DivideBy8(bit_offset_, false);
+  const uint8_t byte = data_[byte_offset];
+  const uint8_t shift = 7 - Mod8(bit_offset_);
+  ++bit_offset_;
+  return static_cast<int>((byte >> shift) & 0x01);
+}
+
+int RawBitReader::ReadBit() {
+  if (Finished()) return -1;
+  return ReadBitImpl();
+}
+
+int64_t RawBitReader::ReadLiteral(int num_bits) {
+  assert(num_bits <= 32);
+  if (!CanReadLiteral(num_bits)) return -1;
+  assert(num_bits > 0);
+  uint32_t literal = 0;
+  int bit = num_bits - 1;
+  do {
+    // ARM can combine a shift operation with a constant number of bits with
+    // some other operations, such as the OR operation.
+    // Here is an ARM disassembly example:
+    // orr w1, w0, w1, lsl #1
+    // which left shifts register w1 by 1 bit and OR the shift result with
+    // register w0.
+    // The next 2 lines are equivalent to:
+    // literal |= static_cast<uint32_t>(ReadBitImpl()) << bit;
+    literal <<= 1;
+    literal |= static_cast<uint32_t>(ReadBitImpl());
+  } while (--bit >= 0);
+  return literal;
+}
+
+bool RawBitReader::ReadInverseSignedLiteral(int num_bits, int* const value) {
+  assert(num_bits + 1 < 32);
+  *value = static_cast<int>(ReadLiteral(num_bits + 1));
+  if (*value == -1) return false;
+  const int sign_bit = 1 << num_bits;
+  if ((*value & sign_bit) != 0) {
+    *value -= 2 * sign_bit;
+  }
+  return true;
+}
+
+bool RawBitReader::ReadLittleEndian(int num_bytes, size_t* const value) {
+  // We must be at a byte boundary.
+  assert(Mod8(bit_offset_) == 0);
+  assert(num_bytes <= 4);
+  static_assert(sizeof(size_t) >= 4, "");
+  if (value == nullptr) return false;
+  size_t byte_offset = DivideBy8(bit_offset_, false);
+  if (Finished() || byte_offset + num_bytes > size_) {
+    LIBGAV1_DLOG(ERROR, "Not enough bits to read Little Endian value.");
+    return false;
+  }
+  *value = 0;
+  for (int i = 0; i < num_bytes; ++i) {
+    const size_t byte = data_[byte_offset];
+    *value |= (byte << (i * 8));
+    ++byte_offset;
+  }
+  bit_offset_ = byte_offset * 8;
+  return true;
+}
+
+bool RawBitReader::ReadUnsignedLeb128(size_t* const value) {
+  // We must be at a byte boundary.
+  assert(Mod8(bit_offset_) == 0);
+  if (value == nullptr) return false;
+  uint64_t value64 = 0;
+  for (int i = 0; i < kMaximumLeb128Size; ++i) {
+    if (Finished()) {
+      LIBGAV1_DLOG(ERROR, "Not enough bits to read LEB128 value.");
+      return false;
+    }
+    const size_t byte_offset = DivideBy8(bit_offset_, false);
+    const uint8_t byte = data_[byte_offset];
+    bit_offset_ += 8;
+    value64 |= static_cast<uint64_t>(byte & kLeb128ValueByteMask) << (i * 7);
+    if ((byte & kLeb128TerminationByteMask) == 0) {
+      if (value64 != static_cast<size_t>(value64) ||
+          value64 > std::numeric_limits<uint32_t>::max()) {
+        LIBGAV1_DLOG(
+            ERROR, "LEB128 value (%" PRIu64 ") exceeded uint32_t maximum (%u).",
+            value64, std::numeric_limits<uint32_t>::max());
+        return false;
+      }
+      *value = static_cast<size_t>(value64);
+      return true;
+    }
+  }
+  LIBGAV1_DLOG(
+      ERROR,
+      "Exceeded kMaximumLeb128Size (%d) when trying to read LEB128 value",
+      kMaximumLeb128Size);
+  return false;
+}
+
+bool RawBitReader::ReadUvlc(uint32_t* const value) {
+  if (value == nullptr) return false;
+  int leading_zeros = 0;
+  while (true) {
+    const int bit = ReadBit();
+    if (bit == -1) {
+      LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value.");
+      return false;
+    }
+    if (bit == 1) break;
+    ++leading_zeros;
+    if (leading_zeros == 32) {
+      LIBGAV1_DLOG(ERROR,
+                   "Exceeded maximum size (32) when trying to read uvlc value");
+      return false;
+    }
+  }
+  int literal;
+  if (leading_zeros != 0) {
+    literal = static_cast<int>(ReadLiteral(leading_zeros));
+    if (literal == -1) {
+      LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value.");
+      return false;
+    }
+    literal += (1U << leading_zeros) - 1;
+  } else {
+    literal = 0;
+  }
+  *value = literal;
+  return true;
+}
+
+bool RawBitReader::AlignToNextByte() {
+  while ((bit_offset_ & 7) != 0) {
+    if (ReadBit() != 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool RawBitReader::VerifyAndSkipTrailingBits(size_t num_bits) {
+  if (ReadBit() != 1) return false;
+  for (size_t i = 0; i < num_bits - 1; ++i) {
+    if (ReadBit() != 0) return false;
+  }
+  return true;
+}
+
+bool RawBitReader::SkipBytes(size_t num_bytes) {
+  // If we are not at a byte boundary, return false.
+  return ((bit_offset_ & 7) != 0) ? false : SkipBits(num_bytes * 8);
+}
+
+bool RawBitReader::SkipBits(size_t num_bits) {
+  // If the reader is already finished, return false.
+  if (Finished()) return false;
+  // If skipping |num_bits| runs out of buffer, return false.
+  const size_t bit_offset = bit_offset_ + num_bits - 1;
+  if (DivideBy8(bit_offset, false) >= size_) return false;
+  bit_offset_ += num_bits;
+  return true;
+}
+
+bool RawBitReader::CanReadLiteral(size_t num_bits) const {
+  if (Finished()) return false;
+  const size_t bit_offset = bit_offset_ + num_bits - 1;
+  return DivideBy8(bit_offset, false) < size_;
+}
+
+bool RawBitReader::Finished() const {
+  return DivideBy8(bit_offset_, false) >= size_;
+}
+
+}  // namespace libgav1
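
Two worked examples for the variable-length decoders above, traced by hand
against the code:

    // ReadUnsignedLeb128 on the bytes 0xe5 0x8e 0x26:
    //   0xe5: continuation bit set,   low 7 bits 0x65 << 0  =    101
    //   0x8e: continuation bit set,   low 7 bits 0x0e << 7  =   1792
    //   0x26: continuation bit clear, low 7 bits 0x26 << 14 = 622592
    //   *value = 101 + 1792 + 622592 = 624485.
    //
    // ReadUvlc on the bits 0 0 1 1 0:
    //   two leading zeros, a terminating 1, then the 2-bit literal 0b10 = 2;
    //   *value = 2 + (1 << 2) - 1 = 5.
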
diff --git a/src/utils/raw_bit_reader.h b/src/utils/raw_bit_reader.h
new file mode 100644
index 0000000..da770d1
--- /dev/null
+++ b/src/utils/raw_bit_reader.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
+#define LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_reader.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+class RawBitReader final : public BitReader, public Allocable {
+ public:
+  RawBitReader(const uint8_t* data, size_t size);
+  ~RawBitReader() override = default;
+
+  int ReadBit() override;
+  int64_t ReadLiteral(int num_bits) override;  // f(n) in the spec.
+  bool ReadInverseSignedLiteral(int num_bits,
+                                int* value);  // su(1+num_bits) in the spec.
+  bool ReadLittleEndian(int num_bytes,
+                        size_t* value);    // le(n) in the spec.
+  bool ReadUnsignedLeb128(size_t* value);  // leb128() in the spec.
+  // Reads a variable length unsigned number and stores it in |*value|. On a
+  // successful return, |*value| is in the range of 0 to UINT32_MAX - 1,
+  // inclusive.
+  bool ReadUvlc(uint32_t* value);  // uvlc() in the spec.
+  bool Finished() const;
+  size_t bit_offset() const { return bit_offset_; }
+  // Return the bytes consumed so far (rounded up).
+  size_t byte_offset() const { return (bit_offset() + 7) >> 3; }
+  size_t size() const { return size_; }
+  // Move to the next byte boundary if not already at one. Return false if any
+  // of the bits being skipped over is non-zero. Return true otherwise. If this
+  // function returns false, the reader is left in an undefined state and must
+  // not be used further. section 5.3.5.
+  bool AlignToNextByte();
+  // Make sure that the trailing bits structure is as expected and skip over it.
+  // section 5.3.4.
+  bool VerifyAndSkipTrailingBits(size_t num_bits);
+  // Skip |num_bytes| bytes. This only works if the current position is at a
+  // byte boundary. The function returns false if the current position is not at
+  // a byte boundary or if skipping |num_bytes| causes the reader to run out of
+  // buffer. Returns true otherwise.
+  bool SkipBytes(size_t num_bytes);
+  // Skip |num_bits| bits. The function returns false if skipping |num_bits|
+  // causes the reader to run out of buffer. Returns true otherwise.
+  bool SkipBits(size_t num_bits);
+
+ private:
+  // Returns true if it is safe to read a literal of size |num_bits|.
+  bool CanReadLiteral(size_t num_bits) const;
+  int ReadBitImpl();
+
+  const uint8_t* const data_;
+  size_t bit_offset_;
+  const size_t size_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
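
A worked su() example, matching the ReadInverseSignedLiteral test data in the
next file (first byte 0xd2):

    // ReadInverseSignedLiteral(6, &value) reads 6 + 1 = 7 bits:
    //   first 7 bits of 0xd2 = 1101001 (binary) = 105
    //   sign_bit = 1 << 6 = 64; (105 & 64) != 0, so
    //   value = 105 - 2 * 64 = -23.
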
diff --git a/src/utils/raw_bit_reader_test.cc b/src/utils/raw_bit_reader_test.cc
new file mode 100644
index 0000000..22a97a7
--- /dev/null
+++ b/src/utils/raw_bit_reader_test.cc
@@ -0,0 +1,580 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/raw_bit_reader.h"
+
+#include <bitset>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+namespace libgav1 {
+namespace {
+
+std::string IntegerToString(int x) { return std::bitset<8>(x).to_string(); }
+
+class RawBitReaderTest : public testing::TestWithParam<std::tuple<int, int>> {
+ protected:
+  RawBitReaderTest()
+      : literal_size_(std::get<0>(GetParam())),
+        test_data_size_(std::get<1>(GetParam())) {}
+
+  void CreateReader(const std::vector<uint8_t>& data) {
+    data_ = data;
+    raw_bit_reader_.reset(new (std::nothrow)
+                              RawBitReader(data_.data(), data_.size()));
+  }
+
+  void CreateReader(int size) {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    data_.clear();
+    for (int i = 0; i < size; ++i) {
+      data_.push_back(rnd.Rand8());
+    }
+    raw_bit_reader_.reset(new (std::nothrow)
+                              RawBitReader(data_.data(), data_.size()));
+  }
+
+  // Some tests don't depend on |literal_size_|. For those tests, return true
+  // if |literal_size_| is greater than 1 so that they run only once rather
+  // than once per parameter combination; the test then returns early.
+  bool RunOnlyOnce() const { return literal_size_ > 1; }
+
+  std::unique_ptr<RawBitReader> raw_bit_reader_;
+  std::vector<uint8_t> data_;
+  int literal_size_;
+  int test_data_size_;
+};
+
+TEST_P(RawBitReaderTest, ReadBit) {
+  if (RunOnlyOnce()) return;
+  CreateReader(test_data_size_);
+  for (const auto& value : data_) {
+    const std::string expected = IntegerToString(value);
+    for (int j = 0; j < 8; ++j) {
+      EXPECT_FALSE(raw_bit_reader_->Finished());
+      EXPECT_EQ(static_cast<int>(expected[j] == '1'),
+                raw_bit_reader_->ReadBit());
+    }
+  }
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+  EXPECT_EQ(raw_bit_reader_->ReadBit(), -1);
+}
+
+TEST_P(RawBitReaderTest, ReadLiteral) {
+  const int size_bytes = literal_size_;
+  const int size_bits = 8 * size_bytes;
+  CreateReader(test_data_size_ * size_bytes);
+  for (size_t i = 0; i < data_.size(); i += size_bytes) {
+    uint32_t expected_literal = 0;
+    for (int j = 0; j < size_bytes; ++j) {
+      expected_literal |=
+          static_cast<uint32_t>(data_[i + j] << (8 * (size_bytes - j - 1)));
+    }
+    EXPECT_FALSE(raw_bit_reader_->Finished());
+    const int64_t actual_literal = raw_bit_reader_->ReadLiteral(size_bits);
+    EXPECT_EQ(static_cast<int64_t>(expected_literal), actual_literal);
+    EXPECT_GE(actual_literal, 0);
+  }
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+  EXPECT_EQ(raw_bit_reader_->ReadLiteral(10), -1);
+}
+
+TEST_P(RawBitReaderTest, ReadLiteral32BitsWithMsbSet) {
+  if (RunOnlyOnce()) return;
+  // Three 32-bit values with MSB set.
+  CreateReader({0xff, 0xff, 0xff, 0xff,    // 4294967295
+                0x80, 0xff, 0xee, 0xdd,    // 2164256477
+                0xa0, 0xaa, 0xbb, 0xcc});  // 2695543756
+  static constexpr int64_t expected_literals[] = {4294967295, 2164256477,
+                                                  2695543756};
+  for (const int64_t expected_literal : expected_literals) {
+    EXPECT_FALSE(raw_bit_reader_->Finished());
+    const int64_t actual_literal = raw_bit_reader_->ReadLiteral(32);
+    EXPECT_EQ(expected_literal, actual_literal);
+    EXPECT_GE(actual_literal, 0);
+  }
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+  EXPECT_EQ(raw_bit_reader_->ReadLiteral(10), -1);
+}
+
+TEST_P(RawBitReaderTest, ReadLiteralNotEnoughBits) {
+  if (RunOnlyOnce()) return;
+  CreateReader(4);  // 32 bits.
+  EXPECT_GE(raw_bit_reader_->ReadLiteral(16), 0);
+  EXPECT_EQ(raw_bit_reader_->ReadLiteral(32), -1);
+}
+
+TEST_P(RawBitReaderTest, ReadLiteralMaxNumBits) {
+  if (RunOnlyOnce()) return;
+  CreateReader(4);  // 32 bits.
+  EXPECT_NE(raw_bit_reader_->ReadLiteral(32), -1);
+}
+
+TEST_P(RawBitReaderTest, ReadInverseSignedLiteral) {
+  if (RunOnlyOnce()) return;
+  // This is the only usage of this function in the decoding process, so only
+  // that case is tested.
+  const int size_bits = 6;
+  data_.clear();
+  // Negative value followed by a positive value.
+  data_.push_back(0xd2);
+  data_.push_back(0xa4);
+  raw_bit_reader_.reset(new (std::nothrow)
+                            RawBitReader(data_.data(), data_.size()));
+  int value;
+  EXPECT_TRUE(raw_bit_reader_->ReadInverseSignedLiteral(size_bits, &value));
+  EXPECT_EQ(value, -23);
+  EXPECT_TRUE(raw_bit_reader_->ReadInverseSignedLiteral(size_bits, &value));
+  EXPECT_EQ(value, 41);
+  // We have only two bits left. Trying to read an inverse signed literal of 2
+  // bits actually needs 3 bits. So this should fail.
+  EXPECT_FALSE(raw_bit_reader_->ReadInverseSignedLiteral(2, &value));
+}
+
+TEST_P(RawBitReaderTest, ZeroSize) {
+  if (RunOnlyOnce()) return;
+  // Valid data, zero size.
+  data_.clear();
+  data_.push_back(0xf0);
+  raw_bit_reader_.reset(new (std::nothrow) RawBitReader(data_.data(), 0));
+  EXPECT_EQ(raw_bit_reader_->ReadBit(), -1);
+  EXPECT_EQ(raw_bit_reader_->ReadLiteral(2), -1);
+  // NULL data, zero size.
+  raw_bit_reader_.reset(new (std::nothrow) RawBitReader(nullptr, 0));
+  EXPECT_EQ(raw_bit_reader_->ReadBit(), -1);
+  EXPECT_EQ(raw_bit_reader_->ReadLiteral(2), -1);
+}
+
+TEST_P(RawBitReaderTest, AlignToNextByte) {
+  if (RunOnlyOnce()) return;
+  CreateReader({0x00, 0x00, 0x00, 0x0f});
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 0);
+  EXPECT_EQ(raw_bit_reader_->byte_offset(), 0);
+  EXPECT_TRUE(raw_bit_reader_->AlignToNextByte());
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 0);
+  EXPECT_EQ(raw_bit_reader_->byte_offset(), 0);
+  EXPECT_NE(raw_bit_reader_->ReadBit(), -1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 1);
+  EXPECT_EQ(raw_bit_reader_->byte_offset(), 1);
+  EXPECT_TRUE(raw_bit_reader_->AlignToNextByte());
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+  EXPECT_EQ(raw_bit_reader_->byte_offset(), 1);
+  EXPECT_NE(raw_bit_reader_->ReadLiteral(16), -1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 24);
+  EXPECT_EQ(raw_bit_reader_->byte_offset(), 3);
+  EXPECT_TRUE(raw_bit_reader_->AlignToNextByte());
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 24);
+  EXPECT_EQ(raw_bit_reader_->byte_offset(), 3);
+  EXPECT_NE(raw_bit_reader_->ReadBit(), -1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 25);
+  EXPECT_EQ(raw_bit_reader_->byte_offset(), 4);
+  // Some bits are non-zero.
+  EXPECT_FALSE(raw_bit_reader_->AlignToNextByte());
+}
+
+TEST_P(RawBitReaderTest, VerifyAndSkipTrailingBits) {
+  if (RunOnlyOnce()) return;
+  std::vector<uint8_t> data;
+
+  // 1 byte trailing byte.
+  data.push_back(0x80);
+  CreateReader(data);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 0);
+  EXPECT_TRUE(raw_bit_reader_->VerifyAndSkipTrailingBits(8));
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+
+  // 2 byte trailing byte beginning at a byte-aligned offset.
+  data.clear();
+  data.push_back(0xf8);
+  data.push_back(0x80);
+  CreateReader(data);
+  EXPECT_NE(raw_bit_reader_->ReadLiteral(8), -1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+  EXPECT_TRUE(raw_bit_reader_->VerifyAndSkipTrailingBits(8));
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+
+  // Two-byte input with trailing bits beginning at a non-byte-aligned offset.
+  data.clear();
+  data.push_back(0xf8);
+  data.push_back(0x00);
+  CreateReader(data);
+  EXPECT_NE(raw_bit_reader_->ReadLiteral(4), -1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 4);
+  EXPECT_TRUE(raw_bit_reader_->VerifyAndSkipTrailingBits(4));
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+
+  // Invalid trailing byte at a byte-aligned offset.
+  data.clear();
+  data.push_back(0xf7);
+  data.push_back(0x70);
+  CreateReader(data);
+  EXPECT_NE(raw_bit_reader_->ReadLiteral(8), -1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+  EXPECT_FALSE(raw_bit_reader_->VerifyAndSkipTrailingBits(8));
+
+  // Invalid trailing byte at a non-byte-aligned offset.
+  CreateReader(data);
+  EXPECT_NE(raw_bit_reader_->ReadLiteral(4), -1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 4);
+  EXPECT_FALSE(raw_bit_reader_->VerifyAndSkipTrailingBits(12));
+
+  // No more data available.
+  CreateReader(data);
+  EXPECT_NE(raw_bit_reader_->ReadLiteral(16), -1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+  EXPECT_FALSE(raw_bit_reader_->VerifyAndSkipTrailingBits(8));
+}
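+
+// A minimal model of the trailing bits format that the expectations above
+// assume (illustrative sketch only): a single 1 bit (trailing_one_bit)
+// followed by 0 bits (trailing_zero_bit), |num_bits| bits in total.
+static bool VerifyTrailingBitsSketch(RawBitReader* reader, int num_bits) {
+  if (reader->ReadBit() != 1) return false;  // trailing_one_bit.
+  for (int i = 1; i < num_bits; ++i) {
+    if (reader->ReadBit() != 0) return false;  // trailing_zero_bit.
+  }
+  return true;
+}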
+
+TEST_P(RawBitReaderTest, ReadLittleEndian) {
+  if (RunOnlyOnce()) return;
+  std::vector<uint8_t> data;
+  size_t actual;
+
+  // Invalid input.
+  data.push_back(0x00);  // dummy.
+  CreateReader(data);
+  EXPECT_FALSE(raw_bit_reader_->ReadLittleEndian(1, nullptr));
+
+  // One byte value.
+  data.clear();
+  data.push_back(0x01);
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(1, &actual));
+  EXPECT_EQ(actual, 1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+
+  // One byte value padded with high-order zero bytes.
+  data.clear();
+  data.push_back(0x01);
+  data.push_back(0x00);
+  data.push_back(0x00);
+  data.push_back(0x00);
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(4, &actual));
+  EXPECT_EQ(actual, 1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 32);
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+
+  // Two byte value.
+  data.clear();
+  data.push_back(0xD9);
+  data.push_back(0x01);
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(2, &actual));
+  EXPECT_EQ(actual, 473);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+
+  // Two byte value padded with high-order zero bytes.
+  data.clear();
+  data.push_back(0xD9);
+  data.push_back(0x01);
+  data.push_back(0x00);
+  data.push_back(0x00);
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(4, &actual));
+  EXPECT_EQ(actual, 473);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 32);
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+
+  // Not enough bytes.
+  data.clear();
+  data.push_back(0x01);
+  CreateReader(data);
+  EXPECT_FALSE(raw_bit_reader_->ReadLittleEndian(2, &actual));
+}
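+
+// A sketch of the little endian accumulation that the values above assume
+// (illustrative only): byte i contributes data[i] << (8 * i), so {0xD9, 0x01}
+// yields 0xD9 + (0x01 << 8) = 473.
+static size_t LittleEndianSketch(const uint8_t* data, int num_bytes) {
+  size_t value = 0;
+  for (int i = 0; i < num_bytes; ++i) {
+    value |= static_cast<size_t>(data[i]) << (8 * i);
+  }
+  return value;
+}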
+
+TEST_P(RawBitReaderTest, ReadUnsignedLeb128) {
+  if (RunOnlyOnce()) return;
+  std::vector<uint8_t> data;
+  size_t actual;
+
+  // Invalid input.
+  data.push_back(0x00);  // dummy.
+  CreateReader(data);
+  EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(nullptr));
+
+  // One byte value.
+  data.clear();
+  data.push_back(0x01);
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+  EXPECT_EQ(actual, 1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+
+  // One byte value with trailing bytes.
+  data.clear();
+  data.push_back(0x81);
+  data.push_back(0x80);
+  data.push_back(0x80);
+  data.push_back(0x00);
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+  EXPECT_EQ(actual, 1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 32);
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+
+  // Two byte value.
+  data.clear();
+  data.push_back(0xD9);
+  data.push_back(0x01);
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+  EXPECT_EQ(actual, 217);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+
+  // Two byte value with trailing bytes.
+  data.clear();
+  data.push_back(0xD9);
+  data.push_back(0x81);
+  data.push_back(0x80);
+  data.push_back(0x80);
+  data.push_back(0x00);
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+  EXPECT_EQ(actual, 217);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 40);
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+
+  // Value > 32 bits.
+  data.clear();
+  for (int i = 0; i < 5; ++i) data.push_back(0xD9);
+  data.push_back(0x00);
+  CreateReader(data);
+  EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+
+  // Not enough bytes (truncated leb128 value).
+  data.clear();
+  data.push_back(0x81);
+  data.push_back(0x81);
+  data.push_back(0x81);
+  CreateReader(data);
+  EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+
+  // Exceeds kMaximumLeb128Size.
+  data.clear();
+  for (int i = 0; i < 10; ++i) data.push_back(0x80);
+  CreateReader(data);
+  EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+}
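+
+// A sketch of the leb128 decoding that the expectations above assume
+// (illustrative only): each byte contributes its low 7 bits, 0x80 marks
+// continuation, at most 8 bytes (kMaximumLeb128Size) are allowed, and the
+// decoded value must fit in 32 bits. For example {0xD9, 0x01} yields
+// 0x59 | (0x01 << 7) = 217.
+static bool Leb128Sketch(const uint8_t* data, size_t size, size_t* value) {
+  uint64_t result = 0;
+  for (size_t i = 0; i < 8; ++i) {  // 8 is kMaximumLeb128Size.
+    if (i >= size) return false;    // Truncated value.
+    result |= static_cast<uint64_t>(data[i] & 0x7f) << (7 * i);
+    if ((data[i] & 0x80) == 0) {    // No continuation bit: done.
+      if (result > UINT32_MAX) return false;  // Must fit in 32 bits.
+      *value = static_cast<size_t>(result);
+      return true;
+    }
+  }
+  return false;  // Exceeds kMaximumLeb128Size.
+}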
+
+TEST_P(RawBitReaderTest, ReadUvlc) {
+  if (RunOnlyOnce()) return;
+  std::vector<uint8_t> data;
+  uint32_t actual;
+
+  // Invalid input.
+  data.push_back(0x00);  // dummy.
+  CreateReader(data);
+  EXPECT_FALSE(raw_bit_reader_->ReadUvlc(nullptr));
+
+  // Zero bit value.
+  data.clear();
+  data.push_back(0x80);
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual));
+  EXPECT_EQ(actual, 0);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 1);
+
+  // One bit value.
+  data.clear();
+  data.push_back(0x60);  // 011...
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual));
+  EXPECT_EQ(actual, 2);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 3);
+
+  // Two bit value.
+  data.clear();
+  data.push_back(0x38);  // 00111...
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual));
+  EXPECT_EQ(actual, 6);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 5);
+
+  // 31 bit value.
+  data.clear();
+  // (1 << 32) - 2 (= UINT32_MAX - 1) is the largest value that can be encoded
+  // as uvlc().
+  data.push_back(0x00);
+  data.push_back(0x00);
+  data.push_back(0x00);
+  data.push_back(0x01);
+  data.push_back(0xFF);
+  data.push_back(0xFF);
+  data.push_back(0xFF);
+  data.push_back(0xFE);
+  CreateReader(data);
+  ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual));
+  EXPECT_EQ(actual, UINT32_MAX - 1);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 63);
+
+  // Not enough bits (truncated uvlc value).
+  data.clear();
+  data.push_back(0x07);
+  CreateReader(data);
+  EXPECT_FALSE(raw_bit_reader_->ReadUvlc(&actual));
+
+  // 32 bits.
+  data.clear();
+  data.push_back(0x00);
+  data.push_back(0x00);
+  data.push_back(0x00);
+  data.push_back(0x00);
+  data.push_back(0xFF);
+  CreateReader(data);
+  EXPECT_FALSE(raw_bit_reader_->ReadUvlc(&actual));
+
+  // Exceeds 32 bits.
+  data.clear();
+  data.push_back(0x00);
+  data.push_back(0x00);
+  data.push_back(0x00);
+  data.push_back(0x00);
+  data.push_back(0x0F);
+  CreateReader(data);
+  EXPECT_FALSE(raw_bit_reader_->ReadUvlc(&actual));
+}
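+
+// A sketch of the uvlc decoding that the expectations above assume
+// (illustrative only): count leading zero bits up to the first 1 bit, then
+// the value is (1 << leading_zeros) - 1 plus a leading_zeros-bit literal. For
+// example 0x60 (011...) decodes as (1 << 1) - 1 + 1 = 2. 32 or more leading
+// zeros are treated as an error.
+static bool UvlcSketch(RawBitReader* reader, uint32_t* value) {
+  int leading_zeros = 0;
+  int bit;
+  while ((bit = reader->ReadBit()) == 0) ++leading_zeros;
+  if (bit == -1 || leading_zeros >= 32) return false;
+  if (leading_zeros == 0) {
+    *value = 0;
+    return true;
+  }
+  const int64_t literal = reader->ReadLiteral(leading_zeros);
+  if (literal == -1) return false;
+  *value = static_cast<uint32_t>((uint64_t{1} << leading_zeros) - 1 + literal);
+  return true;
+}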
+
+TEST_P(RawBitReaderTest, DecodeSignedSubexpWithReference) {
+  if (RunOnlyOnce()) return;
+  std::vector<uint8_t> data;
+  int actual;
+
+  data.push_back(0xa0);  // v = 5;
+  CreateReader(data);
+  EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference(
+      10, 20, 15, kGlobalMotionReadControl, &actual));
+  EXPECT_EQ(actual, 12);
+
+  data.clear();
+  data.push_back(0xd0);  // v = 6; extra_bit = 1;
+  CreateReader(data);
+  EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference(
+      10, 20, 15, kGlobalMotionReadControl, &actual));
+  EXPECT_EQ(actual, 11);
+
+  data.clear();
+  data.push_back(0xc8);  // subexp_more_bits = 1; v = 9;
+  CreateReader(data);
+  EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference(
+      10, 40, 15, kGlobalMotionReadControl, &actual));
+  EXPECT_EQ(actual, 27);
+
+  data.clear();
+  data.push_back(0x60);  // subexp_more_bits = 0; subexp_bits = 6.
+  CreateReader(data);
+  EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference(
+      10, 40, 15, kGlobalMotionReadControl, &actual));
+  EXPECT_EQ(actual, 18);
+
+  data.clear();
+  data.push_back(0x60);
+  CreateReader(data);
+  // Control is greater than 32, which makes b >= 32 in DecodeSubexp(), so the
+  // call should fail.
+  EXPECT_FALSE(raw_bit_reader_->DecodeSignedSubexpWithReference(10, 40, 15, 35,
+                                                                &actual));
+}
+
+TEST_P(RawBitReaderTest, DecodeUniform) {
+  if (RunOnlyOnce()) return;
+  // Test the example from the AV1 spec, Section 4.10.7. ns(n).
+  // n = 5
+  // Value            ns(n) encoding
+  // -------------------------------
+  // 0                 00
+  // 1                 01
+  // 2                 10
+  // 3                110
+  // 4                111
+  //
+  // The five encoded values are concatenated into two bytes.
+  std::vector<uint8_t> data = {0x1b, 0x70};
+  CreateReader(data);
+  int actual;
+  for (int i = 0; i < 5; ++i) {
+    EXPECT_TRUE(raw_bit_reader_->DecodeUniform(5, &actual));
+    EXPECT_EQ(actual, i);
+  }
+
+  // If n is a power of 2, ns(n) is simply the log2(n)-bit representation of
+  // the unsigned number.
+  // Test n = 16.
+  // The 16 encoded values are concatenated into 8 bytes.
+  data = {0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef};
+  CreateReader(data);
+  for (int i = 0; i < 16; ++i) {
+    EXPECT_TRUE(raw_bit_reader_->DecodeUniform(16, &actual));
+    EXPECT_EQ(actual, i);
+  }
+}
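+
+// A sketch of the ns(n) decoding from Section 4.10.7 that the tests above
+// exercise (illustrative only): with w = FloorLog2(n) + 1 and
+// m = (1 << w) - n, values below m use w - 1 bits and the rest use w bits.
+static bool DecodeUniformSketch(RawBitReader* reader, int n, int* value) {
+  int w = 0;
+  for (int i = n; i > 0; i >>= 1) ++w;  // w = FloorLog2(n) + 1.
+  const int m = (1 << w) - n;           // For n = 5: w = 3, m = 3.
+  const int64_t v = reader->ReadLiteral(w - 1);
+  if (v == -1) return false;
+  if (v < m) {
+    *value = static_cast<int>(v);
+    return true;
+  }
+  const int extra_bit = reader->ReadBit();
+  if (extra_bit == -1) return false;
+  *value = static_cast<int>((v << 1) - m + extra_bit);
+  return true;
+}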
+
+TEST_P(RawBitReaderTest, SkipBytes) {
+  if (RunOnlyOnce()) return;
+  std::vector<uint8_t> data = {0x00, 0x00, 0x00, 0x00, 0x00};
+  CreateReader(data);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 0);
+  EXPECT_TRUE(raw_bit_reader_->SkipBytes(1));
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+  EXPECT_GE(raw_bit_reader_->ReadBit(), 0);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 9);
+  EXPECT_FALSE(raw_bit_reader_->SkipBytes(1));  // Not at a byte boundary.
+  EXPECT_TRUE(raw_bit_reader_->AlignToNextByte());
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+  EXPECT_FALSE(raw_bit_reader_->SkipBytes(10));  // Not enough bytes.
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+  EXPECT_TRUE(raw_bit_reader_->SkipBytes(3));
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+  EXPECT_EQ(raw_bit_reader_->ReadBit(), -1);
+}
+
+TEST_P(RawBitReaderTest, SkipBits) {
+  if (RunOnlyOnce()) return;
+  std::vector<uint8_t> data = {0x00, 0x00, 0x00, 0x00, 0x00};
+  CreateReader(data);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 0);
+  EXPECT_TRUE(raw_bit_reader_->SkipBits(8));
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+  EXPECT_GE(raw_bit_reader_->ReadBit(), 0);
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 9);
+  EXPECT_TRUE(raw_bit_reader_->SkipBits(10));  // Not at a byte boundary.
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 19);
+  EXPECT_FALSE(raw_bit_reader_->SkipBits(80));  // Not enough bytes.
+  EXPECT_EQ(raw_bit_reader_->bit_offset(), 19);
+  EXPECT_TRUE(raw_bit_reader_->SkipBits(21));
+  EXPECT_TRUE(raw_bit_reader_->Finished());
+  EXPECT_EQ(raw_bit_reader_->ReadBit(), -1);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    RawBitReaderTestInstance, RawBitReaderTest,
+    testing::Combine(testing::Range(1, 5),    // literal size.
+                     testing::Values(100)));  // number of bits/literals.
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/reference_info.h b/src/utils/reference_info.h
new file mode 100644 (file)
index 0000000..73c32d9
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+#define LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// This struct collects the members related to reference frames in one place
+// so that they are easier to pass as parameters to the dsp functions that
+// need them.
+struct ReferenceInfo {
+  // Initialize |motion_field_reference_frame| so that
+  // Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip some updates when
+  // the updates are the same as the initialized value.
+  // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify
+  // branch conditions in motion field projection.
+  // Initializing contiguous memory like this is very fast. Making the
+  // initialization multi-threaded is not recommended unless the memory each
+  // thread initializes is itself contiguous.
+  LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns) {
+    return motion_field_reference_frame.Reset(rows, columns,
+                                              /*zero_initialize=*/true) &&
+           motion_field_mv.Reset(
+               rows, columns,
+#if LIBGAV1_MSAN
+               // It is set in Tile::StoreMotionFieldMvsIntoCurrentFrame() only
+               // for qualified blocks, but the MotionFieldProjectionKernel()
+               // dsp optimizations read it whether or not it was set.
+               /*zero_initialize=*/true
+#else
+               /*zero_initialize=*/false
+#endif
+           );
+  }
+
+  // All members are used by inter frames only.
+  // For intra frames, they are not initialized.
+
+  std::array<uint8_t, kNumReferenceFrameTypes> order_hint;
+
+  // An example when |relative_distance_from| does not equal
+  // -|relative_distance_to|:
+  // |relative_distance_from| = GetRelativeDistance(7, 71, 25) = -64
+  // -|relative_distance_to| = -GetRelativeDistance(71, 7, 25) = 64
+  // This is why we need both |relative_distance_from| and
+  // |relative_distance_to|.
+  // |relative_distance_from|: Relative distances from reference frames to this
+  // frame.
+  std::array<int8_t, kNumReferenceFrameTypes> relative_distance_from;
+  // |relative_distance_to|: Relative distances to reference frames.
+  std::array<int8_t, kNumReferenceFrameTypes> relative_distance_to;
+
+  // Skip motion field projection of specific types of frames if their
+  // |relative_distance_to| is negative or too large.
+  std::array<bool, kNumReferenceFrameTypes> skip_references;
+  // Lookup table of the motion field projection division multipliers for
+  // specific frame types. Derived from kProjectionMvDivisionLookup.
+  std::array<int16_t, kNumReferenceFrameTypes> projection_divisions;
+
+  // The current frame's |motion_field_reference_frame| and |motion_field_mv|
+  // are guaranteed to be allocated only when refresh_frame_flags is not 0.
+  // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+  // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec.
+  Array2D<ReferenceFrameType> motion_field_reference_frame;
+  // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+  // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec.
+  Array2D<MotionVector> motion_field_mv;
+};
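+
+// A minimal sketch of the order-hint arithmetic behind the
+// |relative_distance_from| example above: distances wrap modulo
+// 1 << order_hint_bits, so the difference is sign extended to that width
+// (order_hint_shift_bits = 32 - order_hint_bits). This mirrors
+// GetRelativeDistance() in src/utils/common.h; illustration only, not used
+// by the library.
+inline int RelativeDistanceSketch(const unsigned int a, const unsigned int b,
+                                  const unsigned int order_hint_shift_bits) {
+  if (order_hint_shift_bits == 0) return 0;
+  const int diff = static_cast<int>((a - b) << order_hint_shift_bits) >>
+                   order_hint_shift_bits;
+  return diff;  // E.g. (7, 71, 25) -> -64 and (71, 7, 25) -> -64.
+}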
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
diff --git a/src/utils/segmentation.cc b/src/utils/segmentation.cc
new file mode 100644 (file)
index 0000000..75fa776
--- /dev/null
@@ -0,0 +1,31 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation.h"
+
+namespace libgav1 {
+
+const int8_t kSegmentationFeatureBits[kSegmentFeatureMax] = {8, 6, 6, 6,
+                                                             6, 3, 0, 0};
+const int kSegmentationFeatureMaxValues[kSegmentFeatureMax] = {
+    255,
+    kMaxLoopFilterValue,
+    kMaxLoopFilterValue,
+    kMaxLoopFilterValue,
+    kMaxLoopFilterValue,
+    7,
+    0,
+    0};
+
+}  // namespace libgav1
diff --git a/src/utils/segmentation.h b/src/utils/segmentation.h
new file mode 100644 (file)
index 0000000..67ff74c
--- /dev/null
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_H_
+#define LIBGAV1_SRC_UTILS_SEGMENTATION_H_
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+extern const int8_t kSegmentationFeatureBits[kSegmentFeatureMax];
+extern const int kSegmentationFeatureMaxValues[kSegmentFeatureMax];
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_SEGMENTATION_H_
diff --git a/src/utils/segmentation_map.cc b/src/utils/segmentation_map.cc
new file mode 100644 (file)
index 0000000..bbf40c3
--- /dev/null
@@ -0,0 +1,52 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation_map.h"
+
+#include <cassert>
+#include <cstring>
+#include <new>
+
+namespace libgav1 {
+
+bool SegmentationMap::Allocate(int32_t rows4x4, int32_t columns4x4) {
+  if (rows4x4 * columns4x4 > rows4x4_ * columns4x4_) {
+    segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4 * columns4x4]);
+  }
+
+  rows4x4_ = rows4x4;
+  columns4x4_ = columns4x4;
+  if (segment_id_buffer_ == nullptr) return false;
+  segment_id_.Reset(rows4x4_, columns4x4_, segment_id_buffer_.get());
+  return true;
+}
+
+void SegmentationMap::Clear() {
+  memset(segment_id_buffer_.get(), 0, rows4x4_ * columns4x4_);
+}
+
+void SegmentationMap::CopyFrom(const SegmentationMap& from) {
+  assert(rows4x4_ == from.rows4x4_ && columns4x4_ == from.columns4x4_);
+  memcpy(segment_id_buffer_.get(), from.segment_id_buffer_.get(),
+         rows4x4_ * columns4x4_);
+}
+
+void SegmentationMap::FillBlock(int row4x4, int column4x4, int block_width4x4,
+                                int block_height4x4, int8_t segment_id) {
+  for (int y = 0; y < block_height4x4; ++y) {
+    memset(&segment_id_[row4x4 + y][column4x4], segment_id, block_width4x4);
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/utils/segmentation_map.h b/src/utils/segmentation_map.h
new file mode 100644 (file)
index 0000000..499be24
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
+#define LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// SegmentationMap stores the segment id associated with each 4x4 block in the
+// frame.
+class SegmentationMap {
+ public:
+  SegmentationMap() = default;
+
+  // Not copyable or movable.
+  SegmentationMap(const SegmentationMap&) = delete;
+  SegmentationMap& operator=(const SegmentationMap&) = delete;
+
+  // Allocates an internal buffer of the given dimensions to hold the
+  // segmentation map. The memory in the buffer is not initialized. Returns
+  // true on success, false on failure (for example, out of memory).
+  LIBGAV1_MUST_USE_RESULT bool Allocate(int32_t rows4x4, int32_t columns4x4);
+
+  int8_t segment_id(int row4x4, int column4x4) const {
+    return segment_id_[row4x4][column4x4];
+  }
+
+  // Sets every element in the segmentation map to 0.
+  void Clear();
+
+  // Copies the entire segmentation map. |from| must be of the same dimensions.
+  void CopyFrom(const SegmentationMap& from);
+
+  // Sets the region of segmentation map covered by the block to |segment_id|.
+  // The block is located at |row4x4|, |column4x4| and has dimensions
+  // |block_width4x4| and |block_height4x4|.
+  void FillBlock(int row4x4, int column4x4, int block_width4x4,
+                 int block_height4x4, int8_t segment_id);
+
+ private:
+  int32_t rows4x4_ = 0;
+  int32_t columns4x4_ = 0;
+
+  // segment_id_ is a rows4x4_ by columns4x4_ 2D array. The underlying data
+  // buffer is dynamically allocated and owned by segment_id_buffer_.
+  std::unique_ptr<int8_t[]> segment_id_buffer_;
+  Array2DView<int8_t> segment_id_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
diff --git a/src/utils/segmentation_map_test.cc b/src/utils/segmentation_map_test.cc
new file mode 100644 (file)
index 0000000..4d8a7c9
--- /dev/null
@@ -0,0 +1,120 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation_map.h"
+
+#include <cstdint>
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(SegmentationMapTest, Clear) {
+  constexpr int32_t kRows4x4 = 60;
+  constexpr int32_t kColumns4x4 = 80;
+  SegmentationMap segmentation_map;
+  ASSERT_TRUE(segmentation_map.Allocate(kRows4x4, kColumns4x4));
+
+  segmentation_map.Clear();
+  for (int row4x4 = 0; row4x4 < kRows4x4; ++row4x4) {
+    for (int column4x4 = 0; column4x4 < kColumns4x4; ++column4x4) {
+      EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 0);
+    }
+  }
+}
+
+TEST(SegmentationMapTest, FillBlock) {
+  constexpr int32_t kRows4x4 = 60;
+  constexpr int32_t kColumns4x4 = 80;
+  SegmentationMap segmentation_map;
+  ASSERT_TRUE(segmentation_map.Allocate(kRows4x4, kColumns4x4));
+
+  // Fill the whole image with 2.
+  segmentation_map.FillBlock(0, 0, kColumns4x4, kRows4x4, 2);
+  // Fill a block with 1.
+  constexpr int kBlockWidth4x4 = 10;
+  constexpr int kBlockHeight4x4 = 20;
+  segmentation_map.FillBlock(4, 6, kBlockWidth4x4, kBlockHeight4x4, 1);
+  for (int row4x4 = 0; row4x4 < kRows4x4; ++row4x4) {
+    for (int column4x4 = 0; column4x4 < kColumns4x4; ++column4x4) {
+      if (4 <= row4x4 && row4x4 < 4 + kBlockHeight4x4 && 6 <= column4x4 &&
+          column4x4 < 6 + kBlockWidth4x4) {
+        // Inside the block.
+        EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 1);
+      } else {
+        // Outside the block.
+        EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 2);
+      }
+    }
+  }
+}
+
+TEST(SegmentationMapTest, CopyFrom) {
+  constexpr int32_t kRows4x4 = 60;
+  constexpr int32_t kColumns4x4 = 80;
+  SegmentationMap segmentation_map;
+  ASSERT_TRUE(segmentation_map.Allocate(kRows4x4, kColumns4x4));
+
+  // Split the segmentation map into four blocks of equal size.
+  constexpr int kBlockWidth4x4 = 40;
+  constexpr int kBlockHeight4x4 = 30;
+  segmentation_map.FillBlock(0, 0, kBlockWidth4x4, kBlockHeight4x4, 1);
+  segmentation_map.FillBlock(0, kBlockWidth4x4, kBlockWidth4x4, kBlockHeight4x4,
+                             2);
+  segmentation_map.FillBlock(kBlockHeight4x4, 0, kBlockWidth4x4,
+                             kBlockHeight4x4, 3);
+  segmentation_map.FillBlock(kBlockHeight4x4, kBlockWidth4x4, kBlockWidth4x4,
+                             kBlockHeight4x4, 4);
+
+  SegmentationMap segmentation_map2;
+  ASSERT_TRUE(segmentation_map2.Allocate(kRows4x4, kColumns4x4));
+  segmentation_map2.CopyFrom(segmentation_map);
+
+  for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) {
+    for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) {
+      EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 1);
+      EXPECT_EQ(segmentation_map2.segment_id(row4x4, column4x4), 1);
+    }
+  }
+  for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) {
+    for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) {
+      EXPECT_EQ(segmentation_map.segment_id(row4x4, kBlockWidth4x4 + column4x4),
+                2);
+      EXPECT_EQ(
+          segmentation_map2.segment_id(row4x4, kBlockWidth4x4 + column4x4), 2);
+    }
+  }
+  for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) {
+    for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) {
+      EXPECT_EQ(
+          segmentation_map.segment_id(kBlockHeight4x4 + row4x4, column4x4), 3);
+      EXPECT_EQ(
+          segmentation_map2.segment_id(kBlockHeight4x4 + row4x4, column4x4), 3);
+    }
+  }
+  for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) {
+    for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) {
+      EXPECT_EQ(segmentation_map.segment_id(kBlockHeight4x4 + row4x4,
+                                            kBlockWidth4x4 + column4x4),
+                4);
+      EXPECT_EQ(segmentation_map2.segment_id(kBlockHeight4x4 + row4x4,
+                                             kBlockWidth4x4 + column4x4),
+                4);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/segmentation_test.cc b/src/utils/segmentation_test.cc
new file mode 100644 (file)
index 0000000..e985b2d
--- /dev/null
@@ -0,0 +1,40 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation.h"
+
+#include <cstdint>
+
+#include "gtest/gtest.h"
+#include "src/utils/common.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+int GetUnsignedBits(const unsigned int num_values) {
+  return (num_values > 0) ? FloorLog2(num_values) + 1 : 0;
+}
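+
+// For example, GetUnsignedBits(255) == 8, which matches
+// kSegmentationFeatureBits[0], and, assuming kMaxLoopFilterValue == 63 (per
+// the AV1 spec), GetUnsignedBits(63) == 6 matches the loop filter entries.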
+
+// Check that kSegmentationFeatureBits and kSegmentationFeatureMaxValues are
+// consistent with each other.
+TEST(SegmentationTest, FeatureBitsAndMaxValuesConsistency) {
+  for (int feature = 0; feature < kSegmentFeatureMax; feature++) {
+    EXPECT_EQ(kSegmentationFeatureBits[feature],
+              GetUnsignedBits(kSegmentationFeatureMaxValues[feature]));
+  }
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/stack.h b/src/utils/stack.h
new file mode 100644 (file)
index 0000000..39133b9
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_STACK_H_
+#define LIBGAV1_SRC_UTILS_STACK_H_
+
+#include <cassert>
+#include <utility>
+
+namespace libgav1 {
+
+// A LIFO stack of a fixed capacity. The elements are moved using std::move, so
+// the element type T has to be movable.
+//
+// WARNING: No error checking is performed.
+template <typename T, int capacity>
+class Stack {
+ public:
+  // Pushes the element |value| to the top of the stack. It is an error to call
+  // Push() when the stack is full.
+  void Push(T value) {
+    ++top_;
+    assert(top_ < capacity);
+    elements_[top_] = std::move(value);
+  }
+
+  // Returns the element at the top of the stack and removes it from the stack.
+  // It is an error to call Pop() when the stack is empty.
+  T Pop() {
+    assert(top_ >= 0);
+    return std::move(elements_[top_--]);
+  }
+
+  // Returns true if the stack is empty.
+  bool Empty() const { return top_ < 0; }
+
+ private:
+  static_assert(capacity > 0, "");
+  T elements_[capacity];
+  // The array index of the top of the stack. The stack is empty if top_ is -1.
+  int top_ = -1;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_STACK_H_
diff --git a/src/utils/stack_test.cc b/src/utils/stack_test.cc
new file mode 100644 (file)
index 0000000..4de2ab6
--- /dev/null
@@ -0,0 +1,74 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/stack.h"
+
+#include <cstdint>
+#include <utility>
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kStackSize = 8;
+
+TEST(StackTest, SimpleType) {
+  Stack<int, kStackSize> stack;
+  EXPECT_TRUE(stack.Empty());
+
+  for (int i = 0; i < kStackSize; ++i) {
+    stack.Push(i);
+    EXPECT_FALSE(stack.Empty());
+  }
+
+  for (int i = kStackSize - 1; i >= 0; --i) {
+    EXPECT_EQ(stack.Pop(), i);
+  }
+  EXPECT_TRUE(stack.Empty());
+}
+
+TEST(StackTest, LargeStruct) {
+  struct LargeMoveOnlyStruct {
+    LargeMoveOnlyStruct() = default;
+    // Move only.
+    LargeMoveOnlyStruct(LargeMoveOnlyStruct&& other) = default;
+    LargeMoveOnlyStruct& operator=(LargeMoveOnlyStruct&& other) = default;
+
+    int32_t array1[1000];
+    uint64_t array2[2000];
+  };
+
+  Stack<LargeMoveOnlyStruct, kStackSize> stack;
+  EXPECT_TRUE(stack.Empty());
+
+  LargeMoveOnlyStruct large_move_only_struct[kStackSize];
+  for (int i = 0; i < kStackSize; ++i) {
+    LargeMoveOnlyStruct& l = large_move_only_struct[i];
+    l.array1[0] = i;
+    l.array2[0] = i;
+    stack.Push(std::move(l));
+    EXPECT_FALSE(stack.Empty());
+  }
+
+  for (int i = kStackSize - 1; i >= 0; --i) {
+    LargeMoveOnlyStruct l = stack.Pop();
+    EXPECT_EQ(l.array1[0], i);
+    EXPECT_EQ(l.array2[0], i);
+  }
+  EXPECT_TRUE(stack.Empty());
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/threadpool.cc b/src/utils/threadpool.cc
new file mode 100644 (file)
index 0000000..6fa2e88
--- /dev/null
@@ -0,0 +1,328 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/threadpool.h"
+
+#if defined(_MSC_VER)
+#include <process.h>
+#include <windows.h>
+#else  // defined(_MSC_VER)
+#include <pthread.h>
+#endif  // defined(_MSC_VER)
+#if defined(__ANDROID__) || defined(__GLIBC__)
+#include <sys/types.h>
+#include <unistd.h>
+#endif
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <new>
+#include <utility>
+
+#if defined(__ANDROID__)
+#include <chrono>  // NOLINT (unapproved c++11 header)
+#endif
+
+// Define the GetTid() function, a wrapper for the gettid() system call in
+// Linux.
+#if defined(__ANDROID__)
+static pid_t GetTid() { return gettid(); }
+#elif defined(__GLIBC__)
+// The glibc wrapper for the gettid() system call was added in glibc 2.30.
+// Emulate it for older versions of glibc.
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 30)
+static pid_t GetTid() { return gettid(); }
+#else  // Older than glibc 2.30
+#include <sys/syscall.h>
+
+static pid_t GetTid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
+#endif  // glibc 2.30 or later.
+#endif  // defined(__GLIBC__)
+
+namespace libgav1 {
+
+#if defined(__ANDROID__)
+namespace {
+
+using Clock = std::chrono::steady_clock;
+using Duration = Clock::duration;
+constexpr Duration kBusyWaitDuration =
+    std::chrono::duration_cast<Duration>(std::chrono::duration<double>(2e-3));
+
+}  // namespace
+#endif  // defined(__ANDROID__)
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(int num_threads) {
+  return Create(/*name_prefix=*/"", num_threads);
+}
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(const char name_prefix[],
+                                               int num_threads) {
+  if (name_prefix == nullptr || num_threads <= 0) return nullptr;
+  std::unique_ptr<WorkerThread*[]> threads(new (std::nothrow)
+                                               WorkerThread*[num_threads]);
+  if (threads == nullptr) return nullptr;
+  std::unique_ptr<ThreadPool> pool(new (std::nothrow) ThreadPool(
+      name_prefix, std::move(threads), num_threads));
+  if (pool != nullptr && !pool->StartWorkers()) {
+    pool = nullptr;
+  }
+  return pool;
+}
+
+ThreadPool::ThreadPool(const char name_prefix[],
+                       std::unique_ptr<WorkerThread*[]> threads,
+                       int num_threads)
+    : threads_(std::move(threads)), num_threads_(num_threads) {
+  threads_[0] = nullptr;
+  assert(name_prefix != nullptr);
+  const size_t name_prefix_len =
+      std::min(strlen(name_prefix), sizeof(name_prefix_) - 1);
+  memcpy(name_prefix_, name_prefix, name_prefix_len);
+  name_prefix_[name_prefix_len] = '\0';
+}
+
+ThreadPool::~ThreadPool() { Shutdown(); }
+
+void ThreadPool::Schedule(std::function<void()> closure) {
+  LockMutex();
+  if (!queue_.GrowIfNeeded()) {
+    // queue_ is full and we can't grow it. Run |closure| directly.
+    UnlockMutex();
+    closure();
+    return;
+  }
+  queue_.Push(std::move(closure));
+  UnlockMutex();
+  SignalOne();
+}
+
+int ThreadPool::num_threads() const { return num_threads_; }
+
+// A simple implementation that mirrors the non-portable Thread. We may expand
+// this into a portable implementation of Thread in the future, or replace it
+// once such an implementation exists.
+class ThreadPool::WorkerThread : public Allocable {
+ public:
+  // Creates and starts a thread that runs pool->WorkerFunction().
+  explicit WorkerThread(ThreadPool* pool);
+
+  // Not copyable or movable.
+  WorkerThread(const WorkerThread&) = delete;
+  WorkerThread& operator=(const WorkerThread&) = delete;
+
+  // REQUIRES: Join() must have been called if Start() was called and
+  // succeeded.
+  ~WorkerThread() = default;
+
+  LIBGAV1_MUST_USE_RESULT bool Start();
+
+  // Joins with the running thread.
+  void Join();
+
+ private:
+#if defined(_MSC_VER)
+  static unsigned int __stdcall ThreadBody(void* arg);
+#else
+  static void* ThreadBody(void* arg);
+#endif
+
+  void SetupName();
+  void Run();
+
+  ThreadPool* pool_;
+#if defined(_MSC_VER)
+  HANDLE handle_;
+#else
+  pthread_t thread_;
+#endif
+};
+
+ThreadPool::WorkerThread::WorkerThread(ThreadPool* pool) : pool_(pool) {}
+
+#if defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+  // Since our code calls the C run-time library (CRT), use _beginthreadex
+  // rather than CreateThread. Microsoft documentation says "If a thread
+  // created using CreateThread calls the CRT, the CRT may terminate the
+  // process in low-memory conditions."
+  uintptr_t handle = _beginthreadex(
+      /*security=*/nullptr, /*stack_size=*/0, ThreadBody, this,
+      /*initflag=*/CREATE_SUSPENDED, /*thrdaddr=*/nullptr);
+  if (handle == 0) return false;
+  handle_ = reinterpret_cast<HANDLE>(handle);
+  ResumeThread(handle_);
+  return true;
+}
+
+void ThreadPool::WorkerThread::Join() {
+  WaitForSingleObject(handle_, INFINITE);
+  CloseHandle(handle_);
+}
+
+unsigned int ThreadPool::WorkerThread::ThreadBody(void* arg) {
+  auto* thread = static_cast<WorkerThread*>(arg);
+  thread->Run();
+  return 0;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+  // Not currently supported on Windows.
+}
+
+#else  // defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+  return pthread_create(&thread_, nullptr, ThreadBody, this) == 0;
+}
+
+void ThreadPool::WorkerThread::Join() { pthread_join(thread_, nullptr); }
+
+void* ThreadPool::WorkerThread::ThreadBody(void* arg) {
+  auto* thread = static_cast<WorkerThread*>(arg);
+  thread->Run();
+  return nullptr;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+  if (pool_->name_prefix_[0] != '\0') {
+#if defined(__APPLE__)
+    // Apple's version of pthread_setname_np takes one argument and operates on
+    // the current thread only. Also, pthread_mach_thread_np is Apple-specific.
+    // The maximum size of the |name| buffer was noted in the Chromium source
+    // code and was confirmed by experiments.
+    char name[64];
+    mach_port_t id = pthread_mach_thread_np(pthread_self());
+    int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+                      static_cast<int64_t>(id));
+    assert(rv >= 0);
+    rv = pthread_setname_np(name);
+    assert(rv == 0);
+    static_cast<void>(rv);
+#elif defined(__ANDROID__) || (defined(__GLIBC__) && !defined(__GNU__))
+    // If the |name| buffer is longer than 16 bytes, pthread_setname_np fails
+    // with error 34 (ERANGE) on Android.
+    char name[16];
+    pid_t id = GetTid();
+    int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+                      static_cast<int64_t>(id));
+    assert(rv >= 0);
+    rv = pthread_setname_np(pthread_self(), name);
+    assert(rv == 0);
+    static_cast<void>(rv);
+#endif
+  }
+}
+
+#endif  // defined(_MSC_VER)
+
+void ThreadPool::WorkerThread::Run() {
+  SetupName();
+  pool_->WorkerFunction();
+}
+
+bool ThreadPool::StartWorkers() {
+  if (!queue_.Init()) return false;
+  for (int i = 0; i < num_threads_; ++i) {
+    threads_[i] = new (std::nothrow) WorkerThread(this);
+    if (threads_[i] == nullptr) return false;
+    if (!threads_[i]->Start()) {
+      delete threads_[i];
+      threads_[i] = nullptr;
+      return false;
+    }
+  }
+  return true;
+}
+
+void ThreadPool::WorkerFunction() {
+  LockMutex();
+  while (true) {
+    if (queue_.Empty()) {
+      if (exit_threads_) {
+        break;  // Queue is empty and exit was requested.
+      }
+#if defined(__ANDROID__)
+      // On Android, if we enter a condition-variable wait right away, the CPU
+      // governor kicks in and starts shutting the cores down. So we do a very
+      // small busy wait to see if we get our next job within that period. This
+      // significantly improves the performance of common cases of tile
+      // parallel decoding. If we don't receive a job within the busy wait
+      // time, we then enter an actual condition-variable wait as usual.
+      UnlockMutex();
+      bool found_job = false;
+      const auto wait_start = Clock::now();
+      while (Clock::now() - wait_start < kBusyWaitDuration) {
+        LockMutex();
+        if (!queue_.Empty()) {
+          found_job = true;
+          break;
+        }
+        UnlockMutex();
+      }
+      // If |found_job| is true, we simply continue since we already hold the
+      // mutex and we know for sure that the |queue_| is not empty.
+      if (found_job) continue;
+      // Since |found_job| was false, the mutex is not held at this point.
+      LockMutex();
+      // Ensure that the queue is still empty.
+      if (!queue_.Empty()) continue;
+      if (exit_threads_) {
+        break;  // Queue is empty and exit was requested.
+      }
+#endif  // defined(__ANDROID__)
+      // Queue is still empty, wait for signal or broadcast.
+      Wait();
+    } else {
+      // Take a job from the queue.
+      std::function<void()> job = std::move(queue_.Front());
+      queue_.Pop();
+
+      UnlockMutex();
+      // Note that it is good practice to surround this with a try/catch so
+      // that a thrown exception does not take down the thread pool. This is
+      // omitted here because Google3 disallows exceptions.
+      std::move(job)();
+      job = nullptr;
+
+      LockMutex();
+    }
+  }
+  UnlockMutex();
+}
+
+void ThreadPool::Shutdown() {
+  // Tell the worker threads to exit.
+  LockMutex();
+  exit_threads_ = true;
+  UnlockMutex();
+  SignalAll();
+
+  // Join all workers. This will block.
+  for (int i = 0; i < num_threads_; ++i) {
+    if (threads_[i] == nullptr) break;
+    threads_[i]->Join();
+    delete threads_[i];
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/utils/threadpool.h b/src/utils/threadpool.h
new file mode 100644 (file)
index 0000000..fac875e
--- /dev/null
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_THREADPOOL_H_
+#define LIBGAV1_SRC_UTILS_THREADPOOL_H_
+
+#include <functional>
+#include <memory>
+
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+#if !defined(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+#if defined(__ANDROID__) || (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE)
+#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 1
+#else
+#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 0
+#endif
+#endif
+
+#if LIBGAV1_THREADPOOL_USE_STD_MUTEX
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <mutex>               // NOLINT (unapproved c++11 header)
+#else
+// absl::Mutex & absl::CondVar are significantly faster than the pthread
+// variants on platforms other than Android. iOS may deadlock on Shutdown()
+// using absl, see b/142251739.
+#include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
+#endif
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/executor.h"
+#include "src/utils/memory.h"
+#include "src/utils/unbounded_queue.h"
+
+namespace libgav1 {
+
+// An implementation of ThreadPool using POSIX threads (pthreads) or Windows
+// threads.
+//
+// - The pool allocates a fixed number of worker threads on instantiation.
+// - The worker threads will pick up work jobs as they arrive.
+// - If all workers are busy, work jobs are queued for later execution.
+//
+// The thread pool is shut down when the pool is destroyed.
+//
+// Example usage of the thread pool:
+//   {
+//     std::unique_ptr<ThreadPool> pool = ThreadPool::Create(4);
+//     for (int i = 0; i < 100; ++i) {  // Dispatch 100 jobs.
+//       pool->Schedule([&my_data]() { MyFunction(&my_data); });
+//     }
+//   } // ThreadPool gets destroyed only when all jobs are done.
+class ThreadPool : public Executor, public Allocable {
+ public:
+  // Creates the thread pool with the specified number of worker threads.
+  // If num_threads is 1, the closures are run in FIFO order.
+  static std::unique_ptr<ThreadPool> Create(int num_threads);
+
+  // Like the above factory method, but also sets the name prefix for threads.
+  static std::unique_ptr<ThreadPool> Create(const char name_prefix[],
+                                            int num_threads);
+
+  // The destructor shuts down the thread pool after all queued jobs have
+  // executed. Note that once shutdown begins, the thread pool does not accept
+  // further jobs.
+  ~ThreadPool() override;
+
+  // Adds the specified "closure" to the queue for processing. If worker threads
+  // are available, "closure" will run immediately. Otherwise "closure" is
+  // queued for later execution.
+  //
+  // NOTE: If the internal queue is full and cannot be resized because of an
+  // out-of-memory error, the current thread runs "closure" before returning
+  // from Schedule(). For our use cases, this seems better than the
+  // alternatives:
+  //   1. Return a failure status.
+  //   2. Have the current thread wait until the queue is not full.
+  void Schedule(std::function<void()> closure) override;
+
+  int num_threads() const;
+
+ private:
+  class WorkerThread;
+
+  // Creates the thread pool with the specified number of worker threads.
+  // If num_threads is 1, the closures are run in FIFO order.
+  ThreadPool(const char name_prefix[], std::unique_ptr<WorkerThread*[]> threads,
+             int num_threads);
+
+  // Starts the worker pool.
+  LIBGAV1_MUST_USE_RESULT bool StartWorkers();
+
+  void WorkerFunction();
+
+  // Shuts down the thread pool, i.e. worker threads finish their work and
+  // pick up new jobs until the queue is empty. This call will block until
+  // the shutdown is complete.
+  //
+  // Note: If a worker encounters an empty queue after this call, it will exit.
+  // Other workers might still be running, and if the queue fills up again, the
+  // thread pool will continue to operate with a decreased number of workers.
+  // It is up to the caller to prevent adding new jobs.
+  void Shutdown();
+
+#if LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+  void LockMutex() { queue_mutex_.lock(); }
+  void UnlockMutex() { queue_mutex_.unlock(); }
+
+  void Wait() {
+    std::unique_lock<std::mutex> queue_lock(queue_mutex_, std::adopt_lock);
+    condition_.wait(queue_lock);
+    queue_lock.release();
+  }
+
+  void SignalOne() { condition_.notify_one(); }
+  void SignalAll() { condition_.notify_all(); }
+
+  std::condition_variable condition_;
+  std::mutex queue_mutex_;
+
+#else  // !LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+  void LockMutex() ABSL_EXCLUSIVE_LOCK_FUNCTION() { queue_mutex_.Lock(); }
+  void UnlockMutex() ABSL_UNLOCK_FUNCTION() { queue_mutex_.Unlock(); }
+  void Wait() { condition_.Wait(&queue_mutex_); }
+  void SignalOne() { condition_.Signal(); }
+  void SignalAll() { condition_.SignalAll(); }
+
+  absl::CondVar condition_;
+  absl::Mutex queue_mutex_;
+
+#endif  // LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+  UnboundedQueue<std::function<void()>> queue_ LIBGAV1_GUARDED_BY(queue_mutex_);
+  // If not all the worker threads are created, the first entry after the
+  // created worker threads is a null pointer.
+  const std::unique_ptr<WorkerThread*[]> threads_;
+
+  bool exit_threads_ LIBGAV1_GUARDED_BY(queue_mutex_) = false;
+  const int num_threads_ = 0;
+  // name_prefix_ is a C string whose size, including the terminating null
+  // byte ('\0'), is restricted to 16 bytes. This restriction comes from the
+  // Linux pthread_setname_np() function.
+  char name_prefix_[16];
+};
+
+}  // namespace libgav1
+
+#undef LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+#endif  // LIBGAV1_SRC_UTILS_THREADPOOL_H_
diff --git a/src/utils/threadpool_test.cc b/src/utils/threadpool_test.cc
new file mode 100644 (file)
index 0000000..17854dc
--- /dev/null
@@ -0,0 +1,133 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/threadpool.h"
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+
+#include "absl/synchronization/mutex.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/executor.h"
+
+namespace libgav1 {
+namespace {
+
+class SimpleGuardedInteger {
+ public:
+  explicit SimpleGuardedInteger(int initial_value) : value_(initial_value) {}
+  SimpleGuardedInteger(const SimpleGuardedInteger&) = delete;
+  SimpleGuardedInteger& operator=(const SimpleGuardedInteger&) = delete;
+
+  void Decrement() {
+    absl::MutexLock l(&mutex_);
+    assert(value_ >= 1);
+    --value_;
+    changed_.SignalAll();
+  }
+
+  void Increment() {
+    absl::MutexLock l(&mutex_);
+    ++value_;
+    changed_.SignalAll();
+  }
+
+  int Value() {
+    absl::MutexLock l(&mutex_);
+    return value_;
+  }
+
+  void WaitForZero() {
+    absl::MutexLock l(&mutex_);
+    while (value_ != 0) {
+      changed_.Wait(&mutex_);
+    }
+  }
+
+ private:
+  absl::Mutex mutex_;
+  absl::CondVar changed_;
+  int value_ LIBGAV1_GUARDED_BY(mutex_);
+};
+
+// Loops for |milliseconds| of wall-clock time.
+void LoopForMs(int64_t milliseconds) {
+  const absl::Time deadline = absl::Now() + absl::Milliseconds(milliseconds);
+  while (absl::Now() < deadline) {
+  }
+}
+
+// A function that increments the given integer.
+void IncrementIntegerJob(SimpleGuardedInteger* value) {
+  LoopForMs(100);
+  value->Increment();
+}
+
+TEST(ThreadPoolTest, ThreadedIntegerIncrement) {
+  std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(100);
+  ASSERT_NE(thread_pool, nullptr);
+  EXPECT_EQ(thread_pool->num_threads(), 100);
+  SimpleGuardedInteger count(0);
+  for (int i = 0; i < 1000; ++i) {
+    thread_pool->Schedule([&count]() { IncrementIntegerJob(&count); });
+  }
+  thread_pool.reset(nullptr);
+  EXPECT_EQ(count.Value(), 1000);
+}
+
+// Test a ThreadPool via the Executor interface.
+TEST(ThreadPoolTest, ExecutorInterface) {
+  std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(100);
+  ASSERT_NE(thread_pool, nullptr);
+  std::unique_ptr<Executor> executor(thread_pool.release());
+  SimpleGuardedInteger count(0);
+  for (int i = 0; i < 1000; ++i) {
+    executor->Schedule([&count]() { IncrementIntegerJob(&count); });
+  }
+  executor.reset(nullptr);
+  EXPECT_EQ(count.Value(), 1000);
+}
+
+TEST(ThreadPoolTest, DestroyWithoutUse) {
+  std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(100);
+  EXPECT_NE(thread_pool, nullptr);
+  thread_pool.reset(nullptr);
+}
+
+// If num_threads is 0, ThreadPool::Create() should return a null pointer.
+TEST(ThreadPoolTest, NumThreadsZero) {
+  std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(0);
+  EXPECT_EQ(thread_pool, nullptr);
+}
+
+// If num_threads is 1, the closures are run in FIFO order.
+TEST(ThreadPoolTest, OneThreadRunsClosuresFIFO) {
+  int count = 0;  // Declare first so that it outlives the thread pool.
+  std::unique_ptr<ThreadPool> pool = ThreadPool::Create(1);
+  ASSERT_NE(pool, nullptr);
+  EXPECT_EQ(pool->num_threads(), 1);
+  for (int i = 0; i < 1000; ++i) {
+    pool->Schedule([&count, i]() {
+      EXPECT_EQ(count, i);
+      count++;
+    });
+  }
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/types.h b/src/utils/types.h
new file mode 100644 (file)
index 0000000..c2daf1f
--- /dev/null
@@ -0,0 +1,526 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_TYPES_H_
+#define LIBGAV1_SRC_UTILS_TYPES_H_
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+union MotionVector {
+  // Motion vectors will always fit in int16_t, and using int16_t here instead
+  // of int saves significant memory since some of the frame-sized structures
+  // store motion vectors.
+  // Index 0 is the row (vertical displacement) component of the motion
+  // vector. Index 1 is the column (horizontal displacement) component.
+  int16_t mv[2];
+  // A uint32_t view into the |mv| array. Useful for cases where both
+  // components of the motion vector have to be copied or compared with a
+  // single 32-bit instruction.
+  uint32_t mv32;
+};
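+// For example, the packed views enable whole-vector comparisons (sketch):
+//   a.mv32 == b.mv32  instead of  a.mv[0] == b.mv[0] && a.mv[1] == b.mv[1].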
+
+union CompoundMotionVector {
+  MotionVector mv[2];
+  // A uint64_t view into the |mv| array. Useful for cases where both motion
+  // vectors have to be copied or compared with a single 64-bit instruction.
+  uint64_t mv64;
+};
+
+// Stores the motion information used for motion field estimation.
+struct TemporalMotionField : public Allocable {
+  Array2D<MotionVector> mv;
+  Array2D<int8_t> reference_offset;
+};
+
+// MvContexts contains the contexts used to decode portions of an inter block
+// mode info to set the y_mode field in BlockParameters.
+//
+// The contexts in the struct correspond to the ZeroMvContext, RefMvContext,
+// and NewMvContext variables in the spec.
+struct MvContexts {
+  int zero_mv;
+  int reference_mv;
+  int new_mv;
+};
+
+struct PaletteModeInfo {
+  uint8_t size[kNumPlaneTypes];
+  uint16_t color[kMaxPlanes][kMaxPaletteSize];
+};
+
+// Stores the parameters used by the prediction process. The members of the
+// struct are filled in when parsing the bitstream and used when the prediction
+// is computed. The information in this struct is associated with a single
+// block.
+// Both BlockParameters and PredictionParameters store information pertaining
+// to a Block; the difference is that BlockParameters outlives the block itself
+// (for example, some of the variables in BlockParameters are used to compute
+// the context for reading elements in the subsequent blocks).
+struct PredictionParameters : public Allocable {
+  // Restore the index in the unsorted mv stack from the least significant 3
+  // bits of the sorted |weight_index_stack|.
+  const MotionVector& reference_mv(int stack_index) const {
+    return ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)];
+  }
+  const MotionVector& reference_mv(int stack_index, int mv_index) const {
+    return compound_ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)]
+        .mv[mv_index];
+  }
+
+  void IncreaseWeight(ptrdiff_t index, int weight) {
+    weight_index_stack[index] += weight << 3;
+  }
+
+  void SetWeightIndexStackEntry(int index, int weight) {
+    weight_index_stack[index] = (weight << 3) + 7 - index;
+  }
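+  // For example (illustrative): SetWeightIndexStackEntry(2, 5) stores
+  // (5 << 3) + 7 - 2 = 45; later, 45 & 7 = 5 and 7 - 5 = 2 recover the
+  // original index, so reference_mv() reads ref_mv_stack[2] even after
+  // |weight_index_stack| has been sorted.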
+
+  bool use_filter_intra;
+  FilterIntraPredictor filter_intra_mode;
+  int angle_delta[kNumPlaneTypes];
+  int8_t cfl_alpha_u;
+  int8_t cfl_alpha_v;
+  int max_luma_width;
+  int max_luma_height;
+  Array2D<uint8_t> color_index_map[kNumPlaneTypes];
+  bool use_intra_block_copy;
+  InterIntraMode inter_intra_mode;
+  bool is_wedge_inter_intra;
+  int wedge_index;
+  int wedge_sign;
+  bool mask_is_inverse;
+  MotionMode motion_mode;
+  CompoundPredictionType compound_prediction_type;
+  union {
+    // |ref_mv_stack| and |compound_ref_mv_stack| are not sorted after
+    // construction. reference_mv() must be called to get the correct element.
+    MotionVector ref_mv_stack[kMaxRefMvStackSize];
+    CompoundMotionVector compound_ref_mv_stack[kMaxRefMvStackSize];
+  };
+  // The least significant 3 bits of |weight_index_stack| store the index
+  // information, and the other bits store the weight. The index is stored as
+  // 7 - index so that the descending order sort is stable (it preserves the
+  // original order for elements with the same weight). Sorting an int16_t
+  // array is much faster than sorting a struct array with weight and index
+  // stored separately.
+  int16_t weight_index_stack[kMaxRefMvStackSize];
+  // In the spec, the weights of all the nearest mvs are incremented by a bonus
+  // weight which is larger than any natural weight, and later the weights of
+  // the mvs are compared with this bonus weight to determine their contexts. We
+  // replace this procedure by introducing |nearest_mv_count|, which records the
+  // count of the nearest mvs. Since all the nearest mvs are in the beginning of
+  // the mv stack, the index of a mv in the mv stack can be compared with
+  // |nearest_mv_count| to get that mv's context.
+  int nearest_mv_count;
+  int ref_mv_count;
+  int ref_mv_index;
+  MotionVector global_mv[2];
+  int num_warp_samples;
+  int warp_estimate_candidates[kMaxLeastSquaresSamples][4];
+  PaletteModeInfo palette_mode_info;
+  int8_t segment_id;  // segment_id is in the range [0, 7].
+  PredictionMode uv_mode;
+  bool chroma_top_uses_smooth_prediction;
+  bool chroma_left_uses_smooth_prediction;
+};
+
+// A lot of BlockParameters objects are created, so the smallest type is used
+// for each field. The ranges of some fields are documented to justify why
+// their types are large enough.
+struct BlockParameters : public Allocable {
+  BlockSize size;
+  bool skip;
+  bool is_inter;
+  PredictionMode y_mode;
+  TransformSize uv_transform_size;
+  InterpolationFilter interpolation_filter[2];
+  ReferenceFrameType reference_frame[2];
+  // The index of this array is as follows:
+  //  0 - Y plane vertical filtering.
+  //  1 - Y plane horizontal filtering.
+  //  2 - U plane (both directions).
+  //  3 - V plane (both directions).
+  uint8_t deblock_filter_level[kFrameLfCount];
+  CompoundMotionVector mv;
+  // When |Tile::split_parse_and_decode_| is true, each block gets its own
+  // instance of |prediction_parameters|. When it is false, all the blocks point
+  // to |Tile::prediction_parameters_|. This field is valid only as long as the
+  // block is *being* decoded. The lifetime and usage of this field can be
+  // better understood by following its flow in tile.cc.
+  std::unique_ptr<PredictionParameters> prediction_parameters;
+};
+
+// Used to store the left and top block parameters that are used for computing
+// the cdf context of the subsequent blocks.
+struct BlockCdfContext {
+  bool use_predicted_segment_id[32];
+  bool is_explicit_compound_type[32];  // comp_group_idx in the spec.
+  bool is_compound_type_average[32];   // compound_idx in the spec.
+  bool skip_mode[32];
+  uint8_t palette_size[kNumPlaneTypes][32];
+  uint16_t palette_color[32][kNumPlaneTypes][kMaxPaletteSize];
+  PredictionMode uv_mode[32];
+};
+
+// A five dimensional array used to store the wedge masks. The first three
+// dimensions are:
+//   - block_size_index (returned by GetWedgeBlockSizeIndex() in prediction.cc).
+//   - flip_sign (0 or 1).
+//   - wedge_index (0 to 15).
+// Each entry addressed by those three dimensions is a 2d array of block_width
+// by block_height, providing the remaining two dimensions.
+using WedgeMaskArray =
+    std::array<std::array<std::array<Array2D<uint8_t>, 16>, 2>, 9>;
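+
+// For example (illustrative), the mask value for the pixel at (x, y) within a
+// block is wedge_masks[block_size_index][flip_sign][wedge_index][y][x], where
+// |wedge_masks| is a WedgeMaskArray.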
+
+enum GlobalMotionTransformationType : uint8_t {
+  kGlobalMotionTransformationTypeIdentity,
+  kGlobalMotionTransformationTypeTranslation,
+  kGlobalMotionTransformationTypeRotZoom,
+  kGlobalMotionTransformationTypeAffine,
+  kNumGlobalMotionTransformationTypes
+};
+
+// Global motion and warped motion parameters. See the paper for more info:
+// S. Parker, Y. Chen, D. Barker, P. de Rivaz, D. Mukherjee, "Global and locally
+// adaptive warped motion compensation in video compression", Proc. IEEE
+// International Conference on Image Processing (ICIP), pp. 275-279, Sep. 2017.
+struct GlobalMotion {
+  GlobalMotionTransformationType type;
+  int32_t params[6];
+
+  // Represent two shearing operations. Computed from |params| by SetupShear().
+  //
+  // The least significant six (= kWarpParamRoundingBits) bits are all zeros.
+  // (This means alpha, beta, gamma, and delta could be represented by a 10-bit
+  // signed integer.) The minimum value is INT16_MIN (= -32768) and the maximum
+  // value is 32704 = 0x7fc0, the largest int16_t value whose least significant
+  // six bits are all zeros.
+  //
+  // Valid warp parameters (as validated by SetupShear()) have smaller ranges.
+  // Their absolute values are less than 2^14 (= 16384). (This follows from
+  // the warpValid check at the end of Section 7.11.3.6.)
+  //
+  // NOTE: Section 7.11.3.6 of the spec allows a maximum value of 32768, which
+  // is outside the range of int16_t. When cast to int16_t, 32768 becomes
+  // -32768. This potential int16_t overflow does not matter because either
+  // 32768 or -32768 causes SetupShear() to return false.
+  int16_t alpha;
+  int16_t beta;
+  int16_t gamma;
+  int16_t delta;
+};
+
+// Loop filter parameters:
+//
+// If level[0] and level[1] are both equal to 0, the loop filter process is
+// not invoked.
+//
+// |sharpness| and |delta_enabled| are only used by the loop filter process.
+//
+// The |ref_deltas| and |mode_deltas| arrays are used not only by the loop
+// filter process but also by the reference frame update and loading
+// processes. The loop filter process uses |ref_deltas| and |mode_deltas| only
+// when |delta_enabled| is true.
+struct LoopFilter {
+  // Contains loop filter strength values in the range of [0, 63].
+  std::array<int8_t, kFrameLfCount> level;
+  // Indicates the sharpness level in the range of [0, 7].
+  int8_t sharpness;
+  // Whether the filter level depends on the mode and reference frame used to
+  // predict a block.
+  bool delta_enabled;
+  // Whether additional syntax elements were read that specify which mode and
+  // reference frame deltas are to be updated. loop_filter_delta_update field in
+  // Section 5.9.11 of the spec.
+  bool delta_update;
+  // Contains the adjustment needed for the filter level based on the chosen
+  // reference frame, in the range of [-64, 63].
+  std::array<int8_t, kNumReferenceFrameTypes> ref_deltas;
+  // Contains the adjustment needed for the filter level based on the chosen
+  // mode, in the range of [-64, 63].
+  std::array<int8_t, kLoopFilterMaxModeDeltas> mode_deltas;
+};
+
+struct Delta {
+  bool present;
+  uint8_t scale;
+  bool multi;
+};
+
+struct Cdef {
+  uint8_t damping;  // damping value from the spec + (bitdepth - 8).
+  uint8_t bits;
+  // All the strength values are the values from the spec and left shifted by
+  // (bitdepth - 8).
+  uint8_t y_primary_strength[kMaxCdefStrengths];
+  uint8_t y_secondary_strength[kMaxCdefStrengths];
+  uint8_t uv_primary_strength[kMaxCdefStrengths];
+  uint8_t uv_secondary_strength[kMaxCdefStrengths];
+};
+
+struct TileInfo {
+  bool uniform_spacing;
+  int sb_rows;
+  int sb_columns;
+  int tile_count;
+  int tile_columns_log2;
+  int tile_columns;
+  int tile_column_start[kMaxTileColumns + 1];
+  // This field is not used by libgav1, but is populated for use by some
+  // hardware decoders. So it must not be removed.
+  int tile_column_width_in_superblocks[kMaxTileColumns + 1];
+  int tile_rows_log2;
+  int tile_rows;
+  int tile_row_start[kMaxTileRows + 1];
+  // This field is not used by libgav1, but is populated for use by some
+  // hardware decoders. So it must not be removed.
+  int tile_row_height_in_superblocks[kMaxTileRows + 1];
+  int16_t context_update_id;
+  uint8_t tile_size_bytes;
+};
+
+struct LoopRestoration {
+  LoopRestorationType type[kMaxPlanes];
+  int unit_size_log2[kMaxPlanes];
+};
+
+// Stores the quantization parameters of Section 5.9.12.
+struct QuantizerParameters {
+  // base_index is in the range [0, 255].
+  uint8_t base_index;
+  int8_t delta_dc[kMaxPlanes];
+  // delta_ac[kPlaneY] is always 0.
+  int8_t delta_ac[kMaxPlanes];
+  bool use_matrix;
+  // The |matrix_level| array is used only when |use_matrix| is true.
+  // matrix_level[plane] specifies the level in the quantizer matrix that
+  // should be used for decoding |plane|. The quantizer matrix has 15 levels,
+  // from 0 to 14. The range of matrix_level[plane] is [0, 15]. If
+  // matrix_level[plane] is 15, the quantizer matrix is not used.
+  int8_t matrix_level[kMaxPlanes];
+};
+
+// The corresponding segment feature constants in the AV1 spec are named
+// SEG_LVL_xxx.
+enum SegmentFeature : uint8_t {
+  kSegmentFeatureQuantizer,
+  kSegmentFeatureLoopFilterYVertical,
+  kSegmentFeatureLoopFilterYHorizontal,
+  kSegmentFeatureLoopFilterU,
+  kSegmentFeatureLoopFilterV,
+  kSegmentFeatureReferenceFrame,
+  kSegmentFeatureSkip,
+  kSegmentFeatureGlobalMv,
+  kSegmentFeatureMax
+};
+
+struct Segmentation {
+  // 5.11.14.
+  // Returns true if the feature is enabled in the segment.
+  bool FeatureActive(int segment_id, SegmentFeature feature) const {
+    return enabled && segment_id < kMaxSegments &&
+           feature_enabled[segment_id][feature];
+  }
+
+  // Returns true if the feature is signed.
+  static bool FeatureSigned(SegmentFeature feature) {
+    // Only the first five segment features are signed, so this comparison
+    // suffices.
+    return feature <= kSegmentFeatureLoopFilterV;
+  }
+
+  bool enabled;
+  bool update_map;
+  bool update_data;
+  bool temporal_update;
+  // True if the segment id will be read before the skip syntax element. False
+  // if the skip syntax element will be read first.
+  bool segment_id_pre_skip;
+  // The highest numbered segment id that has some enabled feature. Used as
+  // the upper bound for decoding segment ids.
+  int8_t last_active_segment_id;
+
+  bool feature_enabled[kMaxSegments][kSegmentFeatureMax];
+  int16_t feature_data[kMaxSegments][kSegmentFeatureMax];
+  bool lossless[kMaxSegments];
+  // Cached values of get_qindex(1, segmentId), to be consumed by
+  // Tile::ReadTransformType(). The values are in the range [0, 255].
+  uint8_t qindex[kMaxSegments];
+};
+
+// Section 6.8.20.
+// Note: In spec, film grain section uses YCbCr to denote variable names,
+// such as num_cb_points, num_cr_points. To keep it consistent with other
+// parts of code, we use YUV, i.e., num_u_points, num_v_points, etc.
+struct FilmGrainParams {
+  bool apply_grain;
+  bool update_grain;
+  bool chroma_scaling_from_luma;
+  bool overlap_flag;
+  bool clip_to_restricted_range;
+
+  uint8_t num_y_points;  // [0, 14].
+  uint8_t num_u_points;  // [0, 10].
+  uint8_t num_v_points;  // [0, 10].
+  // Must be in [0, 255] and in increasing order. (For 10/12 bit content,
+  // sample values are divided by 4 or 16, respectively, to map into this
+  // range.)
+  uint8_t point_y_value[14];
+  uint8_t point_y_scaling[14];
+  uint8_t point_u_value[10];
+  uint8_t point_u_scaling[10];
+  uint8_t point_v_value[10];
+  uint8_t point_v_scaling[10];
+
+  uint8_t chroma_scaling;             // grain_scaling_minus_8 + 8: [8, 11].
+  uint8_t auto_regression_coeff_lag;  // ar_coeff_lag: [0, 3].
+  // ar_coeffs_{y,u,v}_plus_128 - 128: [-128, 127].
+  int8_t auto_regression_coeff_y[24];
+  int8_t auto_regression_coeff_u[25];
+  int8_t auto_regression_coeff_v[25];
+  // Shift value: ar_coeff_shift_minus_6 + 6, auto regression coeffs range:
+  // 6: [-2, 2)
+  // 7: [-1, 1)
+  // 8: [-0.5, 0.5)
+  // 9: [-0.25, 0.25)
+  uint8_t auto_regression_shift;
+
+  uint16_t grain_seed;
+  int reference_index;
+  int grain_scale_shift;
+  int8_t u_multiplier;       // cb_mult - 128:      [-128, 127].
+  int8_t u_luma_multiplier;  // cb_luma_mult - 128: [-128, 127].
+  int16_t u_offset;          // cb_offset - 256:    [-256, 255].
+  int8_t v_multiplier;       // cr_mult - 128:      [-128, 127].
+  int8_t v_luma_multiplier;  // cr_luma_mult - 128: [-128, 127].
+  int16_t v_offset;          // cr_offset - 256:    [-256, 255].
+};
+
+struct ObuFrameHeader {
+  uint16_t display_frame_id;
+  uint16_t current_frame_id;
+  int64_t frame_offset;
+  uint16_t expected_frame_id[kNumInterReferenceFrameTypes];
+  int32_t width;
+  int32_t height;
+  int32_t columns4x4;
+  int32_t rows4x4;
+  // The render size (render_width and render_height) is a hint to the
+  // application about the desired display size. It has no effect on the
+  // decoding process.
+  int32_t render_width;
+  int32_t render_height;
+  int32_t upscaled_width;
+  LoopRestoration loop_restoration;
+  uint32_t buffer_removal_time[kMaxOperatingPoints];
+  uint32_t frame_presentation_time;
+  // Note: global_motion[0] (for kReferenceFrameIntra) is not used.
+  std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion;
+  TileInfo tile_info;
+  QuantizerParameters quantizer;
+  Segmentation segmentation;
+  bool show_existing_frame;
+  // frame_to_show is in the range [0, 7]. Only used if show_existing_frame is
+  // true.
+  int8_t frame_to_show;
+  FrameType frame_type;
+  bool show_frame;
+  bool showable_frame;
+  bool error_resilient_mode;
+  bool enable_cdf_update;
+  bool frame_size_override_flag;
+  // The order_hint syntax element in the uncompressed header. If
+  // show_existing_frame is false, the OrderHint variable in the spec is equal
+  // to this field, and so this field can be used in place of OrderHint when
+  // show_existing_frame is known to be false, such as during tile decoding.
+  uint8_t order_hint;
+  int8_t primary_reference_frame;
+  bool render_and_frame_size_different;
+  bool use_superres;
+  uint8_t superres_scale_denominator;
+  bool allow_screen_content_tools;
+  bool allow_intrabc;
+  bool frame_refs_short_signaling;
+  // A bitmask that specifies which reference frame slots will be updated with
+  // the current frame after it is decoded.
+  uint8_t refresh_frame_flags;
+  static_assert(sizeof(ObuFrameHeader::refresh_frame_flags) * 8 ==
+                    kNumReferenceFrameTypes,
+                "");
+  bool found_reference;
+  int8_t force_integer_mv;
+  bool allow_high_precision_mv;
+  InterpolationFilter interpolation_filter;
+  bool is_motion_mode_switchable;
+  bool use_ref_frame_mvs;
+  bool enable_frame_end_update_cdf;
+  // True if all segments are losslessly encoded at the coded resolution.
+  bool coded_lossless;
+  // True if all segments are losslessly encoded at the upscaled resolution.
+  bool upscaled_lossless;
+  TxMode tx_mode;
+  // True means that the mode info for inter blocks contains the syntax
+  // element comp_mode that indicates whether to use single or compound
+  // prediction. False means that all inter blocks will use single prediction.
+  bool reference_mode_select;
+  // The frames to use for compound prediction when skip_mode is true.
+  ReferenceFrameType skip_mode_frame[2];
+  bool skip_mode_present;
+  bool reduced_tx_set;
+  bool allow_warped_motion;
+  Delta delta_q;
+  Delta delta_lf;
+  // A valid value of reference_frame_index[i] is in the range [0, 7]. -1
+  // indicates an invalid value.
+  //
+  // NOTE: When the frame is an intra frame (frame_type is kFrameKey or
+  // kFrameIntraOnly), reference_frame_index is not used and may be
+  // uninitialized.
+  int8_t reference_frame_index[kNumInterReferenceFrameTypes];
+  // The ref_order_hint[ i ] syntax element in the uncompressed header.
+  // Specifies the expected output order hint for each reference frame.
+  uint8_t reference_order_hint[kNumReferenceFrameTypes];
+  LoopFilter loop_filter;
+  Cdef cdef;
+  FilmGrainParams film_grain_params;
+};
+
+// Structure used for traversing the partition tree.
+struct PartitionTreeNode {
+  PartitionTreeNode() = default;
+  PartitionTreeNode(int row4x4, int column4x4, BlockSize block_size)
+      : row4x4(row4x4), column4x4(column4x4), block_size(block_size) {}
+  int row4x4 = -1;
+  int column4x4 = -1;
+  BlockSize block_size = kBlockInvalid;
+};
+
+// Structure used for storing the transform parameters in a superblock.
+struct TransformParameters {
+  TransformParameters() = default;
+  TransformParameters(TransformType type, int non_zero_coeff_count)
+      : type(type), non_zero_coeff_count(non_zero_coeff_count) {}
+  TransformType type;
+  int non_zero_coeff_count;
+};
+
+}  // namespace libgav1
+#endif  // LIBGAV1_SRC_UTILS_TYPES_H_
diff --git a/src/utils/unbounded_queue.h b/src/utils/unbounded_queue.h
new file mode 100644 (file)
index 0000000..fa0d303
--- /dev/null
@@ -0,0 +1,245 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+// A FIFO queue with unbounded capacity.
+//
+// This implementation uses the general approach used in std::deque
+// implementations. See, for example,
+// https://stackoverflow.com/questions/6292332/what-really-is-a-deque-in-stl
+//
+// It is much simpler because it just needs to support the queue interface.
+// The blocks are chained into a circular list, not managed by a "map". It
+// does not shrink the internal buffer.
+//
+// An alternative implementation approach is a resizable circular array. See,
+// for example, ResizingArrayQueue.java in https://algs4.cs.princeton.edu/code/
+// and base::circular_deque in Chromium's base/containers library.
+template <typename T>
+class UnboundedQueue {
+ public:
+  UnboundedQueue() = default;
+
+  // Move only.
+  UnboundedQueue(UnboundedQueue&& other)
+      : first_block_(other.first_block_),
+        front_(other.front_),
+        last_block_(other.last_block_),
+        back_(other.back_) {
+    other.first_block_ = nullptr;
+    other.front_ = 0;
+    other.last_block_ = nullptr;
+    other.back_ = 0;
+  }
+  UnboundedQueue& operator=(UnboundedQueue&& other) {
+    if (this != &other) {
+      Destroy();
+      first_block_ = other.first_block_;
+      front_ = other.front_;
+      last_block_ = other.last_block_;
+      back_ = other.back_;
+      other.first_block_ = nullptr;
+      other.front_ = 0;
+      other.last_block_ = nullptr;
+      other.back_ = 0;
+    }
+    return *this;
+  }
+
+  ~UnboundedQueue() { Destroy(); }
+
+  // Allocates two Blocks upfront because most access patterns require at
+  // least two Blocks. Returns false if the allocation of the Blocks failed.
+  LIBGAV1_MUST_USE_RESULT bool Init() {
+    std::unique_ptr<Block> new_block0(new (std::nothrow) Block);
+    std::unique_ptr<Block> new_block1(new (std::nothrow) Block);
+    if (new_block0 == nullptr || new_block1 == nullptr) return false;
+    first_block_ = last_block_ = new_block0.release();
+    new_block1->next = first_block_;
+    last_block_->next = new_block1.release();
+    return true;
+  }
+
+  // Checks if the queue has room for a new element. If the queue is full,
+  // tries to grow it. Returns false if the queue is full and the attempt to
+  // grow it failed.
+  //
+  // NOTE: GrowIfNeeded() must be called before each call to Push(). This
+  // inconvenient design is necessary to guarantee a successful Push() call.
+  //
+  // Push(T&& value) is often called with the argument std::move(value). The
+  // moved-from object |value| won't be usable afterwards, so it would be
+  // problematic if Push(T&& value) failed and we lost access to the original
+  // |value| object.
+  LIBGAV1_MUST_USE_RESULT bool GrowIfNeeded() {
+    assert(last_block_ != nullptr);
+    if (back_ == kBlockCapacity) {
+      if (last_block_->next == first_block_) {
+        // All Blocks are in use.
+        std::unique_ptr<Block> new_block(new (std::nothrow) Block);
+        if (new_block == nullptr) return false;
+        new_block->next = first_block_;
+        last_block_->next = new_block.release();
+      }
+      last_block_ = last_block_->next;
+      back_ = 0;
+    }
+    return true;
+  }
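+
+  // A minimal usage sketch (illustrative only):
+  //   UnboundedQueue<int> queue;
+  //   if (!queue.Init()) return false;
+  //   if (!queue.GrowIfNeeded()) return false;  // Required before each Push().
+  //   queue.Push(42);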
+
+  // Pushes the element |value| to the end of the queue. It is an error to call
+  // Push() when the queue is full.
+  void Push(const T& value) {
+    assert(last_block_ != nullptr);
+    assert(back_ < kBlockCapacity);
+    T* elements = reinterpret_cast<T*>(last_block_->buffer);
+    new (&elements[back_++]) T(value);
+  }
+
+  void Push(T&& value) {
+    assert(last_block_ != nullptr);
+    assert(back_ < kBlockCapacity);
+    T* elements = reinterpret_cast<T*>(last_block_->buffer);
+    new (&elements[back_++]) T(std::move(value));
+  }
+
+  // Returns the element at the front of the queue. It is an error to call
+  // Front() when the queue is empty.
+  T& Front() {
+    assert(!Empty());
+    T* elements = reinterpret_cast<T*>(first_block_->buffer);
+    return elements[front_];
+  }
+
+  const T& Front() const {
+    assert(!Empty());
+    T* elements = reinterpret_cast<T*>(first_block_->buffer);
+    return elements[front_];
+  }
+
+  // Removes the element at the front of the queue from the queue. It is an
+  // error to call Pop() when the queue is empty.
+  void Pop() {
+    assert(!Empty());
+    T* elements = reinterpret_cast<T*>(first_block_->buffer);
+    elements[front_++].~T();
+    if (front_ == kBlockCapacity) {
+      // The first block has become empty.
+      front_ = 0;
+      if (first_block_ == last_block_) {
+        // Only one Block is in use. Simply reset back_.
+        back_ = 0;
+      } else {
+        first_block_ = first_block_->next;
+      }
+    }
+  }
+
+  // Returns true if the queue is empty.
+  bool Empty() const { return first_block_ == last_block_ && front_ == back_; }
+
+ private:
+  // kBlockCapacity is the maximum number of elements each Block can hold.
+  // sizeof(void*) is subtracted from 2048 to account for the |next| pointer in
+  // the Block struct.
+  //
+  // In Linux x86_64, sizeof(std::function<void()>) is 32, so each Block can
+  // hold 63 std::function<void()> objects.
+  //
+  // NOTE: The corresponding value in <deque> in libc++ revision
+  // 245b5ba3448b9d3f6de5962066557e253a6bc9a4 is:
+  //   template <class _ValueType, class _DiffType>
+  //   struct __deque_block_size {
+  //     static const _DiffType value =
+  //         sizeof(_ValueType) < 256 ? 4096 / sizeof(_ValueType) : 16;
+  //   };
+  //
+  // Note that 4096 / 256 = 16, so apparently this expression is intended to
+  // ensure the block size is at least 4096 bytes and each block can hold at
+  // least 16 elements.
+  static constexpr size_t kBlockCapacity =
+      (sizeof(T) < 128) ? (2048 - sizeof(void*)) / sizeof(T) : 16;
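+  // For example (illustrative), with T = int (4 bytes) on a typical 64-bit
+  // target, kBlockCapacity = (2048 - 8) / 4 = 510.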
+
+  struct Block : public Allocable {
+    alignas(T) char buffer[kBlockCapacity * sizeof(T)];
+    Block* next;
+  };
+
+  void Destroy() {
+    if (first_block_ == nullptr) return;  // An uninitialized queue.
+
+    // First free the unused blocks, which are located after last_block_ and
+    // before first_block_.
+    Block* block = last_block_->next;
+    // Cut the circular list open after last_block_.
+    last_block_->next = nullptr;
+    while (block != first_block_) {
+      Block* next = block->next;
+      delete block;
+      block = next;
+    }
+
+    // Then free the used blocks, destructing the elements they contain.
+    while (block != nullptr) {
+      const size_t begin = (block == first_block_) ? front_ : 0;
+      const size_t end = (block == last_block_) ? back_ : kBlockCapacity;
+      T* elements = reinterpret_cast<T*>(block->buffer);
+      for (size_t i = begin; i < end; ++i) {
+        elements[i].~T();
+      }
+      Block* next = block->next;
+      delete block;
+      block = next;
+    }
+  }
+
+  // Blocks are chained in a circular singly-linked list. If the list of Blocks
+  // is empty, both first_block_ and last_block_ are null pointers. If the list
+  // is nonempty, first_block_ points to the first used Block and last_block_
+  // points to the last used Block.
+  //
+  // Invariant: If Init() is called and succeeds, the queue is always nonempty.
+  // This allows all methods (except the destructor) to avoid null pointer
+  // checks for first_block_ and last_block_.
+  Block* first_block_ = nullptr;
+  // The index of the element in first_block_ to be removed by Pop().
+  size_t front_ = 0;
+  Block* last_block_ = nullptr;
+  // The index in last_block_ where the new element is inserted by Push().
+  size_t back_ = 0;
+};
+
+#if !LIBGAV1_CXX17
+template <typename T>
+constexpr size_t UnboundedQueue<T>::kBlockCapacity;
+#endif
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
diff --git a/src/utils/unbounded_queue_test.cc b/src/utils/unbounded_queue_test.cc
new file mode 100644 (file)
index 0000000..b107ad0
--- /dev/null
@@ -0,0 +1,163 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/unbounded_queue.h"
+
+#include <new>
+#include <utility>
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+class Integer {
+ public:
+  explicit Integer(int value) : value_(new (std::nothrow) int{value}) {}
+
+  // Move only.
+  Integer(Integer&& other) : value_(other.value_) { other.value_ = nullptr; }
+  Integer& operator=(Integer&& other) {
+    if (this != &other) {
+      delete value_;
+      value_ = other.value_;
+      other.value_ = nullptr;
+    }
+    return *this;
+  }
+
+  ~Integer() { delete value_; }
+
+  int value() const { return *value_; }
+
+ private:
+  int* value_;
+};
+
+TEST(UnboundedQueueTest, Basic) {
+  UnboundedQueue<int> queue;
+  ASSERT_TRUE(queue.Init());
+  EXPECT_TRUE(queue.Empty());
+
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_TRUE(queue.GrowIfNeeded());
+    queue.Push(i);
+    EXPECT_FALSE(queue.Empty());
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_FALSE(queue.Empty());
+    EXPECT_EQ(queue.Front(), i);
+    queue.Pop();
+  }
+  EXPECT_TRUE(queue.Empty());
+}
+
+TEST(UnboundedQueueTest, WrapAround) {
+  UnboundedQueue<int> queue;
+  ASSERT_TRUE(queue.Init());
+  EXPECT_TRUE(queue.Empty());
+
+  for (int i = 0; i < 1000; ++i) {
+    EXPECT_TRUE(queue.GrowIfNeeded());
+    queue.Push(i);
+    EXPECT_FALSE(queue.Empty());
+    EXPECT_EQ(queue.Front(), i);
+    queue.Pop();
+    EXPECT_TRUE(queue.Empty());
+  }
+}
+
+TEST(UnboundedQueueTest, EmptyBeforeInit) {
+  UnboundedQueue<int> queue;
+  EXPECT_TRUE(queue.Empty());
+}
+
+TEST(UnboundedQueueTest, LotsOfElements) {
+  UnboundedQueue<Integer> queue;
+  ASSERT_TRUE(queue.Init());
+  EXPECT_TRUE(queue.Empty());
+
+  for (int i = 0; i < 10000; ++i) {
+    Integer integer(i);
+    EXPECT_EQ(integer.value(), i);
+    EXPECT_TRUE(queue.GrowIfNeeded());
+    queue.Push(std::move(integer));
+    EXPECT_FALSE(queue.Empty());
+  }
+
+  for (int i = 0; i < 5000; ++i) {
+    EXPECT_FALSE(queue.Empty());
+    const Integer& integer = queue.Front();
+    EXPECT_EQ(integer.value(), i);
+    queue.Pop();
+  }
+  // Leave some elements in the queue to test destroying a nonempty queue.
+  EXPECT_FALSE(queue.Empty());
+}
+
+// Copy constructor and assignment are deleted, but move constructor and
+// assignment are OK.
+TEST(UnboundedQueueTest, Move) {
+  UnboundedQueue<int> ints1;
+  ASSERT_TRUE(ints1.Init());
+  EXPECT_TRUE(ints1.GrowIfNeeded());
+  ints1.Push(2);
+  EXPECT_TRUE(ints1.GrowIfNeeded());
+  ints1.Push(3);
+  EXPECT_TRUE(ints1.GrowIfNeeded());
+  ints1.Push(5);
+  EXPECT_TRUE(ints1.GrowIfNeeded());
+  ints1.Push(7);
+
+  // Move constructor.
+  UnboundedQueue<int> ints2(std::move(ints1));
+  EXPECT_EQ(ints2.Front(), 2);
+  ints2.Pop();
+  EXPECT_EQ(ints2.Front(), 3);
+  ints2.Pop();
+  EXPECT_EQ(ints2.Front(), 5);
+  ints2.Pop();
+  EXPECT_EQ(ints2.Front(), 7);
+  ints2.Pop();
+  EXPECT_TRUE(ints2.Empty());
+
+  EXPECT_TRUE(ints2.GrowIfNeeded());
+  ints2.Push(11);
+  EXPECT_TRUE(ints2.GrowIfNeeded());
+  ints2.Push(13);
+  EXPECT_TRUE(ints2.GrowIfNeeded());
+  ints2.Push(17);
+  EXPECT_TRUE(ints2.GrowIfNeeded());
+  ints2.Push(19);
+
+  // Move assignment.
+  UnboundedQueue<int> ints3;
+  ASSERT_TRUE(ints3.Init());
+  EXPECT_TRUE(ints3.GrowIfNeeded());
+  ints3.Push(23);
+  ints3 = std::move(ints2);
+  EXPECT_EQ(ints3.Front(), 11);
+  ints3.Pop();
+  EXPECT_EQ(ints3.Front(), 13);
+  ints3.Pop();
+  EXPECT_EQ(ints3.Front(), 17);
+  ints3.Pop();
+  EXPECT_EQ(ints3.Front(), 19);
+  ints3.Pop();
+  EXPECT_TRUE(ints3.Empty());
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/vector.h b/src/utils/vector.h
new file mode 100644 (file)
index 0000000..9a21aeb
--- /dev/null
@@ -0,0 +1,353 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// libgav1::Vector implementation
+
+#ifndef LIBGAV1_SRC_UTILS_VECTOR_H_
+#define LIBGAV1_SRC_UTILS_VECTOR_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace internal {
+
+static constexpr size_t kMinVectorAllocation = 16;
+
+// Returns the smallest power of two greater than or equal to 'value'.
+inline size_t NextPow2(size_t value) {
+  if (value == 0) return 0;
+  --value;
+  for (size_t i = 1; i < sizeof(size_t) * 8; i *= 2) value |= value >> i;
+  return value + 1;
+}
+
+// Returns the smallest capacity greater than or equal to 'value'.
+inline size_t NextCapacity(size_t value) {
+  if (value == 0) return 0;
+  if (value <= kMinVectorAllocation) return kMinVectorAllocation;
+  return NextPow2(value);
+}
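+
+// Worked examples (illustrative): NextPow2(33) == 64, NextPow2(64) == 64,
+// NextCapacity(5) == kMinVectorAllocation == 16, and NextCapacity(100) == 128.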
+
+//------------------------------------------------------------------------------
+// Data structure equivalent to std::vector but returning false and reverting
+// to its last valid state on memory allocation failure.
+// std::vector with a custom allocator does not fill this need without
+// exceptions.
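+//
+// A minimal usage sketch (illustrative only):
+//   Vector<int> v;
+//   if (!v.reserve(2)) return false;    // Allocation failure is reported...
+//   if (!v.push_back(1)) return false;  // ...rather than thrown.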
+
+template <typename T>
+class VectorBase {
+ public:
+  using iterator = T*;
+  using const_iterator = const T*;
+
+  VectorBase() noexcept = default;
+  // Move only.
+  VectorBase(const VectorBase&) = delete;
+  VectorBase& operator=(const VectorBase&) = delete;
+  VectorBase(VectorBase&& other) noexcept
+      : items_(other.items_),
+        capacity_(other.capacity_),
+        num_items_(other.num_items_) {
+    other.items_ = nullptr;
+    other.capacity_ = 0;
+    other.num_items_ = 0;
+  }
+  VectorBase& operator=(VectorBase&& other) noexcept {
+    if (this != &other) {
+      clear();
+      free(items_);
+      items_ = other.items_;
+      capacity_ = other.capacity_;
+      num_items_ = other.num_items_;
+      other.items_ = nullptr;
+      other.capacity_ = 0;
+      other.num_items_ = 0;
+    }
+    return *this;
+  }
+  ~VectorBase() {
+    clear();
+    free(items_);
+  }
+
+  // Reallocates just enough memory if needed so that 'new_cap' items can fit.
+  LIBGAV1_MUST_USE_RESULT bool reserve(size_t new_cap) {
+    if (capacity_ < new_cap) {
+      T* const new_items = static_cast<T*>(malloc(new_cap * sizeof(T)));
+      if (new_items == nullptr) return false;
+      if (num_items_ > 0) {
+        if (std::is_trivial<T>::value) {
+          // Cast |new_items| and |items_| to void* to avoid the GCC
+          // -Wclass-memaccess warning and additionally the
+          // bugprone-undefined-memory-manipulation clang-tidy warning. The
+          // memcpy is safe because T is a trivial type.
+          memcpy(static_cast<void*>(new_items),
+                 static_cast<const void*>(items_), num_items_ * sizeof(T));
+        } else {
+          for (size_t i = 0; i < num_items_; ++i) {
+            new (&new_items[i]) T(std::move(items_[i]));
+            items_[i].~T();
+          }
+        }
+      }
+      free(items_);
+      items_ = new_items;
+      capacity_ = new_cap;
+    }
+    return true;
+  }
+
+  // Reallocates less memory so that only the existing items can fit.
+  bool shrink_to_fit() {
+    if (capacity_ == num_items_) return true;
+    if (num_items_ == 0) {
+      free(items_);
+      items_ = nullptr;
+      capacity_ = 0;
+      return true;
+    }
+    const size_t previous_capacity = capacity_;
+    capacity_ = 0;  // Force reserve() to allocate and copy.
+    if (reserve(num_items_)) return true;
+    capacity_ = previous_capacity;
+    return false;
+  }
+
+  // Constructs a new item by copy constructor. May reallocate if
+  // 'resize_if_needed'.
+  LIBGAV1_MUST_USE_RESULT bool push_back(const T& value,
+                                         bool resize_if_needed = true) {
+    if (num_items_ >= capacity_ &&
+        (!resize_if_needed ||
+         !reserve(internal::NextCapacity(num_items_ + 1)))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(value);
+    ++num_items_;
+    return true;
+  }
+
+  // Constructs a new item by copy constructor. reserve() must have been called
+  // with a sufficient capacity.
+  //
+  // WARNING: No error checking is performed.
+  void push_back_unchecked(const T& value) {
+    assert(num_items_ < capacity_);
+    new (&items_[num_items_]) T(value);
+    ++num_items_;
+  }
+
+  // Constructs a new item by move constructor. May reallocate if
+  // 'resize_if_needed'.
+  LIBGAV1_MUST_USE_RESULT bool push_back(T&& value,
+                                         bool resize_if_needed = true) {
+    if (num_items_ >= capacity_ &&
+        (!resize_if_needed ||
+         !reserve(internal::NextCapacity(num_items_ + 1)))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(std::move(value));
+    ++num_items_;
+    return true;
+  }
+
+  // Constructs a new item by move constructor. reserve() must have been called
+  // with a sufficient capacity.
+  //
+  // WARNING: No error checking is performed.
+  void push_back_unchecked(T&& value) {
+    assert(num_items_ < capacity_);
+    new (&items_[num_items_]) T(std::move(value));
+    ++num_items_;
+  }
+
+  // Constructs a new item in place by forwarding the arguments args... to the
+  // constructor. May reallocate.
+  template <typename... Args>
+  LIBGAV1_MUST_USE_RESULT bool emplace_back(Args&&... args) {
+    if (num_items_ >= capacity_ &&
+        !reserve(internal::NextCapacity(num_items_ + 1))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(std::forward<Args>(args)...);
+    ++num_items_;
+    return true;
+  }
+
+  // Destructs the last item.
+  void pop_back() {
+    --num_items_;
+    items_[num_items_].~T();
+  }
+
+  // Destructs the item at 'pos'.
+  void erase(iterator pos) { erase(pos, pos + 1); }
+
+  // Destructs the items in [first,last).
+  void erase(iterator first, iterator last) {
+    for (iterator it = first; it != last; ++it) it->~T();
+    if (last != end()) {
+      if (std::is_trivial<T>::value) {
+        // Cast |first| and |last| to void* to avoid the GCC
+        // -Wclass-memaccess warning and additionally the
+        // bugprone-undefined-memory-manipulation clang-tidy warning. The
+        // memmove is safe because T is a trivial type.
+        memmove(static_cast<void*>(first), static_cast<const void*>(last),
+                (end() - last) * sizeof(T));
+      } else {
+        for (iterator it_src = last, it_dst = first; it_src != end();
+             ++it_src, ++it_dst) {
+          new (it_dst) T(std::move(*it_src));
+          it_src->~T();
+        }
+      }
+    }
+    num_items_ -= std::distance(first, last);
+  }
+
+  // Destructs all the items.
+  void clear() { erase(begin(), end()); }
+
+  // Destroys (including deallocating) all the items.
+  void reset() {
+    clear();
+    if (!shrink_to_fit()) assert(false);
+  }
+
+  // Accessors
+  bool empty() const { return (num_items_ == 0); }
+  size_t size() const { return num_items_; }
+  size_t capacity() const { return capacity_; }
+
+  T* data() { return items_; }
+  T& front() { return items_[0]; }
+  T& back() { return items_[num_items_ - 1]; }
+  T& operator[](size_t i) { return items_[i]; }
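+  // Note: unlike std::vector::at(), at() performs no bounds checking here.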
+  T& at(size_t i) { return items_[i]; }
+  const T* data() const { return items_; }
+  const T& front() const { return items_[0]; }
+  const T& back() const { return items_[num_items_ - 1]; }
+  const T& operator[](size_t i) const { return items_[i]; }
+  const T& at(size_t i) const { return items_[i]; }
+
+  iterator begin() { return &items_[0]; }
+  const_iterator begin() const { return &items_[0]; }
+  iterator end() { return &items_[num_items_]; }
+  const_iterator end() const { return &items_[num_items_]; }
+
+  void swap(VectorBase& b) {
+    // Although not necessary here, adding "using std::swap;" and then calling
+    // swap() without namespace qualification is recommended. See Effective
+    // C++, Item 25.
+    using std::swap;
+    swap(items_, b.items_);
+    swap(capacity_, b.capacity_);
+    swap(num_items_, b.num_items_);
+  }
+
+ protected:
+  T* items_ = nullptr;
+  size_t capacity_ = 0;
+  size_t num_items_ = 0;
+};
+
+}  // namespace internal
+
+//------------------------------------------------------------------------------
+
+// Vector class that does *NOT* construct the content on resize().
+// Should be reserved for plain old data.
+template <typename T>
+class VectorNoCtor : public internal::VectorBase<T> {
+ public:
+  // Creates or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows in power-of-two steps.
+  LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+    using super = internal::VectorBase<T>;
+    if (super::num_items_ < new_num_items) {
+      if (super::capacity_ < new_num_items) {
+        if (!super::reserve(internal::NextCapacity(new_num_items))) {
+          return false;
+        }
+      }
+      super::num_items_ = new_num_items;
+    } else {
+      while (super::num_items_ > new_num_items) {
+        --super::num_items_;
+        super::items_[super::num_items_].~T();
+      }
+    }
+    return true;
+  }
+};
+
+// This generic vector class will call the constructors.
+template <typename T>
+class Vector : public internal::VectorBase<T> {
+ public:
+  // Constructs or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows in power-of-two steps.
+  LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+    using super = internal::VectorBase<T>;
+    if (super::num_items_ < new_num_items) {
+      if (super::capacity_ < new_num_items) {
+        if (!super::reserve(internal::NextCapacity(new_num_items))) {
+          return false;
+        }
+      }
+      while (super::num_items_ < new_num_items) {
+        new (&super::items_[super::num_items_]) T();
+        ++super::num_items_;
+      }
+    } else {
+      while (super::num_items_ > new_num_items) {
+        --super::num_items_;
+        super::items_[super::num_items_].~T();
+      }
+    }
+    return true;
+  }
+};
+
+//------------------------------------------------------------------------------
+
+// Define non-member swap() functions in the namespace in which VectorNoCtor
+// and Vector are implemented. See Effective C++, Item 25.
+
+template <typename T>
+void swap(VectorNoCtor<T>& a, VectorNoCtor<T>& b) {
+  a.swap(b);
+}
+
+template <typename T>
+void swap(Vector<T>& a, Vector<T>& b) {
+  a.swap(b);
+}
+
+//------------------------------------------------------------------------------
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_VECTOR_H_
diff --git a/src/utils/vector_test.cc b/src/utils/vector_test.cc
new file mode 100644 (file)
index 0000000..5b0127c
--- /dev/null
@@ -0,0 +1,234 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/vector.h"
+
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "src/utils/compiler_attributes.h"
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+class Foo {
+ public:
+  Foo() = default;
+
+  int x() const { return x_; }
+
+ private:
+  int x_ = 38;
+};
+
+class Point {
+ public:
+  Point(int x, int y) : x_(x), y_(y) {}
+
+  int x() const { return x_; }
+  int y() const { return y_; }
+
+ private:
+  int x_;
+  int y_;
+};
+
+TEST(VectorTest, NoCtor) {
+  VectorNoCtor<int> v;
+  EXPECT_TRUE(v.resize(100));
+  Vector<int> w;
+  EXPECT_TRUE(w.resize(100));
+
+#if LIBGAV1_MSAN
+  // Use MemorySanitizer to check VectorNoCtor::resize() does not initialize
+  // the memory while Vector::resize() does.
+  //
+  // __msan_test_shadow(const void *x, uptr size) returns the offset of the
+  // first (at least partially) poisoned byte in the range, or -1 if the whole
+  // range is good.
+  for (size_t i = 0; i < 100; ++i) {
+    EXPECT_EQ(__msan_test_shadow(&v[i], sizeof(int)), 0);
+    EXPECT_EQ(__msan_test_shadow(&w[i], sizeof(int)), -1);
+  }
+#endif
+}
+
+TEST(VectorTest, Constructor) {
+  Vector<Foo> v;
+  EXPECT_TRUE(v.resize(100));
+  for (const Foo& foo : v) {
+    EXPECT_EQ(foo.x(), 38);
+  }
+}
+
+TEST(VectorTest, PushBack) {
+  // Create a vector containing integers
+  Vector<int> v;
+  EXPECT_TRUE(v.reserve(8));
+  EXPECT_EQ(v.size(), 0);
+
+  EXPECT_TRUE(v.push_back(25));
+  EXPECT_EQ(v.size(), 1);
+  EXPECT_EQ(v[0], 25);
+
+  EXPECT_TRUE(v.push_back(13));
+  EXPECT_EQ(v.size(), 2);
+  EXPECT_EQ(v[0], 25);
+  EXPECT_EQ(v[1], 13);
+}
+
+TEST(VectorTest, PushBackUnchecked) {
+  Vector<std::unique_ptr<Point>> v;
+  EXPECT_TRUE(v.reserve(2));
+  EXPECT_EQ(v.size(), 0);
+
+  std::unique_ptr<Point> point(new (std::nothrow) Point(1, 2));
+  EXPECT_NE(point, nullptr);
+  v.push_back_unchecked(std::move(point));
+  EXPECT_EQ(v.size(), 1);
+  EXPECT_EQ(v[0]->x(), 1);
+  EXPECT_EQ(v[0]->y(), 2);
+
+  point.reset(new (std::nothrow) Point(3, 4));
+  EXPECT_NE(point, nullptr);
+  v.push_back_unchecked(std::move(point));
+  EXPECT_EQ(v.size(), 2);
+  EXPECT_EQ(v[0]->x(), 1);
+  EXPECT_EQ(v[0]->y(), 2);
+  EXPECT_EQ(v[1]->x(), 3);
+  EXPECT_EQ(v[1]->y(), 4);
+}
+
+TEST(VectorTest, EmplaceBack) {
+  Vector<Point> v;
+  EXPECT_EQ(v.size(), 0);
+
+  EXPECT_TRUE(v.emplace_back(1, 2));
+  EXPECT_EQ(v.size(), 1);
+  EXPECT_EQ(v[0].x(), 1);
+  EXPECT_EQ(v[0].y(), 2);
+
+  EXPECT_TRUE(v.emplace_back(3, 4));
+  EXPECT_EQ(v.size(), 2);
+  EXPECT_EQ(v[0].x(), 1);
+  EXPECT_EQ(v[0].y(), 2);
+  EXPECT_EQ(v[1].x(), 3);
+  EXPECT_EQ(v[1].y(), 4);
+}
+
+// Copy constructor and assignment are deleted, but move constructor and
+// assignment are OK.
+TEST(VectorTest, Move) {
+  Vector<int> ints1;
+  EXPECT_TRUE(ints1.reserve(4));
+  EXPECT_TRUE(ints1.push_back(2));
+  EXPECT_TRUE(ints1.push_back(3));
+  EXPECT_TRUE(ints1.push_back(5));
+  EXPECT_TRUE(ints1.push_back(7));
+
+  // Move constructor.
+  Vector<int> ints2(std::move(ints1));
+  EXPECT_EQ(ints2.size(), 4);
+  EXPECT_EQ(ints2[0], 2);
+  EXPECT_EQ(ints2[1], 3);
+  EXPECT_EQ(ints2[2], 5);
+  EXPECT_EQ(ints2[3], 7);
+
+  // Move assignment.
+  Vector<int> ints3;
+  EXPECT_TRUE(ints3.reserve(1));
+  EXPECT_TRUE(ints3.push_back(11));
+  ints3 = std::move(ints2);
+  EXPECT_EQ(ints3.size(), 4);
+  EXPECT_EQ(ints3[0], 2);
+  EXPECT_EQ(ints3[1], 3);
+  EXPECT_EQ(ints3[2], 5);
+  EXPECT_EQ(ints3[3], 7);
+}
+
+TEST(VectorTest, Erase) {
+  Vector<int> ints;
+  EXPECT_TRUE(ints.reserve(4));
+  EXPECT_TRUE(ints.push_back(2));
+  EXPECT_TRUE(ints.push_back(3));
+  EXPECT_TRUE(ints.push_back(5));
+  EXPECT_TRUE(ints.push_back(7));
+
+  EXPECT_EQ(ints.size(), 4);
+  EXPECT_EQ(ints[0], 2);
+  EXPECT_EQ(ints[1], 3);
+  EXPECT_EQ(ints[2], 5);
+  EXPECT_EQ(ints[3], 7);
+
+  ints.erase(ints.begin());
+  EXPECT_EQ(ints.size(), 3);
+  EXPECT_EQ(ints[0], 3);
+  EXPECT_EQ(ints[1], 5);
+  EXPECT_EQ(ints[2], 7);
+}
+
+TEST(VectorTest, EraseNonTrivial) {
+  // A simple class that sets an int value to 0 in the destructor.
+  class Cleaner {
+   public:
+    explicit Cleaner(int* value) : value_(value) {}
+    ~Cleaner() { *value_ = 0; }
+
+    int value() const { return *value_; }
+
+   private:
+    int* value_;
+  };
+  int value1 = 100;
+  int value2 = 200;
+  Vector<std::unique_ptr<Cleaner>> v;
+  EXPECT_TRUE(v.reserve(2));
+  EXPECT_EQ(v.capacity(), 2);
+
+  std::unique_ptr<Cleaner> c(new (std::nothrow) Cleaner(&value1));
+  EXPECT_NE(c, nullptr);
+  EXPECT_TRUE(v.push_back(std::move(c)));
+  c.reset(new (std::nothrow) Cleaner(&value2));
+  EXPECT_NE(c, nullptr);
+  EXPECT_TRUE(v.push_back(std::move(c)));
+  EXPECT_EQ(v.size(), 2);
+  EXPECT_EQ(value1, 100);
+  EXPECT_EQ(value2, 200);
+
+  v.erase(v.begin());
+  EXPECT_EQ(v.size(), 1);
+  EXPECT_EQ(v.capacity(), 2);
+  EXPECT_EQ(value1, 0);
+  EXPECT_EQ(value2, 200);
+  EXPECT_EQ(v[0].get()->value(), value2);
+
+  EXPECT_TRUE(v.shrink_to_fit());
+  EXPECT_EQ(v.size(), 1);
+  EXPECT_EQ(v.capacity(), 1);
+  EXPECT_EQ(value2, 200);
+  EXPECT_EQ(v[0].get()->value(), value2);
+
+  v.clear();
+  EXPECT_TRUE(v.empty());
+  EXPECT_EQ(value2, 0);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/version.cc b/src/version.cc
new file mode 100644 (file)
index 0000000..8d1e5a9
--- /dev/null
@@ -0,0 +1,39 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/version.h"
+
+#define LIBGAV1_TOSTRING(x) #x
+#define LIBGAV1_STRINGIFY(x) LIBGAV1_TOSTRING(x)
+#define LIBGAV1_DOT_SEPARATED(M, m, p) M##.##m##.##p
+#define LIBGAV1_DOT_SEPARATED_VERSION(M, m, p) LIBGAV1_DOT_SEPARATED(M, m, p)
+#define LIBGAV1_DOT_VERSION                                                   \
+  LIBGAV1_DOT_SEPARATED_VERSION(LIBGAV1_MAJOR_VERSION, LIBGAV1_MINOR_VERSION, \
+                                LIBGAV1_PATCH_VERSION)
+
+#define LIBGAV1_VERSION_STRING LIBGAV1_STRINGIFY(LIBGAV1_DOT_VERSION)
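+
+// For example, with LIBGAV1_MAJOR_VERSION = 0, LIBGAV1_MINOR_VERSION = 19, and
+// LIBGAV1_PATCH_VERSION = 0, LIBGAV1_DOT_VERSION expands to the token sequence
+// 0.19.0 and LIBGAV1_VERSION_STRING to "0.19.0".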
+
+extern "C" {
+
+int Libgav1GetVersion() { return LIBGAV1_VERSION; }
+const char* Libgav1GetVersionString() { return LIBGAV1_VERSION_STRING; }
+
+const char* Libgav1GetBuildConfiguration() {
+  // TODO(jzern): cmake can generate the detail or in other cases we could
+  // produce one based on the known defines along with the defaults based on
+  // the toolchain, e.g., LIBGAV1_ENABLE_NEON from cpu.h.
+  return "Not available.";
+}
+
+}  // extern "C"
diff --git a/src/version_test.cc b/src/version_test.cc
new file mode 100644 (file)
index 0000000..aaa5e1c
--- /dev/null
@@ -0,0 +1,66 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/version.h"
+
+#include <regex>  // NOLINT (unapproved c++11 header)
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(VersionTest, GetVersion) {
+  const int library_version = GetVersion();
+  EXPECT_EQ((library_version >> 24) & 0xff, 0);
+  // Note: if we link against a shared object, there's potential for a
+  // mismatch if a different library is loaded at runtime.
+  EXPECT_EQ((library_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+  EXPECT_EQ((library_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+  EXPECT_EQ(library_version & 0xff, LIBGAV1_PATCH_VERSION);
+
+  const int header_version = LIBGAV1_VERSION;
+  EXPECT_EQ((header_version >> 24) & 0xff, 0);
+  EXPECT_EQ((header_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+  EXPECT_EQ((header_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+  EXPECT_EQ(header_version & 0xff, LIBGAV1_PATCH_VERSION);
+}
+
+TEST(VersionTest, GetVersionString) {
+  const char* version = GetVersionString();
+  ASSERT_NE(version, nullptr);
+  // https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string
+  const std::regex semver_regex(
+      R"(^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*))"
+      R"((?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))"
+      R"((?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?)"
+      R"((?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$)");
+
+  EXPECT_TRUE(std::regex_match(version, semver_regex)) << version;
+  // Regex validation:
+  // It shouldn't accept a version starting with a non-digit.
+  version = "v1.2.3";
+  EXPECT_FALSE(std::regex_match(version, semver_regex)) << version;
+  // It shouldn't accept a version with spaces.
+  version = "1.2.3 alpha";
+  EXPECT_FALSE(std::regex_match(version, semver_regex)) << version;
+}
+
+TEST(VersionTest, GetBuildConfiguration) {
+  const char* config = GetBuildConfiguration();
+  ASSERT_NE(config, nullptr);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/warp_prediction.cc b/src/warp_prediction.cc
new file mode 100644 (file)
index 0000000..0da8a1f
--- /dev/null
@@ -0,0 +1,237 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/warp_prediction.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/tile.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWarpModelTranslationClamp = 1 << 23;
+constexpr int kWarpModelAffineClamp = 1 << 13;
+constexpr int kLargestMotionVectorDiff = 256;
+
+constexpr uint16_t kDivisorLookup[257] = {
+    16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+    15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+    15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+    14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+    13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+    13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+    13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+    12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+    12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+    11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+    11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+    11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+    10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+    10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+    10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+    9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
+    9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
+    9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
+    9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
+    9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
+    8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
+    8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
+    8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
+    8240,  8224,  8208,  8192};
+
+// Number of fractional bits of lookup in divisor lookup table.
+constexpr int kDivisorLookupBits = 8;
+// Number of fractional bits of entries in divisor lookup table.
+constexpr int kDivisorLookupPrecisionBits = 14;
+
+// 7.11.3.7.
+template <typename T>
+void GenerateApproximateDivisor(T value, int16_t* division_factor,
+                                int16_t* division_shift) {
+  const int n = FloorLog2(std::abs(value));
+  const T e = std::abs(value) - (static_cast<T>(1) << n);
+  const int entry = (n > kDivisorLookupBits)
+                        ? RightShiftWithRounding(e, n - kDivisorLookupBits)
+                        : static_cast<int>(e << (kDivisorLookupBits - n));
+  *division_shift = n + kDivisorLookupPrecisionBits;
+  *division_factor =
+      (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry];
+}
+
+// 7.11.3.8.
+int LeastSquareProduct(int a, int b) { return ((a * b) >> 2) + a + b; }
+
+// 7.11.3.8.
+int DiagonalClamp(int32_t value) {
+  return Clip3(value,
+               (1 << kWarpedModelPrecisionBits) - kWarpModelAffineClamp + 1,
+               (1 << kWarpedModelPrecisionBits) + kWarpModelAffineClamp - 1);
+}
+
+// 7.11.3.8.
+int NonDiagonalClamp(int32_t value) {
+  return Clip3(value, -kWarpModelAffineClamp + 1, kWarpModelAffineClamp - 1);
+}
+
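+// Clamps |value| to the int16_t range and rounds it to a multiple of
+// (1 << kWarpParamRoundingBits).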
+int16_t GetShearParameter(int value) {
+  return static_cast<int16_t>(
+      LeftShift(RightShiftWithRoundingSigned(Clip3(value, INT16_MIN, INT16_MAX),
+                                             kWarpParamRoundingBits),
+                kWarpParamRoundingBits));
+}
+
+}  // namespace
+
+bool SetupShear(GlobalMotion* const warp_params) {
+  int16_t division_shift;
+  int16_t division_factor;
+  const auto* const params = warp_params->params;
+  GenerateApproximateDivisor<int32_t>(params[2], &division_factor,
+                                      &division_shift);
+  const int alpha = params[2] - (1 << kWarpedModelPrecisionBits);
+  const int beta = params[3];
+  const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits);
+  const int gamma =
+      RightShiftWithRoundingSigned(v * division_factor, division_shift);
+  const int64_t w = static_cast<int64_t>(params[3]) * params[4];
+  const int delta =
+      params[5] -
+      RightShiftWithRoundingSigned(w * division_factor, division_shift) -
+      (1 << kWarpedModelPrecisionBits);
+
+  warp_params->alpha = GetShearParameter(alpha);
+  warp_params->beta = GetShearParameter(beta);
+  warp_params->gamma = GetShearParameter(gamma);
+  warp_params->delta = GetShearParameter(delta);
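+  // Per 7.11.3.6, the warp is only valid when the shear parameters are small
+  // enough for the warp filter to handle.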
+  if ((4 * std::abs(warp_params->alpha) + 7 * std::abs(warp_params->beta) >=
+       (1 << kWarpedModelPrecisionBits)) ||
+      (4 * std::abs(warp_params->gamma) + 4 * std::abs(warp_params->delta) >=
+       (1 << kWarpedModelPrecisionBits))) {
+    return false;  // NOLINT (easier condition to understand).
+  }
+
+  return true;
+}
+
+bool WarpEstimation(const int num_samples, const int block_width4x4,
+                    const int block_height4x4, const int row4x4,
+                    const int column4x4, const MotionVector& mv,
+                    const int candidates[kMaxLeastSquaresSamples][4],
+                    GlobalMotion* const warp_params) {
+  // The entries of |a| fit into int32_t, but |a| is declared as int64_t to
+  // avoid casts to int64_t in the computations below.
+  int64_t a[2][2] = {};
+  int bx[2] = {};
+  int by[2] = {};
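+  // In effect this function accumulates the normal equations A * p = b of a
+  // least squares fit: |a| is the symmetric 2x2 matrix of source-offset
+  // products, and |bx|/|by| accumulate the products against the reference
+  // offsets. The two 2x2 systems are solved below via Cramer's rule.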
+
+  // Note: for simplicity, the spec always uses absolute coordinates
+  // in the warp estimation process. subpixel_mid_x, subpixel_mid_y,
+  // and candidates are relative to the top left of the frame.
+  // In contrast, libaom uses a mixture of coordinate systems: in
+  // av1/common/warped_motion.c:find_affine_int(), the coordinates are
+  // relative to the top left of the block.
+  // mid_y/mid_x: the row/column coordinate of the center of the block.
+  const int mid_y = MultiplyBy4(row4x4) + MultiplyBy2(block_height4x4) - 1;
+  const int mid_x = MultiplyBy4(column4x4) + MultiplyBy2(block_width4x4) - 1;
+  const int subpixel_mid_y = MultiplyBy8(mid_y);
+  const int subpixel_mid_x = MultiplyBy8(mid_x);
+  const int reference_subpixel_mid_y = subpixel_mid_y + mv.mv[0];
+  const int reference_subpixel_mid_x = subpixel_mid_x + mv.mv[1];
+
+  for (int i = 0; i < num_samples; ++i) {
+    // candidates[][0] and candidates[][1] are the row/column coordinates of
+    // the sample point in this block, relative to the top left of the frame.
+    // candidates[][2] and candidates[][3] are the row/column coordinates of
+    // the sample point in the reference block, relative to the top left of
+    // the frame.
+    // sy/sx: the row/column coordinates of the sample point, with center of
+    // the block as origin.
+    const int sy = candidates[i][0] - subpixel_mid_y;
+    const int sx = candidates[i][1] - subpixel_mid_x;
+    // dy/dx: the row/column coordinates of the sample point in the reference
+    // block, with center of the reference block as origin.
+    const int dy = candidates[i][2] - reference_subpixel_mid_y;
+    const int dx = candidates[i][3] - reference_subpixel_mid_x;
+    if (std::abs(sx - dx) < kLargestMotionVectorDiff &&
+        std::abs(sy - dy) < kLargestMotionVectorDiff) {
+      a[0][0] += LeastSquareProduct(sx, sx) + 8;
+      a[0][1] += LeastSquareProduct(sx, sy) + 4;
+      a[1][1] += LeastSquareProduct(sy, sy) + 8;
+      bx[0] += LeastSquareProduct(sx, dx) + 8;
+      bx[1] += LeastSquareProduct(sy, dx) + 4;
+      by[0] += LeastSquareProduct(sx, dy) + 4;
+      by[1] += LeastSquareProduct(sy, dy) + 8;
+    }
+  }
+
+  // a[0][1] == a[1][0], because the matrix is symmetric. We don't have to
+  // compute a[1][0].
+  const int64_t determinant = a[0][0] * a[1][1] - a[0][1] * a[0][1];
+  if (determinant == 0) return false;
+
+  int16_t division_shift;
+  int16_t division_factor;
+  GenerateApproximateDivisor<int64_t>(determinant, &division_factor,
+                                      &division_shift);
+
+  division_shift -= kWarpedModelPrecisionBits;
+
+  const int64_t params_2 = a[1][1] * bx[0] - a[0][1] * bx[1];
+  const int64_t params_3 = -a[0][1] * bx[0] + a[0][0] * bx[1];
+  const int64_t params_4 = a[1][1] * by[0] - a[0][1] * by[1];
+  const int64_t params_5 = -a[0][1] * by[0] + a[0][0] * by[1];
+  auto* const params = warp_params->params;
+
+  if (division_shift <= 0) {
+    division_factor <<= -division_shift;
+    params[2] = static_cast<int32_t>(params_2) * division_factor;
+    params[3] = static_cast<int32_t>(params_3) * division_factor;
+    params[4] = static_cast<int32_t>(params_4) * division_factor;
+    params[5] = static_cast<int32_t>(params_5) * division_factor;
+  } else {
+    params[2] = RightShiftWithRoundingSigned(params_2 * division_factor,
+                                             division_shift);
+    params[3] = RightShiftWithRoundingSigned(params_3 * division_factor,
+                                             division_shift);
+    params[4] = RightShiftWithRoundingSigned(params_4 * division_factor,
+                                             division_shift);
+    params[5] = RightShiftWithRoundingSigned(params_5 * division_factor,
+                                             division_shift);
+  }
+
+  params[2] = DiagonalClamp(params[2]);
+  params[3] = NonDiagonalClamp(params[3]);
+  params[4] = NonDiagonalClamp(params[4]);
+  params[5] = DiagonalClamp(params[5]);
+
+  const int vx = mv.mv[1] * (1 << (kWarpedModelPrecisionBits - 3)) -
+                 (mid_x * (params[2] - (1 << kWarpedModelPrecisionBits)) +
+                  mid_y * params[3]);
+  const int vy = mv.mv[0] * (1 << (kWarpedModelPrecisionBits - 3)) -
+                 (mid_x * params[4] +
+                  mid_y * (params[5] - (1 << kWarpedModelPrecisionBits)));
+  params[0] =
+      Clip3(vx, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+  params[1] =
+      Clip3(vy, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/warp_prediction.h b/src/warp_prediction.h
new file mode 100644 (file)
index 0000000..6c86df3
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_WARP_PREDICTION_H_
+#define LIBGAV1_SRC_WARP_PREDICTION_H_
+
+#include "src/obu_parser.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Sets the alpha, beta, gamma, delta fields in warp_params using the
+// warp_params->params array as input (only array entries at indexes 2, 3, 4,
+// 5 are used). Returns whether alpha, beta, gamma, delta are valid.
+bool SetupShear(GlobalMotion* warp_params);  // 7.11.3.6.
+
+// Computes local warp parameters by performing a least square fit.
+// Returns whether the computed parameters are valid.
+bool WarpEstimation(int num_samples, int block_width4x4, int block_height4x4,
+                    int row4x4, int column4x4, const MotionVector& mv,
+                    const int candidates[kMaxLeastSquaresSamples][4],
+                    GlobalMotion* warp_params);  // 7.11.3.8.
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_WARP_PREDICTION_H_
diff --git a/src/warp_prediction_test.cc b/src/warp_prediction_test.cc
new file mode 100644 (file)
index 0000000..46f262f
--- /dev/null
@@ -0,0 +1,246 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/warp_prediction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "src/obu_parser.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int16_t kExpectedWarpParamsOutput[10][4] = {
+    {0, 0, 0, 0},
+    {2880, 2880, 2752, 2752},
+    {-1408, -1408, -1472, -1472},
+    {0, 0, 0, 0},
+    {6784, 6784, 6144, 6144},  // Invalid.
+    {-5312, -5312, -5824, -5824},
+    {-3904, -3904, -4160, -4160},
+    {2496, 2496, 2368, 2368},
+    {1024, 1024, 1024, 1024},
+    {-7808, -7808, -8832, -8832},  // Invalid.
+};
+
+constexpr bool kExpectedWarpValid[10] = {
+    true, true, true, true, false, true, true, true, true, false,
+};
+
+int RandomWarpedParam(int seed_offset, int bits) {
+  libvpx_test::ACMRandom rnd(seed_offset +
+                             libvpx_test::ACMRandom::DeterministicSeed());
+  // 1 in 8 chance of generating zero (arbitrary).
+  const bool zero = (rnd.Rand16() & 7) == 0;
+  if (zero) return 0;
+  // Generate uniform values in the range [-(1 << bits), -1] U [1, 1 << bits].
+  const int mask = (1 << bits) - 1;
+  const int value = 1 + (rnd.RandRange(1U << 31) & mask);
+  const bool sign = (rnd.Rand16() & 1) != 0;
+  return sign ? value : -value;
+}
+
+void GenerateWarpedModel(GlobalMotion* warp_params, int seed) {
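+  // Retry until params[2] is nonzero: SetupShear() feeds params[2] to
+  // GenerateApproximateDivisor(), which takes FloorLog2 of the value and
+  // therefore requires it to be nonzero.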
+  do {
+    warp_params->params[0] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+    warp_params->params[1] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+    warp_params->params[2] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+        (1 << kWarpedModelPrecisionBits);
+    warp_params->params[3] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+    warp_params->params[4] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+    warp_params->params[5] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+        (1 << kWarpedModelPrecisionBits);
+  } while (warp_params->params[2] == 0);
+}
+
+TEST(WarpPredictionTest, SetupShear) {
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(kExpectedWarpParamsOutput); ++i) {
+    GlobalMotion warp_params;
+    GenerateWarpedModel(&warp_params, static_cast<int>(i));
+    const bool warp_valid = SetupShear(&warp_params);
+
+    SCOPED_TRACE(testing::Message() << "Test failure at iteration: " << i);
+    EXPECT_EQ(warp_valid, kExpectedWarpValid[i]);
+    EXPECT_EQ(warp_params.alpha, kExpectedWarpParamsOutput[i][0]);
+    EXPECT_EQ(warp_params.beta, kExpectedWarpParamsOutput[i][1]);
+    EXPECT_EQ(warp_params.gamma, kExpectedWarpParamsOutput[i][2]);
+    EXPECT_EQ(warp_params.delta, kExpectedWarpParamsOutput[i][3]);
+  }
+
+  // Test signed shift behavior in delta and gamma generation.
+  GlobalMotion warp_params;
+  warp_params.params[0] = 24748;
+  warp_params.params[1] = -142530;
+  warp_params.params[2] = 65516;
+  warp_params.params[3] = -640;
+  warp_params.params[4] = 256;
+  warp_params.params[5] = 65310;
+  EXPECT_TRUE(SetupShear(&warp_params));
+  EXPECT_EQ(warp_params.alpha, 0);
+  EXPECT_EQ(warp_params.beta, -640);
+  EXPECT_EQ(warp_params.gamma, 256);
+  EXPECT_EQ(warp_params.delta, -192);
+
+  warp_params.params[0] = 24748;
+  warp_params.params[1] = -142530;
+  warp_params.params[2] = 61760;
+  warp_params.params[3] = -640;
+  warp_params.params[4] = -13312;
+  warp_params.params[5] = 65310;
+  EXPECT_TRUE(SetupShear(&warp_params));
+  EXPECT_EQ(warp_params.alpha, -3776);
+  EXPECT_EQ(warp_params.beta, -640);
+  EXPECT_EQ(warp_params.gamma, -14144);
+  EXPECT_EQ(warp_params.delta, -384);
+}
+
+struct WarpInputParam {
+  WarpInputParam(int num_samples, int block_width4x4, int block_height4x4)
+      : num_samples(num_samples),
+        block_width4x4(block_width4x4),
+        block_height4x4(block_height4x4) {}
+  int num_samples;
+  int block_width4x4;
+  int block_height4x4;
+};
+
+std::ostream& operator<<(std::ostream& os, const WarpInputParam& param) {
+  return os << "num_samples: " << param.num_samples
+            << ", block_(width/height)4x4: " << param.block_width4x4 << "x"
+            << param.block_height4x4;
+}
+
+const WarpInputParam warp_test_param[] = {
+    // sample = 1.
+    WarpInputParam(1, 1, 1),
+    WarpInputParam(1, 1, 2),
+    WarpInputParam(1, 2, 1),
+    WarpInputParam(1, 2, 2),
+    WarpInputParam(1, 2, 4),
+    WarpInputParam(1, 4, 2),
+    WarpInputParam(1, 4, 4),
+    WarpInputParam(1, 4, 8),
+    WarpInputParam(1, 8, 4),
+    WarpInputParam(1, 8, 8),
+    WarpInputParam(1, 8, 16),
+    WarpInputParam(1, 16, 8),
+    WarpInputParam(1, 16, 16),
+    WarpInputParam(1, 16, 32),
+    WarpInputParam(1, 32, 16),
+    WarpInputParam(1, 32, 32),
+    // sample = 8.
+    WarpInputParam(8, 1, 1),
+    WarpInputParam(8, 1, 2),
+    WarpInputParam(8, 2, 1),
+    WarpInputParam(8, 2, 2),
+    WarpInputParam(8, 2, 4),
+    WarpInputParam(8, 4, 2),
+    WarpInputParam(8, 4, 4),
+    WarpInputParam(8, 4, 8),
+    WarpInputParam(8, 8, 4),
+    WarpInputParam(8, 8, 8),
+    WarpInputParam(8, 8, 16),
+    WarpInputParam(8, 16, 8),
+    WarpInputParam(8, 16, 16),
+    WarpInputParam(8, 16, 32),
+    WarpInputParam(8, 32, 16),
+    WarpInputParam(8, 32, 32),
+};
+
+constexpr bool kExpectedWarpEstimationValid[2] = {false, true};
+
+constexpr int kExpectedWarpEstimationOutput[16][6] = {
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {2146296, 1589240, 57345, 8191, -8191, 73727},
+    {1753128, 1196072, 73727, -8191, 8191, 57345},
+    {-8388608, -8388608, 73727, 8191, 8191, 73727},
+    {-4435485, -8388608, 65260, 8191, 8191, 73727},
+    {-8388608, -7552929, 73727, 8191, 8191, 68240},
+    {-8388608, -8388608, 73727, 8191, 8191, 70800},
+};
+
+class WarpEstimationTest : public testing::TestWithParam<WarpInputParam> {
+ public:
+  WarpEstimationTest() = default;
+  ~WarpEstimationTest() override = default;
+
+ protected:
+  WarpInputParam param_ = GetParam();
+};
+
+TEST_P(WarpEstimationTest, WarpEstimation) {
+  // Set input params.
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  const int row4x4 = rnd.Rand8();
+  const int column4x4 = rnd.Rand8();
+  MotionVector mv;
+  mv.mv[0] = rnd.Rand8();
+  mv.mv[1] = rnd.Rand8();
+  int candidates[kMaxLeastSquaresSamples][4];
+  for (int i = 0; i < param_.num_samples; ++i) {
+    // Make candidates relative to the top left of the frame.
+    candidates[i][0] = rnd.Rand8() + MultiplyBy32(row4x4);
+    candidates[i][1] = rnd.Rand8() + MultiplyBy32(column4x4);
+    candidates[i][2] = rnd.Rand8() + MultiplyBy32(row4x4);
+    candidates[i][3] = rnd.Rand8() + MultiplyBy32(column4x4);
+  }
+
+  // Get output.
+  GlobalMotion warp_params;
+  const bool warp_success = WarpEstimation(
+      param_.num_samples, param_.block_width4x4, param_.block_height4x4, row4x4,
+      column4x4, mv, candidates, &warp_params);
+  if (param_.num_samples == 1) {
+    EXPECT_EQ(warp_success, kExpectedWarpEstimationValid[0]);
+  } else {
+    EXPECT_EQ(warp_success, kExpectedWarpEstimationValid[1]);
+    int index = FloorLog2(param_.block_width4x4) * 3 - 1;
+    if (param_.block_width4x4 == param_.block_height4x4) {
+      index += 1;
+    } else if (param_.block_width4x4 < param_.block_height4x4) {
+      index += 2;
+    }
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(warp_params.params); ++i) {
+      EXPECT_EQ(warp_params.params[i], kExpectedWarpEstimationOutput[index][i]);
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(WarpFuncTest, WarpEstimationTest,
+                         testing::ValuesIn(warp_test_param));
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/yuv_buffer.cc b/src/yuv_buffer.cc
new file mode 100644 (file)
index 0000000..85619c3
--- /dev/null
@@ -0,0 +1,267 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/yuv_buffer.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/frame_buffer_utils.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+// Size conventions:
+// * Widths, heights, and border sizes are in pixels.
+// * Strides and plane sizes are in bytes.
+//
+// YuvBuffer objects may be reused through the BufferPool. Realloc() must
+// assume that data members (except buffer_alloc_ and buffer_alloc_size_) may
+// contain stale values from the previous use, and must set all data members
+// from scratch. In particular, Realloc() must not rely on the initial values
+// of data members set by the YuvBuffer constructor.
+bool YuvBuffer::Realloc(int bitdepth, bool is_monochrome, int width, int height,
+                        int8_t subsampling_x, int8_t subsampling_y,
+                        int left_border, int right_border, int top_border,
+                        int bottom_border,
+                        GetFrameBufferCallback get_frame_buffer,
+                        void* callback_private_data,
+                        void** buffer_private_data) {
+  // Only support allocating buffers whose borders are a multiple of 2. The
+  // border restriction is required because we may subsample the borders in
+  // the chroma planes.
+  if (((left_border | right_border | top_border | bottom_border) & 1) != 0) {
+    LIBGAV1_DLOG(ERROR,
+                 "Borders must be a multiple of 2: left_border = %d, "
+                 "right_border = %d, top_border = %d, bottom_border = %d.",
+                 left_border, right_border, top_border, bottom_border);
+    return false;
+  }
+
+  // Every row in the plane buffers needs to be kFrameBufferRowAlignment-byte
+  // aligned. Since the strides are multiples of kFrameBufferRowAlignment bytes,
+  // it suffices to just make the plane buffers kFrameBufferRowAlignment-byte
+  // aligned.
+  const int plane_align = kFrameBufferRowAlignment;
+  const int uv_width =
+      is_monochrome ? 0 : SubsampledValue(width, subsampling_x);
+  const int uv_height =
+      is_monochrome ? 0 : SubsampledValue(height, subsampling_y);
+  const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+  const int uv_right_border = is_monochrome ? 0 : right_border >> subsampling_x;
+  const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+  const int uv_bottom_border =
+      is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+  if (get_frame_buffer != nullptr) {
+    assert(buffer_private_data != nullptr);
+
+    const Libgav1ImageFormat image_format =
+        ComposeImageFormat(is_monochrome, subsampling_x, subsampling_y);
+    FrameBuffer frame_buffer;
+    if (get_frame_buffer(callback_private_data, bitdepth, image_format, width,
+                         height, left_border, right_border, top_border,
+                         bottom_border, kFrameBufferRowAlignment,
+                         &frame_buffer) != kStatusOk) {
+      return false;
+    }
+
+    if (frame_buffer.plane[0] == nullptr ||
+        (!is_monochrome && frame_buffer.plane[1] == nullptr) ||
+        (!is_monochrome && frame_buffer.plane[2] == nullptr)) {
+      assert(false && "The get_frame_buffer callback malfunctioned.");
+      LIBGAV1_DLOG(ERROR, "The get_frame_buffer callback malfunctioned.");
+      return false;
+    }
+
+    stride_[kPlaneY] = frame_buffer.stride[0];
+    stride_[kPlaneU] = frame_buffer.stride[1];
+    stride_[kPlaneV] = frame_buffer.stride[2];
+    buffer_[kPlaneY] = frame_buffer.plane[0];
+    buffer_[kPlaneU] = frame_buffer.plane[1];
+    buffer_[kPlaneV] = frame_buffer.plane[2];
+    *buffer_private_data = frame_buffer.private_data;
+  } else {
+    assert(callback_private_data == nullptr);
+    assert(buffer_private_data == nullptr);
+
+    // Calculate y_stride (in bytes). It is padded to a multiple of
+    // kFrameBufferRowAlignment bytes.
+    int y_stride = width + left_border + right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) y_stride *= sizeof(uint16_t);
+#endif
+    y_stride = Align(y_stride, kFrameBufferRowAlignment);
+    // Size of the Y plane in bytes.
+    const uint64_t y_plane_size = (height + top_border + bottom_border) *
+                                      static_cast<uint64_t>(y_stride) +
+                                  (plane_align - 1);
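+    // The extra (plane_align - 1) bytes guarantee that the plane start can be
+    // aligned with AlignAddr() below without overrunning the allocation.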
+
+    // Calculate uv_stride (in bytes). It is padded to a multiple of
+    // kFrameBufferRowAlignment bytes.
+    int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+    uv_stride = Align(uv_stride, kFrameBufferRowAlignment);
+    // Size of the U or V plane in bytes.
+    const uint64_t uv_plane_size =
+        is_monochrome ? 0
+                      : (uv_height + uv_top_border + uv_bottom_border) *
+                                static_cast<uint64_t>(uv_stride) +
+                            (plane_align - 1);
+
+    // Allocate unaligned y_buffer, u_buffer, and v_buffer.
+    uint8_t* y_buffer = nullptr;
+    uint8_t* u_buffer = nullptr;
+    uint8_t* v_buffer = nullptr;
+
+    const uint64_t frame_size = y_plane_size + 2 * uv_plane_size;
+    if (frame_size > buffer_alloc_size_) {
+      // Allocation to hold larger frame, or first allocation.
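+      // Reject frame sizes that do not round-trip through size_t (possible
+      // on 32-bit targets where size_t is narrower than uint64_t).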
+      if (frame_size != static_cast<size_t>(frame_size)) return false;
+
+      buffer_alloc_.reset(new (std::nothrow)
+                              uint8_t[static_cast<size_t>(frame_size)]);
+      if (buffer_alloc_ == nullptr) {
+        buffer_alloc_size_ = 0;
+        return false;
+      }
+
+      buffer_alloc_size_ = static_cast<size_t>(frame_size);
+    }
+
+    y_buffer = buffer_alloc_.get();
+    if (!is_monochrome) {
+      u_buffer = y_buffer + y_plane_size;
+      v_buffer = u_buffer + uv_plane_size;
+    }
+
+    stride_[kPlaneY] = y_stride;
+    stride_[kPlaneU] = stride_[kPlaneV] = uv_stride;
+
+    int left_border_bytes = left_border;
+    int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) {
+      left_border_bytes *= sizeof(uint16_t);
+      uv_left_border_bytes *= sizeof(uint16_t);
+    }
+#endif
+    buffer_[kPlaneY] = AlignAddr(
+        y_buffer + (top_border * y_stride) + left_border_bytes, plane_align);
+    buffer_[kPlaneU] =
+        AlignAddr(u_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+                  plane_align);
+    buffer_[kPlaneV] =
+        AlignAddr(v_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+                  plane_align);
+  }
+
+  y_width_ = width;
+  y_height_ = height;
+  left_border_[kPlaneY] = left_border;
+  right_border_[kPlaneY] = right_border;
+  top_border_[kPlaneY] = top_border;
+  bottom_border_[kPlaneY] = bottom_border;
+
+  uv_width_ = uv_width;
+  uv_height_ = uv_height;
+  left_border_[kPlaneU] = left_border_[kPlaneV] = uv_left_border;
+  right_border_[kPlaneU] = right_border_[kPlaneV] = uv_right_border;
+  top_border_[kPlaneU] = top_border_[kPlaneV] = uv_top_border;
+  bottom_border_[kPlaneU] = bottom_border_[kPlaneV] = uv_bottom_border;
+
+  subsampling_x_ = subsampling_x;
+  subsampling_y_ = subsampling_y;
+
+  bitdepth_ = bitdepth;
+  is_monochrome_ = is_monochrome;
+  assert(!is_monochrome || stride_[kPlaneU] == 0);
+  assert(!is_monochrome || stride_[kPlaneV] == 0);
+  assert(!is_monochrome || buffer_[kPlaneU] == nullptr);
+  assert(!is_monochrome || buffer_[kPlaneV] == nullptr);
+
+#if LIBGAV1_MSAN
+  InitializeFrameBorders();
+#endif
+
+  return true;
+}
+
+#if LIBGAV1_MSAN
+void YuvBuffer::InitializeFrameBorders() {
+  const int pixel_size = (bitdepth_ == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+  const int y_width_in_bytes = y_width_ * pixel_size;
+  // The optimized loop restoration code will overread the visible frame buffer
+  // into the right border. The optimized cfl subsampler uses the right border
+  // as well. Initialize the right border and padding to prevent msan warnings.
+  const int y_right_border_size_in_bytes = right_border_[kPlaneY] * pixel_size;
+  // Calculate the padding bytes for the buffer. Note: The stride of the buffer
+  // is always a multiple of 16 (see yuv_buffer.h).
+  const int y_right_padding_in_bytes =
+      stride_[kPlaneY] - (pixel_size * (y_width_ + left_border_[kPlaneY] +
+                                        right_border_[kPlaneY]));
+  const int y_padded_right_border_size =
+      y_right_border_size_in_bytes + y_right_padding_in_bytes;
+  constexpr uint8_t kRightValue = 0x55;
+  uint8_t* rb = buffer_[kPlaneY] + y_width_in_bytes;
+  for (int i = 0; i < y_height_ + bottom_border_[kPlaneY]; ++i) {
+    memset(rb, kRightValue, y_padded_right_border_size);
+    rb += stride_[kPlaneY];
+  }
+
+  if (!is_monochrome_) {
+    const int uv_width_in_bytes = uv_width_ * pixel_size;
+    const int uv_right_border_size_in_bytes =
+        right_border_[kPlaneU] * pixel_size;
+    assert(right_border_[kPlaneU] == right_border_[kPlaneV]);
+    const int u_right_padding_in_bytes =
+        stride_[kPlaneU] - (pixel_size * (uv_width_ + left_border_[kPlaneU] +
+                                          right_border_[kPlaneU]));
+    const int u_padded_right_border_size =
+        uv_right_border_size_in_bytes + u_right_padding_in_bytes;
+    rb = buffer_[kPlaneU] + uv_width_in_bytes;
+    for (int i = 0; i < uv_height_; ++i) {
+      memset(rb, kRightValue, u_padded_right_border_size);
+      rb += stride_[kPlaneU];
+    }
+    const int v_right_padding_in_bytes =
+        stride_[kPlaneV] -
+        ((uv_width_ + left_border_[kPlaneV] + right_border_[kPlaneV]) *
+         pixel_size);
+    const int v_padded_right_border_size =
+        uv_right_border_size_in_bytes + v_right_padding_in_bytes;
+    rb = buffer_[kPlaneV] + uv_width_in_bytes;
+    for (int i = 0; i < uv_height_; ++i) {
+      memset(rb, kRightValue, v_padded_right_border_size);
+      rb += stride_[kPlaneV];
+    }
+  }
+
+  // The optimized cfl subsampler will overread (to the right of the current
+  // block) into the uninitialized visible area. The cfl subsampler can
+  // overread into the bottom border as well. Initialize both to quiet msan
+  // warnings.
+  uint8_t* y_visible = buffer_[kPlaneY];
+  for (int i = 0; i < y_height_ + bottom_border_[kPlaneY]; ++i) {
+    memset(y_visible, kRightValue, y_width_in_bytes);
+    y_visible += stride_[kPlaneY];
+  }
+}
+#endif  // LIBGAV1_MSAN
+
+}  // namespace libgav1
diff --git a/src/yuv_buffer.h b/src/yuv_buffer.h
new file mode 100644 (file)
index 0000000..d7818bd
--- /dev/null
@@ -0,0 +1,189 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_YUV_BUFFER_H_
+#define LIBGAV1_SRC_YUV_BUFFER_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/gav1/frame_buffer.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+class YuvBuffer {
+ public:
+  // Allocates the buffer. Returns true on success. Returns false on failure.
+  //
+  // * |width| and |height| are the image dimensions in pixels.
+  // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+  //   subsampling of the width and height of the chroma planes, respectively.
+  // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+  //   the sizes (in pixels) of the borders on the left, right, top, and
+  //   bottom sides, respectively. The four border sizes must all be a
+  //   multiple of 2.
+  // * If |get_frame_buffer| is not null, it is invoked to allocate the memory.
+  //   If |get_frame_buffer| is null, YuvBuffer allocates the memory directly
+  //   and ignores the |callback_private_data| and |buffer_private_data|
+  //   parameters, which should be null.
+  //
+  // NOTE: The strides are a multiple of 16. Since the first row in each plane
+  // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+  //
+  // Example: bitdepth=8 width=20 height=6 left/right/top/bottom_border=2. The
+  // diagram below shows how Realloc() allocates the data buffer for the Y
+  // plane.
+  //
+  //   16-byte aligned
+  //          |
+  //          v
+  //        ++++++++++++++++++++++++pppppppp
+  //        ++++++++++++++++++++++++pppppppp
+  //        ++01234567890123456789++pppppppp
+  //        ++11234567890123456789++pppppppp
+  //        ++21234567890123456789++pppppppp
+  //        ++31234567890123456789++pppppppp
+  //        ++41234567890123456789++pppppppp
+  //        ++51234567890123456789++pppppppp
+  //        ++++++++++++++++++++++++pppppppp
+  //        ++++++++++++++++++++++++pppppppp
+  //        |                              |
+  //        |<-- stride (multiple of 16) ->|
+  //
+  // The video frame has 6 rows of 20 pixels each. Each row is shown as the
+  // pattern r1234567890123456789, where |r| is 0, 1, 2, 3, 4, 5.
+  //
+  // Realloc() first adds a border of 2 pixels around the video frame. The
+  // border pixels are shown as '+'.
+  //
+  // Each row is then padded to a multiple of the default alignment in bytes,
+  // which is 16. The padding bytes are shown as lowercase 'p'. (Since
+  // |bitdepth| is 8 in this example, each pixel is one byte.) The padded size
+  // in bytes is the stride. In this example, the stride is 32 bytes.
+  //
+  // Finally, Realloc() aligns the first byte of frame data, which is the '0'
+  // pixel/byte in the upper left corner of the frame, to the default (16-byte)
+  // alignment boundary.
+  //
+  // TODO(wtc): Add a check for width and height limits to defend against
+  // invalid bitstreams.
+  bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+               int8_t subsampling_x, int8_t subsampling_y, int left_border,
+               int right_border, int top_border, int bottom_border,
+               GetFrameBufferCallback get_frame_buffer,
+               void* callback_private_data, void** buffer_private_data);
+
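+  // Example (illustrative only, not part of the API contract): allocate an
+  // 8-bit 4:2:0 buffer with 64-pixel borders using the internal allocator:
+  //   YuvBuffer buffer;
+  //   const bool ok = buffer.Realloc(
+  //       /*bitdepth=*/8, /*is_monochrome=*/false, /*width=*/1920,
+  //       /*height=*/1080, /*subsampling_x=*/1, /*subsampling_y=*/1,
+  //       /*left_border=*/64, /*right_border=*/64, /*top_border=*/64,
+  //       /*bottom_border=*/64, /*get_frame_buffer=*/nullptr,
+  //       /*callback_private_data=*/nullptr,
+  //       /*buffer_private_data=*/nullptr);
+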
+  int bitdepth() const { return bitdepth_; }
+
+  bool is_monochrome() const { return is_monochrome_; }
+
+  int8_t subsampling_x() const { return subsampling_x_; }
+  int8_t subsampling_y() const { return subsampling_y_; }
+
+  int width(int plane) const {
+    return (plane == kPlaneY) ? y_width_ : uv_width_;
+  }
+  int height(int plane) const {
+    return (plane == kPlaneY) ? y_height_ : uv_height_;
+  }
+
+  // Returns border sizes in pixels.
+  int left_border(int plane) const { return left_border_[plane]; }
+  int right_border(int plane) const { return right_border_[plane]; }
+  int top_border(int plane) const { return top_border_[plane]; }
+  int bottom_border(int plane) const { return bottom_border_[plane]; }
+
+  // Returns the alignment of frame buffer row in bytes.
+  int alignment() const { return kFrameBufferRowAlignment; }
+
+  // Back up the current set of warnings and disable -Warray-bounds for the
+  // following three functions, as the compiler cannot, in all cases, determine
+  // whether |plane| is within [0, kMaxPlanes), e.g., with a variable-based for
+  // loop.
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+  // Returns the data buffer for |plane|.
+  uint8_t* data(int plane) {
+    assert(plane >= 0);
+    assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+    return buffer_[plane];
+  }
+  const uint8_t* data(int plane) const {
+    assert(plane >= 0);
+    assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+    return buffer_[plane];
+  }
+
+  // Returns the stride in bytes for |plane|.
+  int stride(int plane) const {
+    assert(plane >= 0);
+    assert(static_cast<size_t>(plane) < std::extent<decltype(stride_)>::value);
+    return stride_[plane];
+  }
+  // Restore the previous set of compiler warnings.
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+ private:
+  static constexpr int kFrameBufferRowAlignment = 16;
+
+#if LIBGAV1_MSAN
+  void InitializeFrameBorders();
+#endif
+
+  int bitdepth_ = 0;
+  bool is_monochrome_ = false;
+
+  // y_width_ and y_height_ are the |width| and |height| arguments passed to the
+  // Realloc() method.
+  //
+  // uv_width_ and uv_height_ are computed from y_width_ and y_height_ as
+  // follows:
+  //   uv_width_ = (y_width_ + subsampling_x_) >> subsampling_x_
+  //   uv_height_ = (y_height_ + subsampling_y_) >> subsampling_y_
+  int y_width_ = 0;
+  int uv_width_ = 0;
+  int y_height_ = 0;
+  int uv_height_ = 0;
+
+  int left_border_[kMaxPlanes] = {};
+  int right_border_[kMaxPlanes] = {};
+  int top_border_[kMaxPlanes] = {};
+  int bottom_border_[kMaxPlanes] = {};
+
+  int stride_[kMaxPlanes] = {};
+  uint8_t* buffer_[kMaxPlanes] = {};
+
+  // buffer_alloc_ and buffer_alloc_size_ are only used if the
+  // get_frame_buffer callback is null and we allocate the buffer ourselves.
+  std::unique_ptr<uint8_t[]> buffer_alloc_;
+  size_t buffer_alloc_size_ = 0;
+
+  int8_t subsampling_x_ = 0;  // 0 or 1.
+  int8_t subsampling_y_ = 0;  // 0 or 1.
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_YUV_BUFFER_H_
diff --git a/tests/block_utils.cc b/tests/block_utils.cc
new file mode 100644 (file)
index 0000000..a68ae64
--- /dev/null
@@ -0,0 +1,134 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/block_utils.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+
+namespace libgav1 {
+namespace test_utils {
+namespace {
+
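+// Pixel values are printed in hexadecimal via this printf format code.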
+#define LIBGAV1_DEBUG_FORMAT_CODE "x"
+template <typename Pixel>
+void PrintBlockDiff(const Pixel* block1, const Pixel* block2, int width,
+                    int height, int stride1, int stride2,
+                    const bool print_padding) {
+  const int print_width = print_padding ? std::min(stride1, stride2) : width;
+  const int field_width = (sizeof(Pixel) == 1) ? 4 : 5;
+
+  for (int y = 0; y < height; ++y) {
+    printf("[%2d] ", y);
+    for (int x = 0; x < print_width; ++x) {
+      if (x >= width) {
+        if (block1[x] == block2[x]) {
+          printf("[%*" LIBGAV1_DEBUG_FORMAT_CODE "] ", field_width, block1[x]);
+        } else {
+          printf("[*%*" LIBGAV1_DEBUG_FORMAT_CODE "] ", field_width - 1,
+                 block1[x]);
+        }
+      } else {
+        if (block1[x] == block2[x]) {
+          printf("%*" LIBGAV1_DEBUG_FORMAT_CODE " ", field_width, block1[x]);
+        } else {
+          printf("*%*" LIBGAV1_DEBUG_FORMAT_CODE " ", field_width - 1,
+                 block1[x]);
+        }
+      }
+    }
+    printf("\n");
+    block1 += stride1;
+    block2 += stride2;
+  }
+}
+
+}  // namespace
+
+template <typename Pixel>
+void PrintBlock(const Pixel* block, int width, int height, int stride,
+                const bool print_padding /*= false*/) {
+  const int print_width = print_padding ? stride : width;
+  const int field_width = (sizeof(Pixel) == 1) ? 4 : 5;
+  for (int y = 0; y < height; ++y) {
+    printf("[%2d] ", y);
+    for (int x = 0; x < print_width; ++x) {
+      if (x >= width) {
+        printf("[%*" LIBGAV1_DEBUG_FORMAT_CODE "] ", field_width, block[x]);
+      } else {
+        printf("%*" LIBGAV1_DEBUG_FORMAT_CODE " ", field_width, block[x]);
+      }
+    }
+    printf("\n");
+    block += stride;
+  }
+}
+#undef LIBGAV1_DEBUG_FORMAT_CODE
+
+template void PrintBlock(const uint8_t* block, int width, int height,
+                         int stride, bool print_padding /*= false*/);
+template void PrintBlock(const uint16_t* block, int width, int height,
+                         int stride, bool print_padding /*= false*/);
+template void PrintBlock(const int8_t* block, int width, int height, int stride,
+                         bool print_padding /*= false*/);
+template void PrintBlock(const int16_t* block, int width, int height,
+                         int stride, bool print_padding /*= false*/);
+
+template <typename Pixel>
+bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width,
+                   int height, int stride1, int stride2,
+                   const bool check_padding, const bool print_diff /*= true*/) {
+  bool ok = true;
+  const int check_width = check_padding ? std::min(stride1, stride2) : width;
+  for (int y = 0; y < height; ++y) {
+    const uint64_t row1 = static_cast<uint64_t>(y) * stride1;
+    const uint64_t row2 = static_cast<uint64_t>(y) * stride2;
+    ok = memcmp(block1 + row1, block2 + row2,
+                sizeof(block1[0]) * check_width) == 0;
+    if (!ok) break;
+  }
+  if (!ok && print_diff) {
+    printf("block1 (width: %d height: %d stride: %d):\n", width, height,
+           stride1);
+    PrintBlockDiff(block1, block2, width, height, stride1, stride2,
+                   check_padding);
+    printf("\nblock2 (width: %d height: %d stride: %d):\n", width, height,
+           stride2);
+    PrintBlockDiff(block2, block1, width, height, stride2, stride1,
+                   check_padding);
+  }
+  return ok;
+}
+
+template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2,
+                            int width, int height, int stride1, int stride2,
+                            const bool check_padding,
+                            const bool print_diff /*= true*/);
+template bool CompareBlocks(const uint16_t* block1, const uint16_t* block2,
+                            int width, int height, int stride1, int stride2,
+                            const bool check_padding,
+                            const bool print_diff /*= true*/);
+template bool CompareBlocks(const int8_t* block1, const int8_t* block2,
+                            int width, int height, int stride1, int stride2,
+                            const bool check_padding,
+                            const bool print_diff /*= true*/);
+template bool CompareBlocks(const int16_t* block1, const int16_t* block2,
+                            int width, int height, int stride1, int stride2,
+                            const bool check_padding,
+                            const bool print_diff /*= true*/);
+
+}  // namespace test_utils
+}  // namespace libgav1
diff --git a/tests/block_utils.h b/tests/block_utils.h
new file mode 100644 (file)
index 0000000..4542420
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_BLOCK_UTILS_H_
+#define LIBGAV1_TESTS_BLOCK_UTILS_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+namespace test_utils {
+
+//------------------------------------------------------------------------------
+// Prints |block| pixel by pixel with |width| pixels per row if |print_padding|
+// is false, |stride| otherwise. If |print_padding| is true, padding pixels
+// are enclosed in '[]'.
+template <typename Pixel>
+void PrintBlock(const Pixel* block, int width, int height, int stride,
+                bool print_padding = false);
+
+extern template void PrintBlock(const uint8_t* block, int width, int height,
+                                int stride, bool print_padding /*= false*/);
+extern template void PrintBlock(const uint16_t* block, int width, int height,
+                                int stride, bool print_padding /*= false*/);
+
+//------------------------------------------------------------------------------
+// Compares |block1| and |block2| pixel by pixel checking |width| pixels per row
+// if |check_padding| is false, min(|stride1|, |stride2|) pixels otherwise.
+// Prints the blocks with differences marked with a '*' if |print_diff| is
+// true (the default).
+template <typename Pixel>
+bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width,
+                   int height, int stride1, int stride2, bool check_padding,
+                   bool print_diff = true);
+
+extern template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2,
+                                   int width, int height, int stride1,
+                                   int stride2, bool check_padding,
+                                   bool print_diff /*= true*/);
+extern template bool CompareBlocks(const uint16_t* block1,
+                                   const uint16_t* block2, int width,
+                                   int height, int stride1, int stride2,
+                                   bool check_padding,
+                                   bool print_diff /*= true*/);
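+
+// Example (illustrative only): compare two 4x4 uint8_t blocks stored with
+// stride 8, printing a diff on mismatch:
+//   const bool ok = libgav1::test_utils::CompareBlocks(
+//       ref, test, /*width=*/4, /*height=*/4, /*stride1=*/8, /*stride2=*/8,
+//       /*check_padding=*/false);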
+
+}  // namespace test_utils
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TESTS_BLOCK_UTILS_H_
diff --git a/tests/data/five-frames.ivf b/tests/data/five-frames.ivf
new file mode 100644 (file)
index 0000000..08bc6db
Binary files /dev/null and b/tests/data/five-frames.ivf differ
diff --git a/tests/data/ivf-header-and-truncated-frame-header b/tests/data/ivf-header-and-truncated-frame-header
new file mode 100644 (file)
index 0000000..c6d7a6a
Binary files /dev/null and b/tests/data/ivf-header-and-truncated-frame-header differ
diff --git a/tests/data/ivf-header-only b/tests/data/ivf-header-only
new file mode 100644 (file)
index 0000000..e751f36
Binary files /dev/null and b/tests/data/ivf-header-only differ
diff --git a/tests/data/ivf-signature-only b/tests/data/ivf-signature-only
new file mode 100644 (file)
index 0000000..8550ef8
--- /dev/null
@@ -0,0 +1 @@
+DKIF
diff --git a/tests/data/one-frame-large-timestamp.ivf b/tests/data/one-frame-large-timestamp.ivf
new file mode 100644 (file)
index 0000000..44886da
Binary files /dev/null and b/tests/data/one-frame-large-timestamp.ivf differ
diff --git a/tests/data/one-frame-truncated.ivf b/tests/data/one-frame-truncated.ivf
new file mode 100644 (file)
index 0000000..94e5b09
Binary files /dev/null and b/tests/data/one-frame-truncated.ivf differ
diff --git a/tests/data/one-frame.ivf b/tests/data/one-frame.ivf
new file mode 100644 (file)
index 0000000..436e461
Binary files /dev/null and b/tests/data/one-frame.ivf differ
diff --git a/tests/fuzzer/decoder_fuzzer.cc b/tests/fuzzer/decoder_fuzzer.cc
new file mode 100644 (file)
index 0000000..236fd3c
--- /dev/null
@@ -0,0 +1,87 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/gav1/decoder.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr int kMaxFrames = 100;
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+// Restrict the number of frames to improve fuzzer throughput.
+constexpr int kMaxFrames = 5;
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+void Decode(const uint8_t* const data, const size_t size,
+            libgav1::Decoder* const decoder) {
+  decoder->EnqueueFrame(data, size, /*user_private_data=*/0,
+                        /*buffer_private_data=*/nullptr);
+  const libgav1::DecoderBuffer* buffer;
+  decoder->DequeueFrame(&buffer);
+}
+
+}  // namespace
+
+// Always returns 0. Nonzero return values are reserved by libFuzzer for future
+// use.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  // Reject large chunks of data to improve fuzzer throughput.
+  if (size > kMaxDataSize) return 0;
+
+  libgav1::Decoder decoder;
+  libgav1::DecoderSettings settings = {};
+  // Use the low byte of the width to seed the number of threads.
+  // We use both nibbles of the lower byte as this results in values != 1 much
+  // more quickly than using the lower nibble alone.
+  settings.threads = (size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1;
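+  // For example, data[12] == 0x21 yields ((0x2 | 0x21) & 0xF) + 1 = 4
+  // threads; the computed value is always in the range [1, 16].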
+  if (decoder.Init(&settings) != libgav1::kStatusOk) return 0;
+
+  // Treat the input as a raw OBU stream.
+  Decode(data, size, &decoder);
+
+  // Use the first frame from an IVF to bypass any read errors from the parser.
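+  // (An IVF file begins with a 32-byte file header, and each frame is
+  // preceded by a 12-byte frame header; skipping both leaves the first
+  // frame's raw OBU data.)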
+  static constexpr size_t kIvfHeaderSize =
+      libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize;
+  if (size >= kIvfHeaderSize) {
+    Decode(data + kIvfHeaderSize, size - kIvfHeaderSize, &decoder);
+  }
+
+  FuzzerTemporaryFile tempfile(data, size);
+  auto file_reader =
+      libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+  if (file_reader == nullptr) return 0;
+
+  std::vector<uint8_t> buffer;
+  int decoded_frames = 0;
+  do {
+    if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break;
+    Decode(buffer.data(), buffer.size(), &decoder);
+    if (++decoded_frames >= kMaxFrames) break;
+  } while (!file_reader->IsEndOfFile());
+
+  return 0;
+}
diff --git a/tests/fuzzer/decoder_fuzzer_frame_parallel.cc b/tests/fuzzer/decoder_fuzzer_frame_parallel.cc
new file mode 100644 (file)
index 0000000..d1b1c54
--- /dev/null
@@ -0,0 +1,139 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/gav1/decoder.h"
+#include "src/gav1/status_code.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+using InputBuffer = std::vector<uint8_t>;
+
+struct InputBuffers {
+  ~InputBuffers() {
+    for (auto& buffer : free_buffers) {
+      delete buffer;
+    }
+  }
+  std::deque<InputBuffer*> free_buffers;
+};
+
+void ReleaseInputBuffer(void* callback_private_data,
+                        void* buffer_private_data) {
+  auto* const test = static_cast<InputBuffers*>(callback_private_data);
+  test->free_buffers.push_back(static_cast<InputBuffer*>(buffer_private_data));
+}
+
+}  // namespace
+
+// Always returns 0. Nonzero return values are reserved by libFuzzer for future
+// use.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  // Reject large chunks of data to improve fuzzer throughput.
+  if (size > kMaxDataSize) return 0;
+
+  // Note that |input_buffers| has to outlive the |decoder| object since the
+  // |release_input_buffer| callback could be called on the |decoder|'s
+  // destructor.
+  InputBuffers input_buffers;
+
+  libgav1::Decoder decoder;
+  libgav1::DecoderSettings settings = {};
+  // Use 33 + the low byte of the width to seed the number of threads. This
+  // ensures that we will trigger the frame parallel path in most cases.
+  // We use both nibbles of the lower byte as this results in values != 1 much
+  // more quickly than using the lower nibble alone.
+  settings.threads =
+      33 + ((size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1);
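+  // The computed thread count is always in the range [34, 49].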
+
+  settings.frame_parallel = true;
+  settings.blocking_dequeue = true;
+  settings.callback_private_data = &input_buffers;
+  settings.release_input_buffer = ReleaseInputBuffer;
+  if (decoder.Init(&settings) != libgav1::kStatusOk) return 0;
+
+  FuzzerTemporaryFile tempfile(data, size);
+  auto file_reader =
+      libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+  if (file_reader == nullptr) return 0;
+
+  InputBuffer* input_buffer = nullptr;
+  bool dequeue_finished = false;
+
+  do {
+    if (input_buffer == nullptr && !file_reader->IsEndOfFile()) {
+      if (input_buffers.free_buffers.empty()) {
+        auto* const buffer = new (std::nothrow) InputBuffer();
+        if (buffer == nullptr) {
+          break;
+        }
+        input_buffers.free_buffers.push_back(buffer);
+      }
+      input_buffer = input_buffers.free_buffers.front();
+      input_buffers.free_buffers.pop_front();
+      if (!file_reader->ReadTemporalUnit(input_buffer, nullptr)) {
+        break;
+      }
+    }
+
+    if (input_buffer != nullptr) {
+      libgav1::StatusCode status =
+          decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+                               /*user_private_data=*/0,
+                               /*buffer_private_data=*/input_buffer);
+      if (status == libgav1::kStatusOk) {
+        input_buffer = nullptr;
+        // Continue to enqueue frames until we get a kStatusTryAgain status.
+        continue;
+      }
+      if (status != libgav1::kStatusTryAgain) {
+        break;
+      }
+    }
+
+    const libgav1::DecoderBuffer* buffer;
+    libgav1::StatusCode status = decoder.DequeueFrame(&buffer);
+    if (status == libgav1::kStatusNothingToDequeue) {
+      dequeue_finished = true;
+    } else if (status == libgav1::kStatusOk) {
+      dequeue_finished = false;
+    } else {
+      break;
+    }
+  } while (input_buffer != nullptr || !file_reader->IsEndOfFile() ||
+           !dequeue_finished);
+
+  if (input_buffer != nullptr) {
+    input_buffers.free_buffers.push_back(input_buffer);
+  }
+
+  return 0;
+}
diff --git a/tests/fuzzer/fuzzer_temp_file.h b/tests/fuzzer/fuzzer_temp_file.h
new file mode 100644 (file)
index 0000000..ed8f51c
--- /dev/null
@@ -0,0 +1,189 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
+#define LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
+
+// Adapter utility from fuzzer input to a temporary file, for fuzzing APIs that
+// require a file instead of an input buffer.
+
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32
+#include <io.h>
+#include <windows.h>
+
+#define strdup _strdup
+#define unlink _unlink
+#else
+#include <unistd.h>
+#endif  // _WIN32
+
+// Pure-C interface for creating and cleaning up temporary files.
+
+static char* fuzzer_get_tmpfile_with_suffix(const uint8_t* data, size_t size,
+                                            const char* suffix) {
+#ifdef _WIN32
+  // GetTempPathA generates '<path>\<pre><uuuu>.TMP'.
+  (void)suffix;  // NOLINT (this could be a C compilation unit)
+  char temp_path[MAX_PATH];
+  const DWORD ret = GetTempPathA(MAX_PATH, temp_path);
+  if (ret == 0 || ret > MAX_PATH) {
+    fprintf(stderr, "Error getting temporary directory name: %lu\n",
+            GetLastError());
+    abort();
+  }
+  char* filename_buffer =
+      (char*)malloc(MAX_PATH);  // NOLINT (this could be a C compilation unit)
+  if (!filename_buffer) {
+    perror("Failed to allocate file name buffer.");
+    abort();
+  }
+  if (GetTempFileNameA(temp_path, "ftf", /*uUnique=*/0, filename_buffer) == 0) {
+    fprintf(stderr, "Error getting temporary file name: %lu\n", GetLastError());
+    abort();
+  }
+#if defined(_MSC_VER) || defined(MINGW_HAS_SECURE_API)
+  FILE* file;
+  const errno_t err = fopen_s(&file, filename_buffer, "wb");
+  if (err != 0) file = NULL;  // NOLINT (this could be a C compilation unit)
+#else
+  FILE* file = fopen(filename_buffer, "wb");
+#endif
+  if (!file) {
+    perror("Failed to open file.");
+    abort();
+  }
+#else  // !_WIN32
+  if (suffix == NULL) {  // NOLINT (this could be a C compilation unit)
+    suffix = "";
+  }
+  const size_t suffix_len = strlen(suffix);
+  if (suffix_len > INT_MAX) {  // mkstemps takes int for suffixlen param
+    perror("Suffix too long");
+    abort();
+  }
+
+#ifdef __ANDROID__
+  const char* leading_temp_path =
+      "/data/local/tmp/generate_temporary_file.XXXXXX";
+#else
+  const char* leading_temp_path = "/tmp/generate_temporary_file.XXXXXX";
+#endif
+  const size_t buffer_sz = strlen(leading_temp_path) + suffix_len + 1;
+  char* filename_buffer =
+      (char*)malloc(buffer_sz);  // NOLINT (this could be a C compilation unit)
+  if (!filename_buffer) {
+    perror("Failed to allocate file name buffer.");
+    abort();
+  }
+
+  if (snprintf(filename_buffer, buffer_sz, "%s%s", leading_temp_path, suffix) >=
+      (int)buffer_sz) {  // NOLINT (this could be a C compilation unit)
+    perror("File name buffer too short.");
+    abort();
+  }
+
+  const int file_descriptor = mkstemps(filename_buffer, suffix_len);
+  if (file_descriptor < 0) {
+    perror("Failed to make temporary file.");
+    abort();
+  }
+  FILE* file = fdopen(file_descriptor, "wb");
+  if (!file) {
+    perror("Failed to open file descriptor.");
+    close(file_descriptor);
+    abort();
+  }
+#endif  // _WIN32
+  const size_t bytes_written = fwrite(data, sizeof(uint8_t), size, file);
+  if (bytes_written < size) {
+    fclose(file);
+    fprintf(stderr, "Failed to write all bytes to file (%zu out of %zu)",
+            bytes_written, size);
+    abort();
+  }
+  fclose(file);
+  return filename_buffer;
+}
+
+static char* fuzzer_get_tmpfile(
+    const uint8_t* data,
+    size_t size) {  // NOLINT (people may include this file directly)
+  return fuzzer_get_tmpfile_with_suffix(data, size, NULL);  // NOLINT
+}
+
+static void fuzzer_release_tmpfile(char* filename) {
+  if (unlink(filename) != 0) {
+    perror("WARNING: Failed to delete temporary file.");
+  }
+  free(filename);
+}
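+
+// A minimal usage sketch of the C interface; APIToBeFuzzed() is a
+// placeholder for whichever file-based API is under test:
+//
+//   int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+//     char* filename = fuzzer_get_tmpfile(data, size);
+//     APIToBeFuzzed(filename);
+//     fuzzer_release_tmpfile(filename);
+//     return 0;
+//   }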
+
+// C++ RAII object for creating temporary files.
+
+#ifdef __cplusplus
+class FuzzerTemporaryFile {
+ public:
+  FuzzerTemporaryFile(const uint8_t* data, size_t size)
+      : original_filename_(fuzzer_get_tmpfile(data, size)) {
+    filename_ = strdup(original_filename_);
+    if (!filename_) {
+      perror("Failed to allocate file name copy.");
+      abort();
+    }
+  }
+
+  FuzzerTemporaryFile(const uint8_t* data, size_t size, const char* suffix)
+      : original_filename_(fuzzer_get_tmpfile_with_suffix(data, size, suffix)) {
+    filename_ = strdup(original_filename_);
+    if (!filename_) {
+      perror("Failed to allocate file name copy.");
+      abort();
+    }
+  }
+
+  ~FuzzerTemporaryFile() {
+    free(filename_);
+    fuzzer_release_tmpfile(original_filename_);
+  }
+
+  FuzzerTemporaryFile(const FuzzerTemporaryFile& other) = delete;
+  FuzzerTemporaryFile& operator=(const FuzzerTemporaryFile& other) = delete;
+
+  FuzzerTemporaryFile(FuzzerTemporaryFile&& other) = delete;
+  FuzzerTemporaryFile& operator=(FuzzerTemporaryFile&& other) = delete;
+
+  const char* filename() const { return filename_; }
+
+  // Returns a mutable pointer to the file name. Should be used sparingly,
+  // only when the fuzzed API demands it or when making a mutable copy is
+  // inconvenient (e.g., in auto-generated code).
+  char* mutable_filename() const { return filename_; }
+
+ private:
+  char* original_filename_;
+
+  // A mutable copy of the original filename, returned by the accessor. This
+  // guarantees that the original filename can always be used to release the
+  // temporary path.
+  char* filename_;
+};
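+
+// A minimal usage sketch; APIToBeFuzzed() is again a placeholder. The
+// temporary file is removed automatically when |tempfile| goes out of scope:
+//
+//   int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+//     FuzzerTemporaryFile tempfile(data, size);
+//     APIToBeFuzzed(tempfile.filename());
+//     return 0;
+//   }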
+#endif  // __cplusplus
+#endif  // LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
diff --git a/tests/fuzzer/obu_parser_fuzzer.cc b/tests/fuzzer/obu_parser_fuzzer.cc
new file mode 100644 (file)
index 0000000..f71ca17
--- /dev/null
@@ -0,0 +1,94 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/decoder_state.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/obu_parser.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr int kMaxFrames = 100;
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+// Restrict the number of frames and OBUs to improve fuzzer throughput.
+constexpr int kMaxFrames = 5;
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+inline void ParseObu(const uint8_t* const data, size_t size) {
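+  // Exercise the AV1CodecConfigurationBox builder first; only crashes are of
+  // interest here, so the returned box is discarded.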
+  size_t av1c_size;
+  const std::unique_ptr<uint8_t[]> av1c_box =
+      libgav1::ObuParser::GetAV1CodecConfigurationBox(data, size, &av1c_size);
+  static_cast<void>(av1c_box);
+
+  libgav1::InternalFrameBufferList buffer_list;
+  libgav1::BufferPool buffer_pool(libgav1::OnInternalFrameBufferSizeChanged,
+                                  libgav1::GetInternalFrameBuffer,
+                                  libgav1::ReleaseInternalFrameBuffer,
+                                  &buffer_list);
+  libgav1::DecoderState decoder_state;
+  libgav1::ObuParser parser(data, size, 0, &buffer_pool, &decoder_state);
+  libgav1::RefCountedBufferPtr current_frame;
+  int parsed_frames = 0;
+  while (parser.HasData()) {
+    if (parser.ParseOneFrame(&current_frame) != libgav1::kStatusOk) break;
+    if (++parsed_frames >= kMaxFrames) break;
+  }
+}
+
+}  // namespace
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  // Reject large chunks of data to improve fuzzer throughput.
+  if (size > kMaxDataSize) return 0;
+
+  // Treat the input as a raw OBU stream.
+  ParseObu(data, size);
+
+  // If the input is an IVF file, skip the IVF file and frame headers so the
+  // parser sees the first frame's payload as a raw OBU stream.
+  static constexpr size_t kIvfHeaderSize =
+      libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize;
+  if (size >= kIvfHeaderSize) {
+    ParseObu(data + kIvfHeaderSize, size - kIvfHeaderSize);
+  }
+
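+  // Finally, route the input through a temporary file so the FileReader path
+  // (IVF detection and temporal-unit framing) is exercised as well.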
+  FuzzerTemporaryFile tempfile(data, size);
+  auto file_reader =
+      libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+  if (file_reader == nullptr) return 0;
+
+  std::vector<uint8_t> buffer;
+  int parsed_frames = 0;
+  do {
+    if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break;
+    ParseObu(buffer.data(), buffer.size());
+    if (++parsed_frames >= kMaxFrames) break;
+  } while (!file_reader->IsEndOfFile());
+
+  return 0;
+}
diff --git a/tests/libgav1_tests.cmake b/tests/libgav1_tests.cmake
new file mode 100644 (file)
index 0000000..95f6361
--- /dev/null
@@ -0,0 +1,1366 @@
+# Copyright 2020 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_LIBGAV1_TESTS_CMAKE_)
+  return()
+endif() # LIBGAV1_LIBGAV1_TESTS_CMAKE_
+set(LIBGAV1_LIBGAV1_TESTS_CMAKE_ 1)
+
+set(libgav1_googletest "${libgav1_root}/third_party/googletest")
+if(NOT LIBGAV1_ENABLE_TESTS OR NOT EXISTS "${libgav1_googletest}")
+  macro(libgav1_add_tests_targets)
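+    # No-op stub so that callers can invoke libgav1_add_tests_targets()
+    # unconditionally even when tests are disabled.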
+
+  endmacro()
+
+  if(LIBGAV1_ENABLE_TESTS AND NOT EXISTS "${libgav1_googletest}")
+    message(
+      "GoogleTest not found, setting LIBGAV1_ENABLE_TESTS to false.\n"
+      "To enable tests download the GoogleTest repository to"
+      " third_party/googletest:\n\n  git \\\n    -C ${libgav1_root} \\\n"
+      "    clone -b release-1.12.1 --depth 1 \\\n"
+      "    https://github.com/google/googletest.git third_party/googletest\n")
+    set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." FORCE)
+  endif()
+  return()
+endif()
+
+# Check GoogleTest compiler requirements.
+if((CMAKE_CXX_COMPILER_ID
+    MATCHES
+    "Clang|GNU"
+    AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5")
+   OR (MSVC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "19"))
+  macro(libgav1_add_tests_targets)
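+    # No-op stub: this compiler cannot build GoogleTest, so unit tests are
+    # disabled.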
+
+  endmacro()
+
+  message(
+    WARNING
+      "${CMAKE_CXX_COMPILER} (${CMAKE_CXX_COMPILER_ID} version"
+      " ${CMAKE_CXX_COMPILER_VERSION}) is below the minimum requirements for"
+      " GoogleTest; disabling unit tests. See"
+      " https://github.com/google/googletest#compilers for more detail.")
+  set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." FORCE)
+  return()
+endif()
+
+list(APPEND libgav1_tests_block_utils_sources
+            "${libgav1_root}/tests/block_utils.h"
+            "${libgav1_root}/tests/block_utils.cc")
+
+list(APPEND libgav1_tests_utils_sources
+            "${libgav1_root}/tests/third_party/libvpx/acm_random.h"
+            "${libgav1_root}/tests/third_party/libvpx/md5_helper.h"
+            "${libgav1_root}/tests/third_party/libvpx/md5_utils.cc"
+            "${libgav1_root}/tests/third_party/libvpx/md5_utils.h"
+            "${libgav1_root}/tests/utils.h" "${libgav1_root}/tests/utils.cc")
+
+list(APPEND libgav1_tests_utils_test_sources
+            "${libgav1_root}/tests/utils_test.cc")
+
+list(APPEND libgav1_array_2d_test_sources
+            "${libgav1_source}/utils/array_2d_test.cc")
+list(APPEND libgav1_average_blend_test_sources
+            "${libgav1_source}/dsp/average_blend_test.cc")
+list(APPEND libgav1_block_parameters_holder_test_sources
+            "${libgav1_source}/utils/block_parameters_holder_test.cc")
+list(APPEND libgav1_blocking_counter_test_sources
+            "${libgav1_source}/utils/blocking_counter_test.cc")
+list(APPEND libgav1_buffer_pool_test_sources
+            "${libgav1_source}/buffer_pool_test.cc")
+list(APPEND libgav1_cdef_test_sources "${libgav1_source}/dsp/cdef_test.cc")
+list(
+  APPEND libgav1_common_test_sources "${libgav1_source}/utils/common_test.cc")
+list(APPEND libgav1_common_avx2_test_sources
+            "${libgav1_source}/dsp/x86/common_avx2.h"
+            "${libgav1_source}/dsp/x86/common_avx2.inc"
+            "${libgav1_source}/dsp/x86/common_avx2_test.cc"
+            "${libgav1_source}/dsp/x86/common_avx2_test.h"
+            "${libgav1_source}/dsp/x86/common_sse4.inc")
+list(APPEND libgav1_common_dsp_test_sources
+            "${libgav1_source}/dsp/common_dsp_test.cc")
+list(APPEND libgav1_common_neon_test_sources
+            "${libgav1_source}/dsp/arm/common_neon_test.cc")
+list(APPEND libgav1_common_sse4_test_sources
+            "${libgav1_source}/dsp/x86/common_sse4.h"
+            "${libgav1_source}/dsp/x86/common_sse4.inc"
+            "${libgav1_source}/dsp/x86/common_sse4_test.cc"
+            "${libgav1_source}/dsp/x86/common_sse4_test.h")
+list(APPEND libgav1_convolve_test_sources
+            "${libgav1_source}/dsp/convolve_test.cc")
+list(APPEND libgav1_cpu_test_sources "${libgav1_source}/utils/cpu_test.cc")
+list(APPEND libgav1_c_decoder_test_sources
+            "${libgav1_source}/c_decoder_test.c"
+            "${libgav1_source}/decoder_test_data.h")
+list(APPEND libgav1_c_version_test_sources "${libgav1_source}/c_version_test.c")
+list(APPEND libgav1_decoder_test_sources
+            "${libgav1_source}/decoder_test.cc"
+            "${libgav1_source}/decoder_test_data.h")
+list(APPEND libgav1_decoder_buffer_test_sources
+            "${libgav1_source}/decoder_buffer_test.cc")
+list(APPEND libgav1_distance_weighted_blend_test_sources
+            "${libgav1_source}/dsp/distance_weighted_blend_test.cc")
+list(APPEND libgav1_dsp_test_sources "${libgav1_source}/dsp/dsp_test.cc")
+list(APPEND libgav1_entropy_decoder_test_sources
+            "${libgav1_source}/utils/entropy_decoder_test.cc"
+            "${libgav1_source}/utils/entropy_decoder_test_data.inc")
+list(APPEND libgav1_file_reader_test_sources
+            "${libgav1_examples}/file_reader_test.cc"
+            "${libgav1_examples}/file_reader_test_common.cc"
+            "${libgav1_examples}/file_reader_test_common.h")
+list(APPEND libgav1_film_grain_test_sources
+            "${libgav1_source}/film_grain_test.cc")
+list(APPEND libgav1_file_reader_factory_test_sources
+            "${libgav1_examples}/file_reader_factory_test.cc")
+list(APPEND libgav1_file_writer_test_sources
+            "${libgav1_examples}/file_writer_test.cc")
+list(APPEND libgav1_internal_frame_buffer_list_test_sources
+            "${libgav1_source}/internal_frame_buffer_list_test.cc")
+list(APPEND libgav1_intra_edge_test_sources
+            "${libgav1_source}/dsp/intra_edge_test.cc")
+list(APPEND libgav1_intrapred_cfl_test_sources
+            "${libgav1_source}/dsp/intrapred_cfl_test.cc")
+list(APPEND libgav1_intrapred_directional_test_sources
+            "${libgav1_source}/dsp/intrapred_directional_test.cc")
+list(APPEND libgav1_intrapred_filter_test_sources
+            "${libgav1_source}/dsp/intrapred_filter_test.cc")
+list(APPEND libgav1_intrapred_test_sources
+            "${libgav1_source}/dsp/intrapred_test.cc")
+list(APPEND libgav1_inverse_transform_test_sources
+            "${libgav1_source}/dsp/inverse_transform_test.cc")
+list(APPEND libgav1_loop_filter_test_sources
+            "${libgav1_source}/dsp/loop_filter_test.cc")
+list(APPEND libgav1_loop_restoration_test_sources
+            "${libgav1_source}/dsp/loop_restoration_test.cc")
+list(APPEND libgav1_mask_blend_test_sources
+            "${libgav1_source}/dsp/mask_blend_test.cc")
+list(APPEND libgav1_motion_field_projection_test_sources
+            "${libgav1_source}/dsp/motion_field_projection_test.cc")
+list(APPEND libgav1_motion_vector_search_test_sources
+            "${libgav1_source}/dsp/motion_vector_search_test.cc")
+list(APPEND libgav1_super_res_test_sources
+            "${libgav1_source}/dsp/super_res_test.cc")
+list(APPEND libgav1_weight_mask_test_sources
+            "${libgav1_source}/dsp/weight_mask_test.cc")
+list(
+  APPEND libgav1_memory_test_sources "${libgav1_source}/utils/memory_test.cc")
+list(APPEND libgav1_obmc_test_sources "${libgav1_source}/dsp/obmc_test.cc")
+list(APPEND libgav1_obu_parser_test_sources
+            "${libgav1_source}/obu_parser_test.cc")
+list(APPEND libgav1_post_filter_test_sources
+            "${libgav1_source}/post_filter_test.cc")
+list(APPEND libgav1_prediction_mask_test_sources
+            "${libgav1_source}/prediction_mask_test.cc")
+list(
+  APPEND libgav1_quantizer_test_sources "${libgav1_source}/quantizer_test.cc")
+list(APPEND libgav1_queue_test_sources "${libgav1_source}/utils/queue_test.cc")
+list(APPEND libgav1_raw_bit_reader_test_sources
+            "${libgav1_source}/utils/raw_bit_reader_test.cc")
+list(APPEND libgav1_reconstruction_test_sources
+            "${libgav1_source}/reconstruction_test.cc")
+list(APPEND libgav1_residual_buffer_pool_test_sources
+            "${libgav1_source}/residual_buffer_pool_test.cc")
+list(APPEND libgav1_scan_test_sources "${libgav1_source}/scan_test.cc")
+list(APPEND libgav1_segmentation_map_test_sources
+            "${libgav1_source}/utils/segmentation_map_test.cc")
+list(APPEND libgav1_segmentation_test_sources
+            "${libgav1_source}/utils/segmentation_test.cc")
+list(APPEND libgav1_stack_test_sources "${libgav1_source}/utils/stack_test.cc")
+list(APPEND libgav1_symbol_decoder_context_test_sources
+            "${libgav1_source}/symbol_decoder_context_test.cc")
+list(APPEND libgav1_threadpool_test_sources
+            "${libgav1_source}/utils/threadpool_test.cc")
+list(APPEND libgav1_threading_strategy_test_sources
+            "${libgav1_source}/threading_strategy_test.cc")
+list(APPEND libgav1_unbounded_queue_test_sources
+            "${libgav1_source}/utils/unbounded_queue_test.cc")
+list(
+  APPEND libgav1_vector_test_sources "${libgav1_source}/utils/vector_test.cc")
+list(APPEND libgav1_version_test_sources "${libgav1_source}/version_test.cc")
+list(APPEND libgav1_warp_test_sources "${libgav1_source}/dsp/warp_test.cc")
+list(APPEND libgav1_warp_prediction_test_sources
+            "${libgav1_source}/warp_prediction_test.cc")
+
+macro(libgav1_add_tests_targets)
+  if(NOT LIBGAV1_ENABLE_TESTS)
+    message(
+      FATAL_ERROR
+        "This version of libgav1_add_tests_targets() should only be used with"
+        " LIBGAV1_ENABLE_TESTS set to true.")
+  endif()
+  libgav1_add_library(TEST
+                      NAME
+                      libgav1_gtest
+                      TYPE
+                      STATIC
+                      SOURCES
+                      "${libgav1_googletest}/googletest/src/gtest-all.cc"
+                      DEFINES
+                      ${libgav1_defines}
+                      INCLUDES
+                      ${libgav1_gtest_include_paths}
+                      ${libgav1_include_paths})
+
+  libgav1_add_library(TEST
+                      NAME
+                      libgav1_gtest_main
+                      TYPE
+                      STATIC
+                      SOURCES
+                      "${libgav1_googletest}/googletest/src/gtest_main.cc"
+                      DEFINES
+                      ${libgav1_defines}
+                      INCLUDES
+                      ${libgav1_gtest_include_paths}
+                      ${libgav1_include_paths})
+
+  if(use_absl_threading)
+    list(APPEND libgav1_common_test_absl_deps absl::synchronization)
+  endif()
+
+  libgav1_add_executable(TEST
+                         NAME
+                         array_2d_test
+                         SOURCES
+                         ${libgav1_array_2d_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         block_parameters_holder_test
+                         SOURCES
+                         ${libgav1_block_parameters_holder_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         blocking_counter_test
+                         SOURCES
+                         ${libgav1_blocking_counter_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  if(libgav1_have_avx2)
+    list(APPEND libgav1_common_dsp_test_sources
+                ${libgav1_common_avx2_test_sources})
+  endif()
+  if(libgav1_have_sse4)
+    list(APPEND libgav1_common_dsp_test_sources
+                ${libgav1_common_sse4_test_sources})
+  endif()
+  if(libgav1_have_avx2 OR libgav1_have_sse4)
+    libgav1_add_executable(TEST
+                           NAME
+                           common_dsp_test
+                           SOURCES
+                           ${libgav1_common_dsp_test_sources}
+                           DEFINES
+                           ${libgav1_defines}
+                           INCLUDES
+                           ${libgav1_test_include_paths}
+                           OBJLIB_DEPS
+                           libgav1_utils
+                           LIB_DEPS
+                           ${libgav1_common_test_absl_deps}
+                           libgav1_gtest_main
+                           libgav1_gtest)
+  endif()
+
+  if(libgav1_have_neon)
+    libgav1_add_executable(TEST
+                           NAME
+                           common_neon_test
+                           SOURCES
+                           ${libgav1_common_neon_test_sources}
+                           DEFINES
+                           ${libgav1_defines}
+                           INCLUDES
+                           ${libgav1_test_include_paths}
+                           OBJLIB_DEPS
+                           libgav1_tests_block_utils
+                           libgav1_utils
+                           LIB_DEPS
+                           ${libgav1_common_test_absl_deps}
+                           libgav1_gtest
+                           libgav1_gtest_main)
+  endif()
+
+  libgav1_add_executable(TEST
+                         NAME
+                         common_test
+                         SOURCES
+                         ${libgav1_common_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         cpu_test
+                         SOURCES
+                         ${libgav1_cpu_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         entropy_decoder_test
+                         SOURCES
+                         ${libgav1_entropy_decoder_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         file_reader_test
+                         SOURCES
+                         ${libgav1_file_reader_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_dsp
+                         libgav1_file_reader
+                         libgav1_utils
+                         libgav1_tests_utils
+                         LIB_DEPS
+                         absl::strings
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         file_reader_factory_test
+                         SOURCES
+                         ${libgav1_file_reader_factory_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_file_reader
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::memory
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         film_grain_test
+                         SOURCES
+                         ${libgav1_film_grain_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         memory_test
+                         SOURCES
+                         ${libgav1_memory_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         LIB_DEPS
+                         absl::base
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         queue_test
+                         SOURCES
+                         ${libgav1_queue_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         segmentation_map_test
+                         SOURCES
+                         ${libgav1_segmentation_map_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         segmentation_test
+                         SOURCES
+                         ${libgav1_segmentation_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         stack_test
+                         SOURCES
+                         ${libgav1_stack_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         symbol_decoder_context_test
+                         SOURCES
+                         ${libgav1_symbol_decoder_context_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         threadpool_test
+                         SOURCES
+                         ${libgav1_threadpool_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::synchronization
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         unbounded_queue_test
+                         SOURCES
+                         ${libgav1_unbounded_queue_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         tests_utils_test
+                         SOURCES
+                         ${libgav1_tests_utils_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_dsp
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::strings
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         vector_test
+                         SOURCES
+                         ${libgav1_vector_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         version_test
+                         SOURCES
+                         ${libgav1_version_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         LIB_DEPS
+                         ${libgav1_dependency}
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_library(TEST
+                      NAME
+                      libgav1_tests_block_utils
+                      TYPE
+                      OBJECT
+                      SOURCES
+                      ${libgav1_tests_block_utils_sources}
+                      DEFINES
+                      ${libgav1_defines}
+                      INCLUDES
+                      ${libgav1_test_include_paths})
+
+  libgav1_add_library(TEST
+                      NAME
+                      libgav1_tests_utils
+                      TYPE
+                      OBJECT
+                      SOURCES
+                      ${libgav1_tests_utils_sources}
+                      DEFINES
+                      ${libgav1_defines}
+                      INCLUDES
+                      ${libgav1_test_include_paths})
+
+  libgav1_add_executable(TEST
+                         NAME
+                         average_blend_test
+                         SOURCES
+                         ${libgav1_average_blend_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::strings
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         buffer_pool_test
+                         SOURCES
+                         ${libgav1_buffer_pool_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         cdef_test
+                         SOURCES
+                         ${libgav1_cdef_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::strings
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         convolve_test
+                         SOURCES
+                         ${libgav1_convolve_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         c_decoder_test
+                         SOURCES
+                         ${libgav1_c_decoder_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_include_paths}
+                         LIB_DEPS
+                         ${libgav1_dependency})
+
+  libgav1_add_executable(TEST
+                         NAME
+                         c_version_test
+                         SOURCES
+                         ${libgav1_c_version_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_include_paths}
+                         LIB_DEPS
+                         ${libgav1_dependency})
+
+  libgav1_add_executable(TEST
+                         NAME
+                         decoder_test
+                         SOURCES
+                         ${libgav1_decoder_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         LIB_DEPS
+                         ${libgav1_dependency}
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         decoder_buffer_test
+                         SOURCES
+                         ${libgav1_decoder_buffer_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         LIB_DEPS
+                         ${libgav1_dependency}
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         distance_weighted_blend_test
+                         SOURCES
+                         ${libgav1_distance_weighted_blend_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::strings
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         dsp_test
+                         SOURCES
+                         ${libgav1_dsp_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::strings
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         file_writer_test
+                         SOURCES
+                         ${libgav1_file_writer_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_file_writer
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::memory
+                         absl::strings
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         intrapred_cfl_test
+                         SOURCES
+                         ${libgav1_intrapred_cfl_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         intrapred_directional_test
+                         SOURCES
+                         ${libgav1_intrapred_directional_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         intrapred_filter_test
+                         SOURCES
+                         ${libgav1_intrapred_filter_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         intrapred_test
+                         SOURCES
+                         ${libgav1_intrapred_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         intra_edge_test
+                         SOURCES
+                         ${libgav1_intra_edge_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_tests_utils
+                         libgav1_dsp
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::strings
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         inverse_transform_test
+                         SOURCES
+                         ${libgav1_inverse_transform_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_dsp
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::strings
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         internal_frame_buffer_list_test
+                         SOURCES
+                         ${libgav1_internal_frame_buffer_list_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         loop_filter_test
+                         SOURCES
+                         ${libgav1_loop_filter_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         loop_restoration_test
+                         SOURCES
+                         ${libgav1_loop_restoration_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         mask_blend_test
+                         SOURCES
+                         ${libgav1_mask_blend_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::strings
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         motion_field_projection_test
+                         SOURCES
+                         ${libgav1_motion_field_projection_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         motion_vector_search_test
+                         SOURCES
+                         ${libgav1_motion_vector_search_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         obmc_test
+                         SOURCES
+                         ${libgav1_obmc_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         obu_parser_test
+                         SOURCES
+                         ${libgav1_obu_parser_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         post_filter_test
+                         SOURCES
+                         ${libgav1_post_filter_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         prediction_mask_test
+                         SOURCES
+                         ${libgav1_prediction_mask_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::strings
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         quantizer_test
+                         SOURCES
+                         ${libgav1_quantizer_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         raw_bit_reader_test
+                         SOURCES
+                         ${libgav1_raw_bit_reader_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         reconstruction_test
+                         SOURCES
+                         ${libgav1_reconstruction_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         ${libgav1_test_objlib_deps}
+                         LIB_DEPS
+                         absl::strings
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         residual_buffer_pool_test
+                         SOURCES
+                         ${libgav1_residual_buffer_pool_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_utils
+                         ${libgav1_test_objlib_deps}
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         scan_test
+                         SOURCES
+                         ${libgav1_scan_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_utils
+                         ${libgav1_test_objlib_deps}
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         super_res_test
+                         SOURCES
+                         ${libgav1_super_res_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         threading_strategy_test
+                         SOURCES
+                         ${libgav1_threading_strategy_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_utils
+                         ${libgav1_test_objlib_deps}
+                         LIB_DEPS
+                         absl::str_format_internal
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         warp_test
+                         SOURCES
+                         ${libgav1_warp_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_block_utils
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         warp_prediction_test
+                         SOURCES
+                         ${libgav1_warp_prediction_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_utils
+                         LIB_DEPS
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+
+  libgav1_add_executable(TEST
+                         NAME
+                         weight_mask_test
+                         SOURCES
+                         ${libgav1_weight_mask_test_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_test_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_decoder
+                         libgav1_dsp
+                         libgav1_tests_utils
+                         libgav1_utils
+                         LIB_DEPS
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_common_test_absl_deps}
+                         libgav1_gtest
+                         libgav1_gtest_main)
+endmacro()
diff --git a/tests/third_party/libvpx/LICENSE b/tests/third_party/libvpx/LICENSE
new file mode 100644
index 0000000..83ef339
--- /dev/null
+++ b/tests/third_party/libvpx/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) 2010, The WebM Project authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google, nor the WebM Project, nor the names
+    of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written
+    permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/tests/third_party/libvpx/acm_random.h b/tests/third_party/libvpx/acm_random.h
new file mode 100644
index 0000000..e8cfc9c
--- /dev/null
+++ b/tests/third_party/libvpx/acm_random.h
@@ -0,0 +1,91 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_
+#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_
+
+#include <cassert>
+#include <cstdint>
+#include <limits>
+
+#include "gtest/gtest.h"
+
+namespace libvpx_test {
+
+class ACMRandom {
+ public:
+  ACMRandom() : random_(DeterministicSeed()) {}
+
+  explicit ACMRandom(int seed) : random_(seed) {}
+
+  void Reset(int seed) { random_.Reseed(seed); }
+  uint16_t Rand16(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    return (value >> 15) & 0xffff;
+  }
+
+  int32_t Rand20Signed(void) {
+    // Use 20 bits: values between 524287 and -524288.
+    const uint32_t value = random_.Generate(1048576);
+    return static_cast<int32_t>(value) - 524288;
+  }
+
+  int16_t Rand16Signed(void) {
+    // Use 16 bits: values between 32767 and -32768.
+    return static_cast<int16_t>(random_.Generate(65536));
+  }
+
+  int16_t Rand13Signed(void) {
+    // Use 13 bits: values between 4095 and -4096.
+    const uint32_t value = random_.Generate(8192);
+    return static_cast<int16_t>(value) - 4096;
+  }
+
+  int16_t Rand9Signed(void) {
+    // Use 9 bits: values between 255 (0x0FF) and -256 (-0x100).
+    const uint32_t value = random_.Generate(512);
+    return static_cast<int16_t>(value) - 256;
+  }
+
+  uint8_t Rand8(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    // There's a bit more entropy in the upper bits of this implementation.
+    return (value >> 23) & 0xff;
+  }
+
+  uint8_t Rand8Extremes(void) {
+    // Returns a random value near 0 or near 255, to better exercise
+    // saturation behavior.
+    const uint8_t r = Rand8();
+    return static_cast<uint8_t>((r < 128) ? r << 4 : r >> 4);
+  }
+
+  uint32_t RandRange(const uint32_t range) {
+    // testing::internal::Random::Generate provides values in [0, range);
+    // range must not exceed testing::internal::Random::kMaxRange.
+    assert(range <= testing::internal::Random::kMaxRange);
+    return random_.Generate(range);
+  }
+
+  int PseudoUniform(int range) { return random_.Generate(range); }
+
+  int operator()(int n) { return PseudoUniform(n); }
+
+  static constexpr int DeterministicSeed(void) { return 0xbaba; }
+
+ private:
+  testing::internal::Random random_;
+};
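+
+// An illustrative usage sketch (an editorial addition, not part of the
+// upstream header): seed deterministically so failures are reproducible,
+// then draw values of the desired width.
+//   libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+//   const uint8_t pixel = rnd.Rand8();
+//   const int16_t coeff = rnd.Rand16Signed();
+//   const uint32_t index = rnd.RandRange(16);  // in [0, 16)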
+
+}  // namespace libvpx_test
+
+#endif  // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_
diff --git a/tests/third_party/libvpx/md5_helper.h b/tests/third_party/libvpx/md5_helper.h
new file mode 100644
index 0000000..c97b590
--- /dev/null
+++ b/tests/third_party/libvpx/md5_helper.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_
+#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "tests/third_party/libvpx/md5_utils.h"
+
+namespace libvpx_test {
+class MD5 {
+ public:
+  MD5() { MD5Init(&md5_); }
+
+  void Add(const uint8_t *data, size_t size) {
+    MD5Update(&md5_, data, static_cast<uint32_t>(size));
+  }
+
+  const char *Get(void) {
+    static const char hex[16] = {
+        '0', '1', '2', '3', '4', '5', '6', '7',
+        '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+    };
+    uint8_t tmp[16];
+    MD5Context ctx_tmp = md5_;
+
+    MD5Final(tmp, &ctx_tmp);
+    for (int i = 0; i < 16; i++) {
+      res_[i * 2 + 0] = hex[tmp[i] >> 4];
+      res_[i * 2 + 1] = hex[tmp[i] & 0xf];
+    }
+    res_[32] = 0;
+
+    return res_;
+  }
+
+ protected:
+  char res_[33];
+  MD5Context md5_;
+};
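+
+// An illustrative usage sketch (an editorial addition): Add() may be called
+// any number of times; Get() returns a NUL-terminated, 32-character lowercase
+// hex digest and may be called mid-stream because it hashes a copy of the
+// context.
+//   libvpx_test::MD5 md5;
+//   md5.Add(data, size);
+//   printf("%s\n", md5.Get());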
+
+}  // namespace libvpx_test
+
+#endif  // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_
diff --git a/tests/third_party/libvpx/md5_utils.cc b/tests/third_party/libvpx/md5_utils.cc
new file mode 100644
index 0000000..4638e54
--- /dev/null
+++ b/tests/third_party/libvpx/md5_utils.cc
@@ -0,0 +1,249 @@
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest.  This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h' header
+ * definitions
+ *  - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#include "tests/third_party/libvpx/md5_utils.h"
+
+#include <cstring>
+
+static void byteSwap(UWORD32 *buf, unsigned words) {
+  md5byte *p;
+
+  /* Only swap bytes for big endian machines */
+  int i = 1;
+
+  if (*(char *)&i == 1) return;
+
+  p = (md5byte *)buf;
+
+  do {
+    *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 |
+             ((unsigned)p[1] << 8 | p[0]);
+    p += 4;
+  } while (--words);
+}
+
+/*
+ * Start MD5 accumulation.  Set bit count to 0 and buffer to mysterious
+ * initialization constants.
+ */
+void MD5Init(struct MD5Context *ctx) {
+  ctx->buf[0] = 0x67452301;
+  ctx->buf[1] = 0xefcdab89;
+  ctx->buf[2] = 0x98badcfe;
+  ctx->buf[3] = 0x10325476;
+
+  ctx->bytes[0] = 0;
+  ctx->bytes[1] = 0;
+}
+
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ */
+void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) {
+  UWORD32 t;
+
+  /* Update byte count */
+
+  t = ctx->bytes[0];
+
+  if ((ctx->bytes[0] = t + len) < t)
+    ctx->bytes[1]++; /* Carry from low to high */
+
+  t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */
+
+  if (t > len) {
+    memcpy((md5byte *)ctx->in + 64 - t, buf, len);
+    return;
+  }
+
+  /* First chunk is an odd size */
+  memcpy((md5byte *)ctx->in + 64 - t, buf, t);
+  byteSwap(ctx->in, 16);
+  MD5Transform(ctx->buf, ctx->in);
+  buf += t;
+  len -= t;
+
+  /* Process data in 64-byte chunks */
+  while (len >= 64) {
+    memcpy(ctx->in, buf, 64);
+    byteSwap(ctx->in, 16);
+    MD5Transform(ctx->buf, ctx->in);
+    buf += 64;
+    len -= 64;
+  }
+
+  /* Handle any remaining bytes of data. */
+  memcpy(ctx->in, buf, len);
+}
+
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, MSB-first)
+ */
+void MD5Final(md5byte digest[16], struct MD5Context *ctx) {
+  int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */
+  md5byte *p = (md5byte *)ctx->in + count;
+
+  /* Set the first char of padding to 0x80.  There is always room. */
+  *p++ = 0x80;
+
+  /* Bytes of padding needed to make 56 bytes (-8..55) */
+  count = 56 - 1 - count;
+
+  if (count < 0) { /* Padding forces an extra block */
+    memset(p, 0, count + 8);
+    byteSwap(ctx->in, 16);
+    MD5Transform(ctx->buf, ctx->in);
+    p = (md5byte *)ctx->in;
+    count = 56;
+  }
+
+  memset(p, 0, count);
+  byteSwap(ctx->in, 14);
+
+  /* Append length in bits and transform */
+  ctx->in[14] = ctx->bytes[0] << 3;
+  ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29;
+  MD5Transform(ctx->buf, ctx->in);
+
+  byteSwap(ctx->buf, 4);
+  memcpy(digest, ctx->buf, 16);
+  memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
+}
+
+#ifndef ASM_MD5
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+#define MD5STEP(f, w, x, y, z, in, s) \
+  (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
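+
+/* For example (an editorial illustration), the first round-1 step below,
+ * MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7), expands to:
+ *   a += F1(b, c, d) + in[0] + 0xd76aa478;
+ *   a = (a << 7 | a >> 25) + b;
+ */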
+
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#endif
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data.  MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ */
+VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
+                                                 UWORD32 const in[16]) {
+  UWORD32 a, b, c, d;
+
+  a = buf[0];
+  b = buf[1];
+  c = buf[2];
+  d = buf[3];
+
+  MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+  MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+  MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+  MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+  MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+  MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+  MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+  MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+  MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+  MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+  MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+  MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+  MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+  MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+  MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+  MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+  MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+  MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+  MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+  MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+  MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+  MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+  MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+  MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+  MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+  MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+  MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+  MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+  MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+  MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+  MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+  MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+  MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+  MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+  MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+  MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+  MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+  MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+  MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+  MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+  MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+  MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+  MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+  MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+  MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+  MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+  MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+  MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+  MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+  MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+  MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+  MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+  MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+  MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+  MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+  MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+  MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+  MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+  MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+  MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+  MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+  MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+  MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+  MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+  buf[0] += a;
+  buf[1] += b;
+  buf[2] += c;
+  buf[3] += d;
+}
+
+#undef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+
+#endif
diff --git a/tests/third_party/libvpx/md5_utils.h b/tests/third_party/libvpx/md5_utils.h
new file mode 100644
index 0000000..13be035
--- /dev/null
+++ b/tests/third_party/libvpx/md5_utils.h
@@ -0,0 +1,41 @@
+/*
+ * This is the header file for the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest.  This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h'
+ * header definitions
+ *  - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
+#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
+
+#define md5byte unsigned char
+#define UWORD32 unsigned int
+
+typedef struct MD5Context MD5Context;
+struct MD5Context {
+  UWORD32 buf[4];
+  UWORD32 bytes[2];
+  UWORD32 in[16];
+};
+
+void MD5Init(struct MD5Context *context);
+void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len);
+void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]);
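+
+// The Init/Update/Final protocol described above, as a minimal illustrative
+// sketch (an editorial addition, not part of the original header):
+//   struct MD5Context ctx;
+//   MD5Init(&ctx);
+//   MD5Update(&ctx, buf, len);  // repeat for each buffer of input
+//   unsigned char digest[16];
+//   MD5Final(digest, &ctx);  // fills |digest| and clears |ctx|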
+
+#endif  // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
diff --git a/tests/utils.cc b/tests/utils.cc
new file mode 100644
index 0000000..e91ea87
--- /dev/null
+++ b/tests/utils.cc
@@ -0,0 +1,197 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/utils.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/utils/constants.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+
+namespace libgav1 {
+namespace test_utils {
+namespace {
+
+int CloseFile(FILE* stream) { return fclose(stream); }
+
+bool ReadFileToString(absl::string_view file_name, std::string* const string) {
+  using FilePtr = std::unique_ptr<FILE, decltype(&CloseFile)>;
+  FilePtr file(fopen(std::string(file_name).c_str(), "rb"), &CloseFile);
+  if (file == nullptr) return false;
+
+  do {
+    int c = fgetc(file.get());
+    if (ferror(file.get()) != 0) return false;
+
+    if (c != EOF) {
+      string->append(1, static_cast<char>(c));
+    } else {
+      break;
+    }
+  } while (true);
+
+  return true;
+}
+
+}  // namespace
+
+void ResetDspTable(const int bitdepth) {
+  dsp::Dsp* const dsp = dsp_internal::GetWritableDspTable(bitdepth);
+  ASSERT_NE(dsp, nullptr);
+  memset(dsp, 0, sizeof(dsp::Dsp));
+}
+
+std::string GetMd5Sum(const void* bytes, size_t size) {
+  libvpx_test::MD5 md5;
+  md5.Add(static_cast<const uint8_t*>(bytes), size);
+  return md5.Get();
+}
+
+template <typename Pixel>
+std::string GetMd5Sum(const Pixel* block, int width, int height, int stride) {
+  libvpx_test::MD5 md5;
+  const Pixel* row = block;
+  for (int i = 0; i < height; ++i) {
+    md5.Add(reinterpret_cast<const uint8_t*>(row), width * sizeof(Pixel));
+    row += stride;
+  }
+  return md5.Get();
+}
+
+template std::string GetMd5Sum(const int8_t* block, int width, int height,
+                               int stride);
+template std::string GetMd5Sum(const int16_t* block, int width, int height,
+                               int stride);
+
+std::string GetMd5Sum(const DecoderBuffer& buffer) {
+  libvpx_test::MD5 md5;
+  const size_t pixel_size =
+      (buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+  for (int plane = kPlaneY; plane < buffer.NumPlanes(); ++plane) {
+    const int height = buffer.displayed_height[plane];
+    const size_t width = buffer.displayed_width[plane] * pixel_size;
+    const int stride = buffer.stride[plane];
+    const uint8_t* plane_buffer = buffer.plane[plane];
+    for (int row = 0; row < height; ++row) {
+      md5.Add(plane_buffer, width);
+      plane_buffer += stride;
+    }
+  }
+  return md5.Get();
+}
+
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const void* data, size_t size,
+                    absl::Duration elapsed_time) {
+  const std::string digest = test_utils::GetMd5Sum(data, size);
+  printf("Mode %s[%31s]: %5d us     MD5: %s\n", name, function_name,
+         static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+         digest.c_str());
+  EXPECT_STREQ(expected_digest, digest.c_str());
+}
+
+template <typename Pixel>
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const Pixel* block, int width,
+                    int height, int stride, absl::Duration elapsed_time) {
+  const std::string digest =
+      test_utils::GetMd5Sum(block, width, height, stride);
+  printf("Mode %s[%31s]: %5d us     MD5: %s\n", name, function_name,
+         static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+         digest.c_str());
+  EXPECT_STREQ(expected_digest, digest.c_str());
+}
+
+template void CheckMd5Digest(const char name[], const char function_name[],
+                             const char expected_digest[], const int8_t* block,
+                             int width, int height, int stride,
+                             absl::Duration elapsed_time);
+template void CheckMd5Digest(const char name[], const char function_name[],
+                             const char expected_digest[], const int16_t* block,
+                             int width, int height, int stride,
+                             absl::Duration elapsed_time);
+
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const char actual_digest[],
+                    absl::Duration elapsed_time) {
+  printf("Mode %s[%31s]: %5d us     MD5: %s\n", name, function_name,
+         static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+         actual_digest);
+  EXPECT_STREQ(expected_digest, actual_digest);
+}
+
+namespace {
+
+std::string GetSourceDir() {
+#if defined(__ANDROID__)
+  // Test files must be manually supplied. This path is frequently
+  // available on development devices.
+  return std::string("/data/local/tmp/tests/data");
+#elif defined(LIBGAV1_FLAGS_SRCDIR)
+  return std::string(LIBGAV1_FLAGS_SRCDIR) + "/tests/data";
+#else
+  return std::string(".");
+#endif  // defined(__ANDROID__)
+}
+
+std::string GetTempDir() {
+  const char* path = getenv("TMPDIR");
+  if (path == nullptr || path[0] == '\0') path = getenv("TEMP");
+  if (path != nullptr && path[0] != '\0') return std::string(path);
+
+#if defined(__ANDROID__)
+  return std::string("/data/local/tmp");
+#elif defined(LIBGAV1_FLAGS_TMPDIR)
+  return std::string(LIBGAV1_FLAGS_TMPDIR);
+#else
+  return std::string(".");
+#endif  // defined(__ANDROID__)
+}
+
+}  // namespace
+
+std::string GetTestInputFilePath(absl::string_view file_name) {
+  const char* const path = getenv("LIBGAV1_TEST_DATA_PATH");
+  if (path != nullptr && path[0] != '\0') {
+    return std::string(path) + "/" + std::string(file_name);
+  }
+  return GetSourceDir() + "/" + std::string(file_name);
+}
+
+std::string GetTestOutputFilePath(absl::string_view file_name) {
+  return GetTempDir() + "/" + std::string(file_name);
+}
+
+void GetTestData(absl::string_view file_name, const bool is_output_file,
+                 std::string* const output) {
+  ASSERT_NE(output, nullptr);
+  const std::string absolute_file_path = is_output_file
+                                             ? GetTestOutputFilePath(file_name)
+                                             : GetTestInputFilePath(file_name);
+
+  ASSERT_TRUE(ReadFileToString(absolute_file_path, output));
+}
+
+}  // namespace test_utils
+}  // namespace libgav1
diff --git a/tests/utils.h b/tests/utils.h
new file mode 100644
index 0000000..3394d64
--- /dev/null
+++ b/tests/utils.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_UTILS_H_
+#define LIBGAV1_TESTS_UTILS_H_
+
+#include <cstddef>
+#include <new>
+#include <string>
+
+#include "absl/base/config.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/time.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace test_utils {
+
+enum { kAlternateDeterministicSeed = 0x9571 };
+static_assert(kAlternateDeterministicSeed !=
+                  libvpx_test::ACMRandom::DeterministicSeed(),
+              "");
+
+// Similar to libgav1::MaxAlignedAllocable, but retains the throwing versions
+// of new to support googletest allocations.
+// Note when building the source as C++17 or greater, gcc 11.2.0 may issue a
+// warning of the form:
+//   warning: 'void operator delete [](void*, std::align_val_t)' called on
+//     pointer returned from a mismatched allocation function
+//   note: returned from 'static void*
+//     libgav1::test_utils::MaxAlignedAllocable::operator new [](size_t)'
+// This is a false positive as this function calls
+// libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow) which in
+// turn calls
+// void* operator new[](std::size_t, std::align_val_t, const std::nothrow_t&).
+// This is due to unbalanced inlining of the functions, so we force them to be
+// inlined.
+// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103993
+struct MaxAlignedAllocable {
+  // Class-specific allocation functions.
+  static LIBGAV1_ALWAYS_INLINE void* operator new(size_t size) {
+    void* const p =
+        libgav1::MaxAlignedAllocable::operator new(size, std::nothrow);
+#ifdef ABSL_HAVE_EXCEPTIONS
+    if (p == nullptr) throw std::bad_alloc();
+#endif
+    return p;
+  }
+  static LIBGAV1_ALWAYS_INLINE void* operator new[](size_t size) {
+    void* const p =
+        libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow);
+#ifdef ABSL_HAVE_EXCEPTIONS
+    if (p == nullptr) throw std::bad_alloc();
+#endif
+    return p;
+  }
+
+  // Class-specific non-throwing allocation functions
+  static LIBGAV1_ALWAYS_INLINE void* operator new(
+      size_t size, const std::nothrow_t& tag) noexcept {
+    return libgav1::MaxAlignedAllocable::operator new(size, tag);
+  }
+  static LIBGAV1_ALWAYS_INLINE void* operator new[](
+      size_t size, const std::nothrow_t& tag) noexcept {
+    return libgav1::MaxAlignedAllocable::operator new[](size, tag);
+  }
+
+  // Class-specific deallocation functions.
+  static LIBGAV1_ALWAYS_INLINE void operator delete(void* ptr) noexcept {
+    libgav1::MaxAlignedAllocable::operator delete(ptr);
+  }
+  static LIBGAV1_ALWAYS_INLINE void operator delete[](void* ptr) noexcept {
+    libgav1::MaxAlignedAllocable::operator delete[](ptr);
+  }
+
+  // Only called if new (std::nothrow) is used and the constructor throws an
+  // exception.
+  static LIBGAV1_ALWAYS_INLINE void operator delete(
+      void* ptr, const std::nothrow_t& tag) noexcept {
+    libgav1::MaxAlignedAllocable::operator delete(ptr, tag);
+  }
+  // Only called if new[] (std::nothrow) is used and the constructor throws an
+  // exception.
+  static LIBGAV1_ALWAYS_INLINE void operator delete[](
+      void* ptr, const std::nothrow_t& tag) noexcept {
+    libgav1::MaxAlignedAllocable::operator delete[](ptr, tag);
+  }
+};
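+
+// An illustrative use (an editorial addition; utils_test.cc exercises these
+// paths exhaustively): derive from MaxAlignedAllocable and both allocation
+// forms return kMaxAlignment-aligned storage.
+//   struct Foo : public test_utils::MaxAlignedAllocable { uint8_t x; };
+//   Foo* a = new Foo;                 // throws std::bad_alloc on failure
+//                                     // (when exceptions are enabled)
+//   Foo* b = new (std::nothrow) Foo;  // returns nullptr on failure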
+
+// Clears dsp table entries for |bitdepth|. This function is not thread safe.
+void ResetDspTable(int bitdepth);
+
+//------------------------------------------------------------------------------
+// Gets a human-readable, hexadecimal-encoded MD5 sum of the given data,
+// block, or frame buffer.
+
+std::string GetMd5Sum(const void* bytes, size_t size);
+template <typename Pixel>
+std::string GetMd5Sum(const Pixel* block, int width, int height, int stride);
+std::string GetMd5Sum(const DecoderBuffer& buffer);
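+
+// For example (an illustrative sketch): hashing an 8x8 block of 16-bit
+// residuals stored with a stride of 16 digests only the 8 pixels of each row:
+//   const std::string sum = GetMd5Sum(block, /*width=*/8, /*height=*/8,
+//                                     /*stride=*/16);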
+
+//------------------------------------------------------------------------------
+// Compares the md5 digest of |size| bytes of |data| with |expected_digest|.
+// Prints a log message with |name|, |function_name|, md5 digest and
+// |elapsed_time|. |name| and |function_name| are merely tags used for logging
+// and can be any meaningful string depending on the caller's context.
+
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const void* data, size_t size,
+                    absl::Duration elapsed_time);
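+
+// For example (an illustrative sketch; the tag strings, buffer, and expected
+// digest are hypothetical):
+//   const absl::Time start = absl::Now();
+//   // ... run the code under test over |dest| ...
+//   CheckMd5Digest("Cdef", "CdefFilter", kExpectedDigest, dest, sizeof(dest),
+//                  absl::Now() - start);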
+
+//------------------------------------------------------------------------------
+// Compares the md5 digest of |block| with |expected_digest|. The width, height,
+// and stride of |block| are |width|, |height|, and |stride|, respectively.
+// Prints a log message with |name|, |function_name|, md5 digest and
+// |elapsed_time|. |name| and |function_name| are merely tags used for logging
+// and can be any meaningful string depending on the caller's context.
+
+template <typename Pixel>
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const Pixel* block, int width,
+                    int height, int stride, absl::Duration elapsed_time);
+
+//------------------------------------------------------------------------------
+// Compares |actual_digest| with |expected_digest|. Prints a log message with
+// |name|, |function_name|, md5 digest and |elapsed_time|. |name| and
+// |function_name| are merely tags used for logging and can be any meaningful
+// string depending on the caller's context.
+
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const char actual_digest[],
+                    absl::Duration elapsed_time);
+
+//------------------------------------------------------------------------------
+// Reads the test data from |file_name| as a string into |output|. The
+// |is_output_file| argument controls the expansion of |file_name| to its full
+// path. When |is_output_file| is true GetTestData() reads from
+// utils.cc::GetTempDir(), and when it is false the file is read from
+// utils.cc::GetSourceDir().
+void GetTestData(absl::string_view file_name, bool is_output_file,
+                 std::string* output);
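+
+// For example (an illustrative sketch; the file name is hypothetical), from
+// within a googletest test body:
+//   std::string bitstream;
+//   GetTestData("frame.ivf", /*is_output_file=*/false, &bitstream);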
+
+//------------------------------------------------------------------------------
+// Returns the full path to |file_name| from libgav1/tests/data.
+std::string GetTestInputFilePath(absl::string_view file_name);
+
+//------------------------------------------------------------------------------
+// Returns the full path to |file_name| in a location where the file can be
+// opened for writing.
+std::string GetTestOutputFilePath(absl::string_view file_name);
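+
+// For example (illustrative): with TMPDIR=/tmp in the environment,
+// GetTestOutputFilePath("out.y4m") returns "/tmp/out.y4m"; see GetTempDir()
+// in utils.cc for the fallback order.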
+
+}  // namespace test_utils
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TESTS_UTILS_H_
diff --git a/tests/utils_test.cc b/tests/utils_test.cc
new file mode 100644
index 0000000..1d5b598
--- /dev/null
+++ b/tests/utils_test.cc
@@ -0,0 +1,190 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/utils.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "absl/base/config.h"
+#include "gtest/gtest.h"
+#include "src/utils/memory.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace test_utils {
+namespace {
+
+constexpr size_t kMaxAllocableSize = 0x40000000;
+
+// Has a trivial default constructor that performs no action.
+struct SmallMaxAligned : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x;
+};
+
+// Has a nontrivial default constructor that initializes the data member.
+struct SmallMaxAlignedNontrivialConstructor : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x = 0;
+};
+
+// Has a trivial default constructor that performs no action.
+struct HugeMaxAligned : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1];
+};
+
+// Has a nontrivial default constructor that initializes the data member.
+struct HugeMaxAlignedNontrivialConstructor : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1] = {};
+};
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+struct MaxAlignedThrowingConstructor : public MaxAlignedAllocable {
+  MaxAlignedThrowingConstructor() { throw std::exception(); }
+
+  uint8_t x;
+};
+#endif
+
+TEST(TestUtilsTest, TestMaxAlignedAllocable) {
+  {
+    // MaxAlignedAllocable::operator new (std::nothrow) is called.
+    std::unique_ptr<SmallMaxAligned> small(new (std::nothrow) SmallMaxAligned);
+    EXPECT_NE(small, nullptr);
+    // Note this check doesn't guarantee conformance as a suitably aligned
+    // address may be returned from any allocator.
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1),
+              0);
+    // MaxAlignedAllocable::operator delete is called.
+  }
+
+  {
+    // MaxAlignedAllocable::operator new is called.
+    std::unique_ptr<SmallMaxAligned> small(new SmallMaxAligned);
+    EXPECT_NE(small, nullptr);
+    // Note this check doesn't guarantee conformance as a suitably aligned
+    // address may be returned from any allocator.
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1),
+              0);
+    // MaxAlignedAllocable::operator delete is called.
+  }
+
+  {
+    // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+    std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls(
+        new (std::nothrow) SmallMaxAligned[10]);
+    EXPECT_NE(small_array_of_smalls, nullptr);
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) &
+                  (kMaxAlignment - 1),
+              0);
+    // MaxAlignedAllocable::operator delete[] is called.
+  }
+
+  {
+    // MaxAlignedAllocable::operator new[] is called.
+    std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls(
+        new SmallMaxAligned[10]);
+    EXPECT_NE(small_array_of_smalls, nullptr);
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) &
+                  (kMaxAlignment - 1),
+              0);
+    // MaxAlignedAllocable::operator delete[] is called.
+  }
+
+  {
+    // MaxAlignedAllocable::operator new (std::nothrow) is called.
+    std::unique_ptr<HugeMaxAligned> huge(new (std::nothrow) HugeMaxAligned);
+    EXPECT_EQ(huge, nullptr);
+  }
+
+  {
+    // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+    std::unique_ptr<SmallMaxAligned[]> huge_array_of_smalls(
+        new (std::nothrow)
+            SmallMaxAligned[kMaxAllocableSize / sizeof(SmallMaxAligned) + 1]);
+    EXPECT_EQ(huge_array_of_smalls, nullptr);
+  }
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+  try {
+    // MaxAlignedAllocable::operator new (std::nothrow) is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete (std::nothrow) is called.
+    auto* always = new (std::nothrow) MaxAlignedThrowingConstructor;
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  try {
+    // MaxAlignedAllocable::operator new is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete is called.
+    auto* always = new MaxAlignedThrowingConstructor;
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  try {
+    // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete[] (std::nothrow) is called.
+    auto* always = new (std::nothrow) MaxAlignedThrowingConstructor[2];
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  try {
+    // MaxAlignedAllocable::operator new[] is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete[] is called.
+    auto* always = new MaxAlignedThrowingConstructor[2];
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  // Note these calls are only safe with exceptions enabled: if the throwing
+  // operator new returns, the object is expected to be valid. With exceptions
+  // disabled, a failed allocation would return nullptr and the constructor
+  // would then be invoked on it, which is undefined behavior.
+  try {
+    // MaxAlignedAllocable::operator new is called.
+    std::unique_ptr<HugeMaxAlignedNontrivialConstructor> huge(
+        new HugeMaxAlignedNontrivialConstructor);
+    ADD_FAILURE() << "huge allocation should fail.";
+  } catch (...) {
+    SUCCEED();
+  }
+
+  try {
+    // MaxAlignedAllocable::operator new[] is called.
+    std::unique_ptr<SmallMaxAlignedNontrivialConstructor[]>
+        huge_array_of_smalls(
+            new SmallMaxAlignedNontrivialConstructor
+                [kMaxAllocableSize /
+                     sizeof(SmallMaxAlignedNontrivialConstructor) +
+                 1]);
+    ADD_FAILURE() << "huge_array_of_smalls allocation should fail.";
+  } catch (...) {
+    SUCCEED();
+  }
+#endif  // ABSL_HAVE_EXCEPTIONS
+}
+
+}  // namespace
+}  // namespace test_utils
+}  // namespace libgav1